{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "root_path = r\".\\tick\\rb\"\n", "output_path = r\".\\data\\rb.csv\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "files = []\n", "\n", "for r, ds, fs in os.walk(root_path):\n", " for f in fs:\n", " # if f[0:4] == '2023':\n", " abs_filepath = os.path.join(r, f)\n", " files.append(abs_filepath)\n", "files = sorted(files)\n", "\n", "df = pd.DataFrame()\n", "for f in files:\n", " df_temp = pd.read_csv(\n", " f,\n", " usecols=[0, 1, 4, 11, 20, 21, 22, 23, 24, 25],\n", " names=[\n", " \"交易日\",\n", " \"合约代码\",\n", " \"最新价\",\n", " \"数量\",\n", " \"最后修改时间\",\n", " \"最后修改毫秒\",\n", " \"申买价一\",\n", " \"申买量一\",\n", " \"申卖价一\",\n", " \"申卖量一\",\n", " ],\n", " skiprows=1,\n", " encoding=\"gbk\",\n", " )\n", " # df_temp = pd.read_csv(f, usecols=[0,5], names=[\n", " # 'datetime', 'volume'])\n", " df = pd.concat([df, df_temp])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
交易日合约代码最新价数量最后修改时间最后修改毫秒申买价一申买量一申卖价一申卖量一
4132320231229rb24054003.0120190514:59:5904002.02474003.0116
4132420231229rb24054003.0120202814:59:595004002.02244003.016
4132520231229rb24054002.0120206015:00:0004003.0234004.07
4132620231229rb24054002.0120206015:00:005004003.0234004.07
4132720231229rb24054002.0120206015:17:295004003.0234004.07
\n", "
" ], "text/plain": [ " 交易日 合约代码 最新价 数量 最后修改时间 最后修改毫秒 申买价一 申买量一 \\\n", "41323 20231229 rb2405 4003.0 1201905 14:59:59 0 4002.0 247 \n", "41324 20231229 rb2405 4003.0 1202028 14:59:59 500 4002.0 224 \n", "41325 20231229 rb2405 4002.0 1202060 15:00:00 0 4003.0 23 \n", "41326 20231229 rb2405 4002.0 1202060 15:00:00 500 4003.0 23 \n", "41327 20231229 rb2405 4002.0 1202060 15:17:29 500 4003.0 23 \n", "\n", " 申卖价一 申卖量一 \n", "41323 4003.0 116 \n", "41324 4003.0 16 \n", "41325 4004.0 7 \n", "41326 4004.0 7 \n", "41327 4004.0 7 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.tail()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
交易日合约代码最新价数量最后修改时间最后修改毫秒申买价一申买量一申卖价一申卖量一
020220104rb22054302.0464308:59:005004302.01154305.096
120220104rb22054305.0575009:00:005004305.03594310.036
220220104rb22054306.0803909:00:0104306.0184308.07
320220104rb22054308.0906509:00:015004308.0434310.074
420220104rb22054310.0968209:00:0204311.044314.019
\n", "
" ], "text/plain": [ " 交易日 合约代码 最新价 数量 最后修改时间 最后修改毫秒 申买价一 申买量一 申卖价一 \\\n", "0 20220104 rb2205 4302.0 4643 08:59:00 500 4302.0 115 4305.0 \n", "1 20220104 rb2205 4305.0 5750 09:00:00 500 4305.0 359 4310.0 \n", "2 20220104 rb2205 4306.0 8039 09:00:01 0 4306.0 18 4308.0 \n", "3 20220104 rb2205 4308.0 9065 09:00:01 500 4308.0 43 4310.0 \n", "4 20220104 rb2205 4310.0 9682 09:00:02 0 4311.0 4 4314.0 \n", "\n", " 申卖量一 \n", "0 96 \n", "1 36 \n", "2 7 \n", "3 74 \n", "4 19 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "df.reset_index(drop=True, inplace=True)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 19813536 entries, 0 to 19813535\n", "Data columns (total 10 columns):\n", " # Column Dtype \n", "--- ------ ----- \n", " 0 交易日 int64 \n", " 1 合约代码 object \n", " 2 最新价 float64\n", " 3 数量 int64 \n", " 4 最后修改时间 object \n", " 5 最后修改毫秒 int64 \n", " 6 申买价一 float64\n", " 7 申买量一 int64 \n", " 8 申卖价一 float64\n", " 9 申卖量一 int64 \n", "dtypes: float64(3), int64(5), object(2)\n", "memory usage: 1.5+ GB\n" ] } ], "source": [ "df.info()\n", "# 21754840" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "drop_index1 = df.query('最后修改时间>\"15:00:00\" & 最后修改时间<\"21:00:00\"')[\n", " \"最后修改时间\"\n", "].index\n", "# drop_index1 = df.query('最后修改时间>\"15:00:00\"')[\"最后修改时间\"].index\n", "# drop_index2 = df.query('最后修改时间>\"01:00:00\" & 最后修改时间<\"09:00:00\"')[\"最后修改时间\"].index\n", "# drop_index2 = df.query('最后修改时间>\"01:00:00\" & 最后修改时间<\"09:00:00\"')[\"最后修改时间\"].index\n", "drop_index2 = df.query('最后修改时间<\"09:00:00\"')[\"最后修改时间\"].index\n", "drop_index3 = df.query('最后修改时间>\"23:00:00\" & 最后修改时间<\"23:59:59\"')[\n", " \"最后修改时间\"\n", "].index\n", "drop_index4 = df.query('最后修改时间>\"11:30:00\" & 最后修改时间<\"13:30:00\"')[\n", " \"最后修改时间\"\n", "].index" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "df.drop(labels=drop_index1, axis=0, inplace=True)\n", "df.drop(drop_index2, axis=0, inplace=True)\n", "df.drop(drop_index3, axis=0, inplace=True)\n", "df.drop(drop_index4, axis=0, inplace=True)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
交易日合约代码最新价数量最后修改时间最后修改毫秒申买价一申买量一申卖价一申卖量一
1981353020231229rb24054003.0120183614:59:585004002.02884003.0140
1981353120231229rb24054003.0120190514:59:5904002.02474003.0116
1981353220231229rb24054003.0120202814:59:595004002.02244003.016
1981353320231229rb24054002.0120206015:00:0004003.0234004.07
1981353420231229rb24054002.0120206015:00:005004003.0234004.07
\n", "
" ], "text/plain": [ " 交易日 合约代码 最新价 数量 最后修改时间 最后修改毫秒 申买价一 申买量一 \\\n", "19813530 20231229 rb2405 4003.0 1201836 14:59:58 500 4002.0 288 \n", "19813531 20231229 rb2405 4003.0 1201905 14:59:59 0 4002.0 247 \n", "19813532 20231229 rb2405 4003.0 1202028 14:59:59 500 4002.0 224 \n", "19813533 20231229 rb2405 4002.0 1202060 15:00:00 0 4003.0 23 \n", "19813534 20231229 rb2405 4002.0 1202060 15:00:00 500 4003.0 23 \n", "\n", " 申卖价一 申卖量一 \n", "19813530 4003.0 140 \n", "19813531 4003.0 116 \n", "19813532 4003.0 16 \n", "19813533 4004.0 7 \n", "19813534 4004.0 7 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.tail()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 19812430 entries, 1 to 19813534\n", "Data columns (total 10 columns):\n", " # Column Dtype \n", "--- ------ ----- \n", " 0 交易日 int64 \n", " 1 合约代码 object \n", " 2 最新价 float64\n", " 3 数量 int64 \n", " 4 最后修改时间 object \n", " 5 最后修改毫秒 int64 \n", " 6 申买价一 float64\n", " 7 申买量一 int64 \n", " 8 申卖价一 float64\n", " 9 申卖量一 int64 \n", "dtypes: float64(3), int64(5), object(2)\n", "memory usage: 1.6+ GB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "df.reset_index(drop=True, inplace=True)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "df[\"datetime\"] = pd.to_datetime(\n", " pd.to_datetime(df[\"交易日\"].astype(str)).astype(str)\n", " + \" \"\n", " + df[\"最后修改时间\"].astype(str)\n", " + \".\"\n", " + df[\"最后修改毫秒\"].astype(str)\n", ")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
交易日合约代码最新价数量最后修改时间最后修改毫秒申买价一申买量一申卖价一申卖量一datetime
1981242520231229rb24054003.0120183614:59:585004002.02884003.01402023-12-29 14:59:58.500
1981242620231229rb24054003.0120190514:59:5904002.02474003.01162023-12-29 14:59:59.000
1981242720231229rb24054003.0120202814:59:595004002.02244003.0162023-12-29 14:59:59.500
1981242820231229rb24054002.0120206015:00:0004003.0234004.072023-12-29 15:00:00.000
1981242920231229rb24054002.0120206015:00:005004003.0234004.072023-12-29 15:00:00.500
\n", "
" ], "text/plain": [ " 交易日 合约代码 最新价 数量 最后修改时间 最后修改毫秒 申买价一 申买量一 \\\n", "19812425 20231229 rb2405 4003.0 1201836 14:59:58 500 4002.0 288 \n", "19812426 20231229 rb2405 4003.0 1201905 14:59:59 0 4002.0 247 \n", "19812427 20231229 rb2405 4003.0 1202028 14:59:59 500 4002.0 224 \n", "19812428 20231229 rb2405 4002.0 1202060 15:00:00 0 4003.0 23 \n", "19812429 20231229 rb2405 4002.0 1202060 15:00:00 500 4003.0 23 \n", "\n", " 申卖价一 申卖量一 datetime \n", "19812425 4003.0 140 2023-12-29 14:59:58.500 \n", "19812426 4003.0 116 2023-12-29 14:59:59.000 \n", "19812427 4003.0 16 2023-12-29 14:59:59.500 \n", "19812428 4004.0 7 2023-12-29 15:00:00.000 \n", "19812429 4004.0 7 2023-12-29 15:00:00.500 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.tail()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "df.rename(\n", " columns={\n", " \"最新价\": \"lastprice\",\n", " \"数量\": \"volume\",\n", " \"申买价一\": \"bid_p\",\n", " \"申买量一\": \"bid_v\",\n", " \"申卖价一\": \"ask_p\",\n", " \"申卖量一\": \"ask_v\",\n", " \"合约代码\": \"symbol\",\n", " },\n", " inplace=True,\n", ")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "df[\"vol_diff\"] = df[\"volume\"].diff()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
交易日symbollastpricevolume最后修改时间最后修改毫秒bid_pbid_vask_pask_vdatetimevol_diff
020220104rb22054305.0575009:00:005004305.03594310.0362022-01-04 09:00:00.500NaN
120220104rb22054306.0803909:00:0104306.0184308.072022-01-04 09:00:01.0002289.0
220220104rb22054308.0906509:00:015004308.0434310.0742022-01-04 09:00:01.5001026.0
320220104rb22054310.0968209:00:0204311.044314.0192022-01-04 09:00:02.000617.0
420220104rb22054314.01032809:00:025004314.01374316.0192022-01-04 09:00:02.500646.0
\n", "
" ], "text/plain": [ " 交易日 symbol lastprice volume 最后修改时间 最后修改毫秒 bid_p bid_v \\\n", "0 20220104 rb2205 4305.0 5750 09:00:00 500 4305.0 359 \n", "1 20220104 rb2205 4306.0 8039 09:00:01 0 4306.0 18 \n", "2 20220104 rb2205 4308.0 9065 09:00:01 500 4308.0 43 \n", "3 20220104 rb2205 4310.0 9682 09:00:02 0 4311.0 4 \n", "4 20220104 rb2205 4314.0 10328 09:00:02 500 4314.0 137 \n", "\n", " ask_p ask_v datetime vol_diff \n", "0 4310.0 36 2022-01-04 09:00:00.500 NaN \n", "1 4308.0 7 2022-01-04 09:00:01.000 2289.0 \n", "2 4310.0 74 2022-01-04 09:00:01.500 1026.0 \n", "3 4314.0 19 2022-01-04 09:00:02.000 617.0 \n", "4 4316.0 19 2022-01-04 09:00:02.500 646.0 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "df.loc[df[\"vol_diff\"].isnull(), \"vol_diff\"] = df.loc[df[\"vol_diff\"].isnull(), \"volume\"]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "df[\"volume\"] = df[\"vol_diff\"]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "df.to_csv(output_path)" ] } ], "metadata": { "kernelspec": { "display_name": "orderflow", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.17" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }