{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "root_path = r\"C:/Users/zhouj/Desktop/data\"\n", "output_path = r\"C:/Users/zhouj/Desktop/a88.csv\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 等差复权\n", "adjust = df['close'].shift() - df['open']\n", "adjust = np.where(df['symbol'] != df['symbol'].shift(), adjust, 0)\n", "df['open_adj'] = df['open'] + adjust.cumsum()\n", "df['close_adj'] = df['close'] + adjust.cumsum()\n", "df['low_adj'] = df['low'] + adjust.cumsum()\n", "df['high_adj'] = df['high'] + adjust.cumsum()\n", "# 等比复权\n", "adjust = df['close'].shift() / df['open']\n", "adjust = np.where(df['symbol'] != df['symbol'].shift(), adjust, 1)\n", "df['open_adj'] = df['open'] * adjust.cumprod()\n", "df['close_adj'] = df['close'] * adjust.cumprod()\n", "df['low_adj'] = df['low'] * adjust.cumprod()\n", "df['high_adj'] = df['high'] * adjust.cumprod()\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "files = []\n", "\n", "for r, ds, fs in os.walk(root_path):\n", " for f in fs:\n", " # if f[0:4] == '2023':\n", " abs_filepath = os.path.join(r, f)\n", " files.append(abs_filepath)\n", "files = sorted(files)\n", "\n", "df = pd.DataFrame()\n", "for f in files:\n", " df_temp = pd.read_csv(\n", " f,\n", " usecols=[1, 2, 3, 4, 8, 13, 14, 15, 16],\n", " names=[\n", " \"统一代码\",\n", " \"合约代码\",\n", " \"时间\",\n", " \"最新\",\n", " \"成交量\",\n", " \"买一价\",\n", " \"卖一价\",\n", " \"买一量\",\n", " \"卖一量\",\n", " ],\n", " skiprows=1,\n", " encoding=\"utf-8\",\n", " )\n", " # df_temp = pd.read_csv(f, usecols=[0,5], names=[\n", " # 'datetime', 'volume'])\n", " df = pd.concat([df, df_temp])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# files = []\n", "\n", "# for r, ds, fs in os.walk(root_path):\n", "# for f in fs:\n", "# # if f[0:4] == '2023':\n", "# abs_filepath = os.path.join(r, f)\n", "# files.append(abs_filepath)\n", "# files = sorted(files)\n", "\n", "# df = pd.DataFrame()\n", "# for f in files:\n", "# df_temp = pd.read_csv(\n", "# f,\n", "# usecols=[0, 1, 4, 11, 20, 21, 22, 23, 24, 25],\n", "# names=[\n", "# \"交易日\",\n", "# \"合约代码\",\n", "# \"最新价\",\n", "# \"数量\",\n", "# \"最后修改时间\",\n", "# \"最后修改毫秒\",\n", "# \"申买价一\",\n", "# \"申买量一\",\n", "# \"申卖价一\",\n", "# \"申卖量一\",\n", "# ],\n", "# skiprows=1,\n", "# encoding=\"gbk\",\n", "# )\n", "# # df_temp = pd.read_csv(f, usecols=[0,5], names=[\n", "# # 'datetime', 'volume'])\n", "# df = pd.concat([df, df_temp])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.reset_index(drop=True, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.info()\n", "# 21754840" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 统一代码 合约代码 时间 最新 成交量 买一价 卖一价 \\\n", "1305669 a888 a1905 2019-04-22 15:00:00.568 3309.0 0 3308.0 3311.0 \n", "1305670 a888 a1905 2019-04-22 15:00:36.638 3309.0 0 3308.0 3311.0 \n", "1305671 a888 a1909 2019-04-22 20:59:00.014 3412.0 224 3411.0 3412.0 \n", "1305672 a888 a1909 2019-04-22 21:00:00.461 3412.0 108 3412.0 3413.0 \n", "1305673 a888 a1909 2019-04-22 21:00:00.958 3411.0 150 3410.0 3411.0 \n", "\n", " 买一量 卖一量 \n", "1305669 25 10 \n", "1305670 25 10 \n", "1305671 2 8 \n", "1305672 10 19 \n", "1305673 43 3 \n" ] } ], "source": [ "print(df.loc[1305669:1305673])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 等比复权\n", "# df['复权因子'] = df['卖一价'].shift() / df['买一价']\n", "df['复权因子'] = np.where(df['合约代码'] != df['合约代码'].shift(), df['卖一价'].shift() / df['买一价'], 1)\n", "df['复权因子'] = df['复权因子'].fillna(1)\n", "# df['复权因子'].loc[0] = 1\n", "df['买一价_adj'] = df['买一价'] * df['复权因子'].cumprod()\n", "df['卖一价_adj'] = df['卖一价'] * df['复权因子'].cumprod()\n", "df['最新_adj'] = df['最新'] * df['复权因子'].cumprod()\n", "# df['low_adj'] = df['low'] * adjust.cumprod()\n", "# df['high_adj'] = df['high'] * adjust.cumprod()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# 等差复权\n", "df['复权因子'] = np.where(df['合约代码'] != df['合约代码'].shift(), df['卖一价'].shift() - df['买一价'], 0)\n", "df['复权因子'] = df['复权因子'].fillna(0)\n", "# df['复权因子'].loc[0] = 1\n", "df['买一价_adj'] = df['买一价'] + df['复权因子'].cumsum()\n", "df['卖一价_adj'] = df['卖一价'] + df['复权因子'].cumsum()\n", "df['最新_adj'] = df['最新'] + df['复权因子'].cumsum()\n", "# df['low_adj'] = df['low'] + df['复权因子'].cumsum()\n", "# df['high_adj'] = df['high'] + df['复权因子'].cumsum()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 统一代码 合约代码 时间 最新 成交量 买一价 卖一价 \\\n", "1305669 a888 a1905 2019-04-22 15:00:00.568 3309.0 0 3308.0 3311.0 \n", "1305670 a888 a1905 2019-04-22 15:00:36.638 3309.0 0 3308.0 3311.0 \n", "1305671 a888 a1909 2019-04-22 20:59:00.014 3412.0 224 3411.0 3412.0 \n", "1305672 a888 a1909 2019-04-22 21:00:00.461 3412.0 108 3412.0 3413.0 \n", "1305673 a888 a1909 2019-04-22 21:00:00.958 3411.0 150 3410.0 3411.0 \n", "\n", " 买一量 卖一量 复权因子 买一价_adj 卖一价_adj 最新_adj \n", "1305669 25 10 0.0 3308.0 3311.0 3309.0 \n", "1305670 25 10 0.0 3308.0 3311.0 3309.0 \n", "1305671 2 8 -100.0 3311.0 3312.0 3312.0 \n", "1305672 10 19 0.0 3312.0 3313.0 3312.0 \n", "1305673 43 3 0.0 3310.0 3311.0 3311.0 \n" ] } ], "source": [ "print(df.loc[1305669:1305673])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "df['买一价'] = df['买一价_adj']\n", "df['卖一价'] = df['卖一价_adj']\n", "df['最新'] = df['最新_adj']" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 统一代码 合约代码 时间 最新 成交量 买一价 卖一价 \\\n", "1305669 a888 a1905 2019-04-22 15:00:00.568 3309.0 0 3308.0 3311.0 \n", "1305670 a888 a1905 2019-04-22 15:00:36.638 3309.0 0 3308.0 3311.0 \n", "1305671 a888 a1909 2019-04-22 20:59:00.014 3312.0 224 3311.0 3312.0 \n", "1305672 a888 a1909 2019-04-22 21:00:00.461 3312.0 108 3312.0 3313.0 \n", "1305673 a888 a1909 2019-04-22 21:00:00.958 3311.0 150 3310.0 3311.0 \n", "\n", " 买一量 卖一量 复权因子 买一价_adj 卖一价_adj 最新_adj \n", "1305669 25 10 0.0 3308.0 3311.0 3309.0 \n", "1305670 25 10 0.0 3308.0 3311.0 3309.0 \n", "1305671 2 8 -100.0 3311.0 3312.0 3312.0 \n", "1305672 10 19 0.0 3312.0 3313.0 3312.0 \n", "1305673 43 3 0.0 3310.0 3311.0 3311.0 \n" ] } ], "source": [ "print(df.loc[1305669:1305673])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# df.drop('复权因子', axis=1)\n", "# df.drop('买一价_adj', axis=1)\n", "# df.drop('卖一价_adj', axis=1)\n", "del df['复权因子']\n", "del df['买一价_adj']\n", "del df['卖一价_adj']\n", "del df['最新_adj']" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 统一代码 合约代码 时间 最新 成交量 买一价 卖一价 \\\n", "1305670 a888 a1905 2019-04-22 15:00:36.638 3309.0 0 3308.0 3311.0 \n", "1305671 a888 a1909 2019-04-22 20:59:00.014 3312.0 224 3311.0 3312.0 \n", "1305672 a888 a1909 2019-04-22 21:00:00.461 3312.0 108 3312.0 3313.0 \n", "1305673 a888 a1909 2019-04-22 21:00:00.958 3311.0 150 3310.0 3311.0 \n", "1305674 a888 a1909 2019-04-22 21:00:01.464 3312.0 86 3311.0 3312.0 \n", "\n", " 买一量 卖一量 \n", "1305670 25 10 \n", "1305671 2 8 \n", "1305672 10 19 \n", "1305673 43 3 \n", "1305674 18 80 \n" ] } ], "source": [ "print(df.loc[1305670:1305674])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "df.to_csv(output_path, index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "drop_index1 = df.query('最后修改时间>\"15:00:00\" & 最后修改时间<\"21:00:00\"')[\n", " \"最后修改时间\"\n", "].index\n", "# drop_index1 = df.query('最后修改时间>\"15:00:00\"')[\"最后修改时间\"].index\n", "# drop_index2 = df.query('最后修改时间>\"01:00:00\" & 最后修改时间<\"09:00:00\"')[\"最后修改时间\"].index\n", "# drop_index2 = df.query('最后修改时间>\"01:00:00\" & 最后修改时间<\"09:00:00\"')[\"最后修改时间\"].index\n", "drop_index2 = df.query('最后修改时间<\"09:00:00\"')[\"最后修改时间\"].index\n", "drop_index3 = df.query('最后修改时间>\"23:00:00\" & 最后修改时间<\"23:59:59\"')[\n", " \"最后修改时间\"\n", "].index\n", "drop_index4 = df.query('最后修改时间>\"11:30:00\" & 最后修改时间<\"13:30:00\"')[\n", " \"最后修改时间\"\n", "].index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.drop(labels=drop_index1, axis=0, inplace=True)\n", "df.drop(drop_index2, axis=0, inplace=True)\n", "df.drop(drop_index3, axis=0, inplace=True)\n", "df.drop(drop_index4, axis=0, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.reset_index(drop=True, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"datetime\"] = pd.to_datetime(\n", " pd.to_datetime(df[\"交易日\"].astype(str)).astype(str)\n", " + \" \"\n", " + df[\"最后修改时间\"].astype(str)\n", " + \".\"\n", " + df[\"最后修改毫秒\"].astype(str)\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.rename(\n", " columns={\n", " \"最新价\": \"lastprice\",\n", " \"数量\": \"volume\",\n", " \"申买价一\": \"bid_p\",\n", " \"申买量一\": \"bid_v\",\n", " \"申卖价一\": \"ask_p\",\n", " \"申卖量一\": \"ask_v\",\n", " \"合约代码\": \"symbol\",\n", " },\n", " inplace=True,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"vol_diff\"] = df[\"volume\"].diff()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.loc[df[\"vol_diff\"].isnull(), \"vol_diff\"] = df.loc[df[\"vol_diff\"].isnull(), \"volume\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"volume\"] = df[\"vol_diff\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.to_csv(output_path)" ] } ], "metadata": { "kernelspec": { "display_name": "orderflow", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }