{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 使用说明:\n", " 1.需要修改chdir到当前目录\n", " 2.需要修改最后输出的文件名称\n", " 3.依据情况需要修改保留的列数\n", " 4.不同品种的交易时间不一样,要修改删除" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "import datetime as datetime" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "os.chdir('E:/data/ru')\n", "all_csv_files = [file for file in os.listdir('.') if file.endswith('.csv')]\n", "all_csv_files = sorted(all_csv_files)\n", "print(\"文件中所有CSV文件:\",all_csv_files)\n", "\n", "sp_old_chars = ['_2019','_2020','_2021']\n", "sp_old_chars = sorted(sp_old_chars)\n", "print(\"旧格式文件名关键字:\",sp_old_chars)\n", "sp_new_chars = ['_2022','_2023']\n", "sp_new_chars = sorted(sp_new_chars)\n", "print(\"新格式文件名关键字:\",sp_new_chars)\n", "\n", "# # 设置后面数据的采集对于的行数# 用 \"old_type\" 或者 \"new_type\" 区分\n", "# if all(char in ['_2019','_2020','_2021'] for char in sp_old_chars):\n", "# year_type = 'old_type'\n", "# print(\"使用旧年份格式采集!!!\")\n", "# elif all(char in ['_2022','_2023'] for char in sp_chars):\n", "# year_type = 'new_type' \n", "# print(\"使用新年份格式采集!!!\")\n", "# else:\n", "# print(\"文件夹中CSV没有相关年份的数据或者新旧年份混用!!!\")\n", "\n", "csv_old_files = [file for file in all_csv_files if any(sp_char in file for sp_char in sp_old_chars)]\n", "print(\"筛选结果后的CSV文件:\",csv_old_files)\n", "csv_new_files = [file for file in all_csv_files if any(sp_char in file for sp_char in sp_new_chars)]\n", "print(\"筛选结果后的CSV文件:\",csv_new_files)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_old = pd.DataFrame()\n", "for f in csv_old_files:\n", " df_old_temp = pd.read_csv(\n", " f,\n", " usecols=[1, 2, 3, 4, 8, 13, 14, 15, 16],\n", " names=[\n", " \"统一代码\",\n", " \"合约代码\",\n", " \"时间\",\n", " \"最新\",\n", " \"成交量\",\n", " \"买一价\",\n", " \"卖一价\",\n", " \"买一量\",\n", " \"卖一量\",\n", " ],\n", " skiprows=1,\n", " encoding=\"utf-8\",\n", " parse_dates=['时间']#注意此处增加的排序,为了后面按时间排序\n", " )\n", " # df_temp = pd.read_csv(f, usecols=[0,5], names=[\n", " # 'datetime', 'volume'])\n", " df_old = pd.concat([df_old, df_old_temp])\n", "del df_old_temp" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_old =pd.DataFrame({'main_contract':df_old['统一代码'],'symbol':df_old['合约代码'],'datetime':df_old['时间'],'lastprice':df_old['最新'],'volume':df_old['成交量'],\n", " 'bid_p':df_old['买一价'],'ask_p':df_old['卖一价'],'bid_v':df_old['买一量'],'ask_v':df_old['卖一量']})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_old['time'] = df_old['datetime'].dt.strftime('%H:%M:%S')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_old.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_old.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 不同品种交易时间不一样,需要修改\n", "# 商品期货\n", "drop_index1 = df_old.query('time>\"15:00:00\" & time<\"21:00:00\"')[\"time\"].index\n", "# drop_index1 = df_old.query('time>\"15:00:00\"')[\"time\"].index\n", "# drop_index2 = df_old.query('time>\"01:00:00\" & time<\"09:00:00\"')[\"time\"].index\n", "#drop_index2 = df_old.query('time>\"02:30:00\" & time<\"09:00:00\"')[\"time\"].index\n", "drop_index2 = df_old.query('time<\"09:00:00\"')[\"time\"].index\n", "drop_index3 = df_old.query('time>\"23:00:00\" & time<\"23:59:59\"')[\"time\"].index\n", "# drop_index3 = df_old.query('time>\"11:30:00\" & time<\"13:30:00\"')[\"time\"].index\n", "drop_index4 = df_old.query('time>\"10:15:00\" & time<\"10:30:00\"')[\"time\"].index\n", "\n", "# 清理不在交易时间段的数据\n", "df_old.drop(labels=drop_index1, axis=0, inplace=True)\n", "df_old.drop(drop_index2, axis=0, inplace=True)\n", "df_old.drop(drop_index3, axis=0, inplace=True)\n", "df_old.drop(drop_index4, axis=0, inplace=True)\n", "\n", "df_old.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_old.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "del df_old['time']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_old.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_new = pd.DataFrame()\n", "for f in csv_new_files:\n", " df_new_temp = pd.read_csv(\n", " f,\n", " usecols=[0, 1, 2, 5, 12, 21, 22, 23, 24, 25, 26, 44],\n", " names=[\n", " \"交易日\",\n", " \"统一代码\",\n", " \"合约代码\",\n", " \"最新价\",\n", " \"数量\",\n", " \"最后修改时间\",\n", " \"最后修改毫秒\",\n", " \"申买价一\",\n", " \"申买量一\",\n", " \"申卖价一\",\n", " \"申卖量一\",\n", " \"业务日期\",\n", " ],\n", " skiprows=1,\n", " encoding=\"utf-8\",\n", " parse_dates=['业务日期','最后修改时间','最后修改毫秒']#注意此处增加的排序,为了后面按时间排序\n", " )\n", "\n", " # df_temp = pd.read_csv(f, usecols=[0,5], names=[\n", " # 'datetime', 'volume'])\n", " df_new = pd.concat([df_new, df_new_temp])\n", "del df_new_temp" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 不同品种交易时间不一样,需要修改\n", "# 商品期货\n", "drop_index1 = df_new.query('最后修改时间>\"15:00:00\" & 最后修改时间<\"21:00:00\"')[\"最后修改时间\"].index\n", "# drop_index1 = df_new.query('最后修改时间>\"15:00:00\"')[\"最后修改时间\"].index\n", "# drop_index2 = df_new.query('最后修改时间>\"01:00:00\" & 最后修改时间<\"09:00:00\"')[\"最后修改时间\"].index\n", "# drop_index2 = df_new.query('最后修改时间>\"02:30:00\" & 最后修改时间<\"09:00:00\"')[\"最后修改时间\"].index\n", "drop_index2 = df_new.query('最后修改时间<\"09:00:00\"')[\"最后修改时间\"].index\n", "drop_index3 = df_new.query('最后修改时间>\"23:00:00\" & 最后修改时间<\"23:59:59\"')[\"最后修改时间\"].index\n", "# drop_index3 = df_new.query('最后修改时间>\"11:30:00\" & 最后修改时间<\"13:30:00\"')[\"最后修改时间\"].index\n", "drop_index4 = df_new.query('最后修改时间>\"10:15:00\" & 最后修改时间<\"10:30:00\"')[\"最后修改时间\"].index\n", "\n", "# 清理不在交易时间段的数据\n", "df_new.drop(labels=drop_index1, axis=0, inplace=True)\n", "df_new.drop(drop_index2, axis=0, inplace=True)\n", "df_new.drop(drop_index3, axis=0, inplace=True)\n", "df_new.drop(drop_index4, axis=0, inplace=True)\n", "\n", "df_new.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#日期修正\n", "#df_new['业务日期'] = pd.to_datetime(df_new['业务日期'])\n", "df_new['业务日期'] = df_new['业务日期'].dt.strftime('%Y-%m-%d')\n", "df_new['datetime'] = df_new['业务日期'] + ' '+df_new['最后修改时间'].dt.time.astype(str) + '.' + df_new['最后修改毫秒'].astype(str)\n", "# 将 'datetime' 列的数据类型更改为 datetime 格式,如果数据转换少8个小时,可以用timedelta处理\n", "df_new['datetime'] = pd.to_datetime(df_new['datetime'], errors='coerce', format='%Y-%m-%d %H:%M:%S.%f')\n", "# 如果需要,可以将 datetime 列格式化为字符串\n", "#df_new['formatted_date'] = df_new['datetime'].dt.strftime('%Y-%m-%d %H:%M:%S.%f')\n", "#计算瞬时成交量\n", "df_new['volume'] = df_new['数量'] - df_new['数量'].shift(1)\n", "df_new['volume'] = df_new['volume'].fillna(0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_new =pd.DataFrame({'main_contract':df_new['统一代码'],'symbol':df_new['合约代码'],'datetime':df_new['datetime'],'lastprice':df_new['最新价'],'volume':df_new['volume'],\n", " 'bid_p':df_new['申买价一'],'ask_p':df_new['申卖价一'],'bid_v':df_new['申买量一'],'ask_v':df_new['申卖量一']})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_old.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_old.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_new.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_new.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame()\n", "df = pd.concat([df_old, df_new],axis=0, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "del df_old,df_new" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['main_contract'] = df['main_contract'].astype(str)\n", "df['symbol'] = df['symbol'].astype(str)\n", "df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce', format='%Y-%m-%d %H:%M:%S.%f')\n", "df['lastprice'] = df['lastprice'].astype(float)\n", "df['volume'] = df['volume'].astype(int)\n", "df['bid_p'] = df['bid_p'].astype(float)\n", "df['ask_p'] = df['ask_p'].astype(float)\n", "df['bid_v'] = df['bid_v'].astype(int)\n", "df['ask_v'] = df['ask_v'].astype(int)\n", "#df = df_old.append(df_new, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 删除重复行\n", "df.drop_duplicates(inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 确保日期列按升序排序\n", "df.sort_values(by='datetime', inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 查看数据的头部和尾部:head()、tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.reset_index(drop=True, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 查看dataframe的基本情况\n", "df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 等比复权,先不考虑\n", "# df['复权因子'] = df['卖一价'].shift() / df['买一价']\n", "df['复权因子'] = np.where(df['合约代码'] != df['合约代码'].shift(), df['卖一价'].shift() / df['买一价'], 1)\n", "df['复权因子'] = df['复权因子'].fillna(1)\n", "# df['复权因子'].loc[0] = 1\n", "df['买一价_adj'] = df['买一价'] * df['复权因子'].cumprod()\n", "df['卖一价_adj'] = df['卖一价'] * df['复权因子'].cumprod()\n", "df['最新_adj'] = df['最新'] * df['复权因子'].cumprod()\n", "# df['low_adj'] = df['low'] * adjust.cumprod()\n", "# df['high_adj'] = df['high'] * adjust.cumprod()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 等差复权\n", "df['复权因子'] = np.where(df['symbol'] != df['symbol'].shift(), df['ask_p'].shift() - df['bid_p'], 0)\n", "df['复权因子'] = df['复权因子'].fillna(0)\n", "# df['复权因子'].loc[0] = 1\n", "df['bid_p_adj'] = df['bid_p'] + df['复权因子'].cumsum()\n", "df['ask_p_adj'] = df['ask_p'] + df['复权因子'].cumsum()\n", "df['lastprice_adj'] = df['lastprice'] + df['复权因子'].cumsum()\n", "# df['low_adj'] = df['low'] + df['复权因子'].cumsum()\n", "# df['high_adj'] = df['high'] + df['复权因子'].cumsum()\n", "# df_new =pd.DataFrame({'main_contract':df_new['统一代码'],'symbol':df_new['合约代码'],'datetime':df_new['datetime'],'lastprice':df_new['最新价'],'volume':df_new['volume'],\n", "# 'bid_p':df_new['申买价一'],'ask_p':df_new['申卖量一'],'bid_v':df_new['申买量一'],'ask_v':df_new['申卖量一']})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(df['复权因子'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "contains_null = df.isnull().values.any()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(contains_null)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 查找换期需要复权的索引\n", "non_zero_indices = df[df['复权因子'] != 0].index\n", "print(non_zero_indices)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 查看未调整买价、卖价和最新价的数据\n", "df.loc[non_zero_indices[0]-5:non_zero_indices[0]+5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 将调整后的数值替换原来的值\n", "df['bid_p'] = df['bid_p_adj']\n", "df['ask_p'] = df['ask_p_adj']\n", "df['lastprice'] = df['lastprice_adj']\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 查看调整买价、卖价和最新价的数据\n", "df.loc[non_zero_indices[0]-5:non_zero_indices[0]+5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 删除多余的值\n", "del df['复权因子']\n", "del df['bid_p_adj']\n", "del df['ask_p_adj']\n", "del df['lastprice_adj']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.loc[non_zero_indices[0]-5:non_zero_indices[0]+5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.to_csv('./ru888.csv', index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "del df" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "import datetime as datetime\n", "import pyarrow as pa\n", "import pyarrow.feather as feather" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# feature 测试\n", "df = pd.read_csv('E:/data/ru/ru888.csv',encoding='UTF-8',parse_dates=['datetime'])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "feather.write_feather(df, 'df_feather.feather')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "df = feather.read_feather('df_feather.feather')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
main_contractsymboldatetimelastpricevolumebid_pask_pbid_vask_v
44287432ru888ru24052023-12-29 14:59:58.5006755.0276750.06755.012815
44287433ru888ru24052023-12-29 14:59:59.0006760.0276755.06760.0214
44287434ru888ru24052023-12-29 14:59:59.5006760.0176760.06765.03533
44287435ru888ru24052023-12-29 15:00:00.0006760.066760.06765.04542
44287436ru888ru24052023-12-29 15:00:00.5006760.006760.06765.04542
\n", "
" ], "text/plain": [ " main_contract symbol datetime lastprice volume \\\n", "44287432 ru888 ru2405 2023-12-29 14:59:58.500 6755.0 27 \n", "44287433 ru888 ru2405 2023-12-29 14:59:59.000 6760.0 27 \n", "44287434 ru888 ru2405 2023-12-29 14:59:59.500 6760.0 17 \n", "44287435 ru888 ru2405 2023-12-29 15:00:00.000 6760.0 6 \n", "44287436 ru888 ru2405 2023-12-29 15:00:00.500 6760.0 0 \n", "\n", " bid_p ask_p bid_v ask_v \n", "44287432 6750.0 6755.0 128 15 \n", "44287433 6755.0 6760.0 2 14 \n", "44287434 6760.0 6765.0 35 33 \n", "44287435 6760.0 6765.0 45 42 \n", "44287436 6760.0 6765.0 45 42 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.tail()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 2 }