Files
Quant_Code/1.交易策略/999.其他策略/1.松鼠SF08_基于盘口数据的择时趋势策略/使用文档/过程文件/转换合并tick数据(2019-2021).ipynb

274 lines
7.8 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 使用说明:\n",
" 1.需要修改chdir到当前目录\n",
" 2.需要修改最后输出的文件名称\n",
" 3.依据情况需要修改保留的列数"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"文件中所有CSV文件: ['ag888_2019.csv', 'ag888_2020.csv', 'ag888_2021.csv', 'ag888_2022.csv', 'ag888_2022_2023.csv', 'ag888_2023.csv']\n",
"需要筛选的文件名关键字: ['_2022']\n",
"使用新年份格式采集!!!\n",
"筛选结果后的CSV文件: ['ag888_2022.csv', 'ag888_2022_2023.csv']\n"
]
}
],
"source": [
"# Folder holding the per-year tick CSV files; adjust to the local data location.\n",
"DATA_DIR = 'E:/data/ag'\n",
"os.chdir(DATA_DIR)\n",
"all_csv_files = sorted(file for file in os.listdir('.') if file.endswith('.csv'))\n",
"print(\"文件中所有CSV文件:\",all_csv_files)\n",
"\n",
"# Year tags (file-name keywords) of the files to merge.\n",
"sp_chars = ['_2022']\n",
"sp_chars = sorted(sp_chars)\n",
"print(\"需要筛选的文件名关键字:\",sp_chars)\n",
"\n",
"# Year tags grouped by CSV column layout.\n",
"OLD_TYPE_TAGS = ['_2019', '_2020', '_2021']\n",
"NEW_TYPE_TAGS = ['_2022', '_2023']\n",
"\n",
"# year_type ('old_type' or 'new_type') selects the column layout used when the files are read.\n",
"if sp_chars and all(char in OLD_TYPE_TAGS for char in sp_chars):\n",
"    year_type = 'old_type'\n",
"    print(\"使用旧年份格式采集!!!\")\n",
"elif sp_chars and all(char in NEW_TYPE_TAGS for char in sp_chars):\n",
"    year_type = 'new_type'\n",
"    print(\"使用新年份格式采集!!!\")\n",
"else:\n",
"    # Stop here: carrying on would leave year_type undefined, or stale from an earlier run.\n",
"    raise ValueError(\"文件夹中CSV没有相关年份的数据或者新旧年份混用!!!\")\n",
"\n",
"# Keep only single-year source files. A name carrying more than one known year tag\n",
"# (e.g. 'ag888_2022_2023.csv') is a previously merged output with a different column layout.\n",
"known_tags = OLD_TYPE_TAGS + NEW_TYPE_TAGS\n",
"csv_files = [\n",
"    file for file in all_csv_files\n",
"    if sum(tag in file for tag in known_tags) == 1\n",
"    and any(sp_char in file for sp_char in sp_chars)\n",
"]\n",
"print(\"筛选结果后的CSV文件:\",csv_files)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Columns kept for each CSV layout: positional indices (usecols) and the names assigned to them.\n",
"READ_SPECS = {\n",
"    'old_type': {\n",
"        'usecols': [1, 2, 3, 4, 8, 13, 14, 15, 16],\n",
"        'names': [\n",
"            '统一代码', '合约代码', '时间', '最新', '成交量',\n",
"            '买一价', '卖一价', '买一量', '卖一量',\n",
"        ],\n",
"    },\n",
"    'new_type': {\n",
"        'usecols': [0, 1, 2, 5, 12, 21, 22, 23, 24, 25, 26, 44],\n",
"        'names': [\n",
"            '交易日', '统一代码', '合约代码', '最新价', '数量', '最后修改时间',\n",
"            '最后修改毫秒', '申买价一', '申买量一', '申卖价一', '申卖量一', '业务日期',\n",
"        ],\n",
"    },\n",
"}\n",
"\n",
"# Fail with a clear message when year_type is not one of the known layouts.\n",
"if year_type not in READ_SPECS:\n",
"    raise ValueError(f'unknown year_type: {year_type!r}')\n",
"read_spec = READ_SPECS[year_type]\n",
"\n",
"# Read every file first and concatenate once; concatenating inside the loop\n",
"# copies the accumulated rows again on each iteration.\n",
"# skiprows=1 drops each file's own header line because the names are supplied here.\n",
"frames = [\n",
"    pd.read_csv(f, skiprows=1, encoding='utf-8', **read_spec)\n",
"    for f in csv_files\n",
"]\n",
"# ignore_index gives the merged frame one continuous 0..n-1 index.\n",
"# An empty file list yields an empty frame instead of a concat error.\n",
"df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 查看数据的头部和尾部head()、tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Renumber the rows 0..n-1 and discard the old index labels.\n",
"df = df.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 查看dataframe的基本情况\n",
"df.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 等比复权,先不考虑\n",
"\n",
"```python\n",
"# df['复权因子'] = df['卖一价'].shift() / df['买一价']\n",
"df['复权因子'] = np.where(df['合约代码'] != df['合约代码'].shift(), df['卖一价'].shift() / df['买一价'], 1)\n",
"df['复权因子'] = df['复权因子'].fillna(1)\n",
"# df['复权因子'].loc[0] = 1\n",
"df['买一价_adj'] = df['买一价'] * df['复权因子'].cumprod()\n",
"df['卖一价_adj'] = df['卖一价'] * df['复权因子'].cumprod()\n",
"df['最新_adj'] = df['最新'] * df['复权因子'].cumprod()\n",
"# df['low_adj'] = df['low'] * adjust.cumprod()\n",
"# df['high_adj'] = df['high'] * adjust.cumprod()\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Additive (difference-based) price adjustment across contract switches.\n",
"# The two CSV layouts name the price columns differently, so use whichever set is present.\n",
"if '申买价一' in df.columns:\n",
"    bid_col, ask_col, last_col = '申买价一', '申卖价一', '最新价'\n",
"else:\n",
"    bid_col, ask_col, last_col = '买一价', '卖一价', '最新'\n",
"\n",
"# A switch row is one whose contract code differs from the previous row's.\n",
"is_switch = df['合约代码'] != df['合约代码'].shift()\n",
"# Gap at a switch: previous row's ask minus this row's bid; 0 on every other row.\n",
"df['复权因子'] = np.where(is_switch, df[ask_col].shift() - df[bid_col], 0)\n",
"# Rows whose gap cannot be computed (e.g. the first row, which has no predecessor) get 0.\n",
"df['复权因子'] = df['复权因子'].fillna(0)\n",
"\n",
"# Running total of the gaps, computed once and added to each price column.\n",
"cumulative_gap = df['复权因子'].cumsum()\n",
"df[bid_col + '_adj'] = df[bid_col] + cumulative_gap\n",
"df[ask_col + '_adj'] = df[ask_col] + cumulative_gap\n",
"df[last_col + '_adj'] = df[last_col] + cumulative_gap"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Index labels of the rows that carry a non-zero adjustment factor\n",
"has_adjustment = df['复权因子'].ne(0)\n",
"non_zero_indices = df.loc[has_adjustment].index\n",
"print(non_zero_indices)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show the rows around the first adjusted row, before the adjusted prices overwrite the originals.\n",
"# Guard: when no row needs adjusting, non_zero_indices is empty and [0] would raise IndexError.\n",
"first_adjusted = non_zero_indices[0] if len(non_zero_indices) else None\n",
"df.loc[first_adjusted - 5:first_adjusted + 5] if first_adjusted is not None else df.head(0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Replace each original price column with its adjusted counterpart.\n",
"# Every '<name>_adj' column overwrites '<name>', whichever CSV layout's column names are in use.\n",
"for adj_col in [col for col in df.columns if col.endswith('_adj')]:\n",
"    df[adj_col[:-len('_adj')]] = df[adj_col]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show the same rows again, now with the adjusted bid, ask and last prices in place.\n",
"# Guard: when no row needs adjusting, non_zero_indices is empty and [0] would raise IndexError.\n",
"first_adjusted = non_zero_indices[0] if len(non_zero_indices) else None\n",
"df.loc[first_adjusted - 5:first_adjusted + 5] if first_adjusted is not None else df.head(0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Remove the helper columns created for the adjustment: the factor column and every '<name>_adj' column.\n",
"# Only columns that are present are selected, so re-running the cell does not raise KeyError.\n",
"helper_cols = [col for col in df.columns if col == '复权因子' or col.endswith('_adj')]\n",
"df = df.drop(columns=helper_cols)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Final look at the rows around the first adjusted row, after the helper columns are removed.\n",
"# Guard: when no row needed adjusting, non_zero_indices is empty and [0] would raise IndexError.\n",
"first_adjusted = non_zero_indices[0] if len(non_zero_indices) else None\n",
"df.loc[first_adjusted - 5:first_adjusted + 5] if first_adjusted is not None else df.head(0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Export the merged frame; index=False keeps the row index out of the file.\n",
"output_csv = './ag888_2022_2023.csv'\n",
"df.to_csv(output_csv, index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "orderflow",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}