{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 使用说明:\n",
    "1. 修改 `DATA_DIR` 为存放原始 CSV 的目录(代码会 `chdir` 到该目录)\n",
    "2. 修改 `OUTPUT_FILE` 为最后输出的文件名称\n",
    "3. 依据情况修改 `sp_chars`(年份关键字)以及 `CSV_LAYOUTS` 中需要保留的列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Settings to adjust before running.\n",
    "DATA_DIR = 'E:/data/ag'  # folder that holds the raw tick CSV files\n",
    "OUTPUT_FILE = 'ag888_2022_2023.csv'  # merged result, written into DATA_DIR by the last cell\n",
    "sp_chars = sorted(['_2022'])  # file-name keywords selecting which years to merge\n",
    "\n",
    "# Year keywords grouped by raw CSV layout; one run must stay within a single group.\n",
    "OLD_YEARS = ['_2019', '_2020', '_2021']\n",
    "NEW_YEARS = ['_2022', '_2023']\n",
    "\n",
    "os.chdir(DATA_DIR)\n",
    "# OUTPUT_FILE sits in the same folder and can match the year keywords, so it is\n",
    "# left out of the inputs; otherwise a re-run would read back its own result.\n",
    "all_csv_files = sorted(\n",
    "    file for file in os.listdir('.')\n",
    "    if file.endswith('.csv') and file != OUTPUT_FILE\n",
    ")\n",
    "print(\"文件中所有CSV文件:\",all_csv_files)\n",
    "print(\"需要筛选的文件名关键字:\",sp_chars)\n",
    "\n",
    "# Pick the column layout used by the loading cell: \"old_type\" or \"new_type\".\n",
    "if not sp_chars:\n",
    "    # all() over an empty list is True, which would silently select old_type.\n",
    "    raise ValueError('sp_chars is empty: list at least one year keyword')\n",
    "if all(char in OLD_YEARS for char in sp_chars):\n",
    "    year_type = 'old_type'\n",
    "    print(\"使用旧年份格式采集!!!\")\n",
    "elif all(char in NEW_YEARS for char in sp_chars):\n",
    "    year_type = 'new_type'\n",
    "    print(\"使用新年份格式采集!!!\")\n",
    "else:\n",
    "    # Stop here so later cells never run with a missing or stale year_type.\n",
    "    raise ValueError(\"文件夹中CSV没有相关年份的数据或者新旧年份混用!!!\")\n",
    "\n",
    "csv_files = [file for file in all_csv_files if any(sp_char in file for sp_char in sp_chars)]\n",
    "print(\"筛选结果后的CSV文件:\",csv_files)\n",
    "if not csv_files:\n",
    "    raise FileNotFoundError(f'no CSV file in {DATA_DIR} matches {sp_chars}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each raw CSV layout: the column positions to keep, the names given to\n",
    "# them, and the three price columns (bid, ask, last) that the rollover\n",
    "# adjustment further down rewrites.\n",
    "CSV_LAYOUTS = {\n",
    "    'old_type': {\n",
    "        'usecols': [1, 2, 3, 4, 8, 13, 14, 15, 16],\n",
    "        'names': [\n",
    "            '统一代码',\n",
    "            '合约代码',\n",
    "            '时间',\n",
    "            '最新',\n",
    "            '成交量',\n",
    "            '买一价',\n",
    "            '卖一价',\n",
    "            '买一量',\n",
    "            '卖一量',\n",
    "        ],\n",
    "        'price_cols': {'bid': '买一价', 'ask': '卖一价', 'last': '最新'},\n",
    "    },\n",
    "    'new_type': {\n",
    "        'usecols': [0, 1, 2, 5, 12, 21, 22, 23, 24, 25, 26, 44],\n",
    "        'names': [\n",
    "            '交易日',\n",
    "            '统一代码',\n",
    "            '合约代码',\n",
    "            '最新价',\n",
    "            '数量',\n",
    "            '最后修改时间',\n",
    "            '最后修改毫秒',\n",
    "            '申买价一',\n",
    "            '申买量一',\n",
    "            '申卖价一',\n",
    "            '申卖量一',\n",
    "            '业务日期',\n",
    "        ],\n",
    "        'price_cols': {'bid': '申买价一', 'ask': '申卖价一', 'last': '最新价'},\n",
    "    },\n",
    "}\n",
    "layout = CSV_LAYOUTS[year_type]\n",
    "\n",
    "# Read every file first and concatenate once (concat inside the loop copies the\n",
    "# growing frame each time). ignore_index gives the merged frame a unique\n",
    "# 0..n-1 index, which the rollover cells below rely on.\n",
    "frames = [\n",
    "    pd.read_csv(\n",
    "        f,\n",
    "        usecols=layout['usecols'],\n",
    "        names=layout['names'],\n",
    "        skiprows=1,  # NOTE(review): presumably skips the file's header row - confirm\n",
    "        encoding='utf-8',\n",
    "    )\n",
    "    for f in csv_files\n",
    "]\n",
    "df = pd.concat(frames, ignore_index=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 查看数据的头部和尾部:head()、tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The index was already reset by ignore_index=True when the files were merged.\n",
    "display(df.head(), df.tail())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Basic overview of the dataframe\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 等比复权,先不考虑\n",
    "\n",
    "```python\n",
    "# df['复权因子'] = df['卖一价'].shift() / df['买一价']\n",
    "df['复权因子'] = np.where(df['合约代码'] != df['合约代码'].shift(), df['卖一价'].shift() / df['买一价'], 1)\n",
    "df['复权因子'] = df['复权因子'].fillna(1)\n",
    "# df['复权因子'].loc[0] = 1\n",
    "df['买一价_adj'] = df['买一价'] * df['复权因子'].cumprod()\n",
    "df['卖一价_adj'] = df['卖一价'] * df['复权因子'].cumprod()\n",
    "df['最新_adj'] = df['最新'] * df['复权因子'].cumprod()\n",
    "# df['low_adj'] = df['low'] * adjust.cumprod()\n",
    "# df['high_adj'] = df['high'] * adjust.cumprod()\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Additive (difference) rollover adjustment.\n",
    "# Price column names depend on the CSV layout, so look them up instead of\n",
    "# hard-coding the new-layout names.\n",
    "bid_col = layout['price_cols']['bid']\n",
    "ask_col = layout['price_cols']['ask']\n",
    "last_col = layout['price_cols']['last']\n",
    "price_cols = [bid_col, ask_col, last_col]\n",
    "\n",
    "# On the first row of each new contract the gap is the previous row's ask\n",
    "# minus this row's bid; every other row gets 0.\n",
    "is_new_contract = df['合约代码'] != df['合约代码'].shift()\n",
    "df['复权因子'] = np.where(is_new_contract, df[ask_col].shift() - df[bid_col], 0)\n",
    "# Row 0 has no previous row, so its gap is NaN; NaN gaps count as no shift.\n",
    "df['复权因子'] = df['复权因子'].fillna(0)\n",
    "\n",
    "# The running total of the gaps is the offset added to every price from that\n",
    "# row onwards; compute it once and reuse it for all three columns.\n",
    "cumulative_offset = df['复权因子'].cumsum()\n",
    "for col in price_cols:\n",
    "    df[col + '_adj'] = df[col] + cumulative_offset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows where a non-zero rollover gap was applied\n",
    "non_zero_indices = df[df['复权因子'] != 0].index\n",
    "print(non_zero_indices)\n",
    "\n",
    "# Window of rows around the first rollover, reused by the inspection cells\n",
    "# below. Guarded so a data set without any rollover does not raise IndexError.\n",
    "if len(non_zero_indices) > 0:\n",
    "    first_rollover = non_zero_indices[0]\n",
    "    preview_rows = slice(max(first_rollover - 5, 0), first_rollover + 5)\n",
    "else:\n",
    "    print('No rollover found; previewing the first rows instead.')\n",
    "    preview_rows = slice(0, 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bid, ask and last price before the adjusted values replace them\n",
    "df.loc[preview_rows]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overwrite the original prices with the adjusted values\n",
    "for col in price_cols:\n",
    "    df[col] = df[col + '_adj']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bid, ask and last price after the replacement\n",
    "df.loc[preview_rows]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the helper columns so only the columns read from the CSVs are saved\n",
    "df = df.drop(columns=['复权因子'] + [col + '_adj' for col in price_cols])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.loc[preview_rows]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Written into DATA_DIR (the working directory set in the config cell)\n",
    "df.to_csv(OUTPUT_FILE, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "orderflow",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}