# 使用说明：
    1.需要修改chdir到当前目录
    2.需要修改最后输出的文件名称
    3.依据情况需要修改保留的列数

In [1]:
import pandas as pd
import numpy as np
import os

In [5]:
os.chdir('E:/data/ag')
all_csv_files = [file for file in os.listdir('.') if file.endswith('.csv')]
all_csv_files = sorted(all_csv_files)
print("文件中所有CSV文件:",all_csv_files)

sp_chars = ['_2022']
sp_chars = sorted(sp_chars)
print("需要筛选的文件名关键字:",sp_chars)

# 设置后面数据的采集对于的行数# 用 "old_type" 或者 "new_type" 区分
if all(char in ['_2019','_2020','_2021'] for char in sp_chars):
    year_type = 'old_type'
    print("使用旧年份格式采集!!!")
elif all(char in ['_2022','_2023'] for char in sp_chars):
    year_type = 'new_type' 
    print("使用新年份格式采集!!!")
else:
    print("文件夹中CSV没有相关年份的数据或者新旧年份混用!!!")

csv_files = [file for file in all_csv_files if any(sp_char in file for sp_char in sp_chars)]
print("筛选结果后的CSV文件:",csv_files)

文件中所有CSV文件: ['ag888_2019.csv', 'ag888_2020.csv', 'ag888_2021.csv', 'ag888_2022.csv', 'ag888_2022_2023.csv', 'ag888_2023.csv']
需要筛选的文件名关键字: ['_2022']
使用新年份格式采集!!!
筛选结果后的CSV文件: ['ag888_2022.csv', 'ag888_2022_2023.csv']


In [None]:
df = pd.DataFrame()
for f in csv_files:
    if year_type == 'old_type':
        df_temp = pd.read_csv(
            f,
            usecols=[1, 2, 3, 4, 8, 13, 14, 15, 16],
            names=[
                "统一代码",
                "合约代码",
                "时间",
                "最新",
                "成交量",
                "买一价",
                "卖一价",
                "买一量",
                "卖一量",
            ],
            skiprows=1,
            encoding="utf-8",
        )
    elif year_type == 'new_type':
            df_temp = pd.read_csv(
        f,
        usecols=[0, 1, 2, 5, 12, 21, 22, 23, 24, 25, 26, 44],
        names=[
            "交易日",
            "统一代码",
            "合约代码",
            "最新价",
            "数量",
            "最后修改时间",
            "最后修改毫秒",
            "申买价一",
            "申买量一",
            "申卖价一",
            "申卖量一",
            "业务日期",
        ],
        skiprows=1,
        encoding="utf-8",
    )

    # df_temp = pd.read_csv(f, usecols=[0,5], names=[
    #                  'datetime',  'volume'])
    df = pd.concat([df, df_temp])

# 查看数据的头部和尾部：head()、tail()

In [None]:
df.reset_index(drop=True, inplace=True)

In [None]:
# 查看dataframe的基本情况
df.info()

# 等比复权,先不考虑
# df['复权因子'] = df['卖一价'].shift() / df['买一价']
df['复权因子'] = np.where(df['合约代码'] != df['合约代码'].shift(), df['卖一价'].shift() / df['买一价'], 1)
df['复权因子'] = df['复权因子'].fillna(1)
# df['复权因子'].loc[0] = 1
df['买一价_adj'] = df['买一价'] * df['复权因子'].cumprod()
df['卖一价_adj'] = df['卖一价'] * df['复权因子'].cumprod()
df['最新_adj'] = df['最新'] * df['复权因子'].cumprod()
# df['low_adj'] = df['low'] * adjust.cumprod()
# df['high_adj'] = df['high'] * adjust.cumprod()

In [None]:
# 等差复权
df['复权因子'] = np.where(df['合约代码'] != df['合约代码'].shift(), df['申卖价一'].shift() - df['申买价一'], 0)
df['复权因子'] = df['复权因子'].fillna(0)
# df['复权因子'].loc[0] = 1
df['申买价一_adj'] = df['申买价一'] + df['复权因子'].cumsum()
df['申卖价一_adj'] = df['申卖价一'] + df['复权因子'].cumsum()
df['最新价_adj'] = df['最新价'] + df['复权因子'].cumsum()
# df['low_adj'] = df['low'] + df['复权因子'].cumsum()
# df['high_adj'] = df['high'] + df['复权因子'].cumsum()

In [None]:
# 查找换期需要复权的索引
non_zero_indices = df[df['复权因子'] != 0].index
print(non_zero_indices)

In [None]:
# 查看未调整买价、卖价和最新价的数据
df.loc[non_zero_indices[0]-5:non_zero_indices[0]+5]

In [None]:
# 将调整后的数值替换原来的值
df['申买价一'] = df['申买价一_adj']
df['申卖价一'] = df['申卖价一_adj']
df['最新价'] = df['最新价_adj']

In [None]:
# 查看调整买价、卖价和最新价的数据
df.loc[non_zero_indices[0]-5:non_zero_indices[0]+5]

In [None]:
# 删除多余的值
del df['复权因子']
del df['申买价一_adj']
del df['申卖价一_adj']
del df['最新价_adj']

In [None]:
df.loc[non_zero_indices[0]-5:non_zero_indices[0]+5]

In [None]:
df.to_csv('./ag888_2022_2023.csv', index=False)