Files

336 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import pandas as pd
import os
from datetime import time as s_time
from datetime import datetime
import chardet
# Time-of-day values shared by the product tables below.
_T_15_00 = s_time(15, 0)
_T_23_00 = s_time(23, 0)
_T_01_00 = s_time(1, 0)
_T_02_30 = s_time(2, 30)

# Commodity futures products handled with the day-session-only rules.
# NOTE(review): 'l' is also a key of commodity_night_dict. The night entry wins
# in all_dict, and the filtering code checks the night table first, so this day
# entry is effectively unused -- confirm whether a different code was intended.
commodity_day_dict = dict.fromkeys(
    (
        'bb', 'jd', 'lh', 'l', 'fb', 'ec',
        'AP', 'CJ', 'JR', 'LR', 'RS', 'PK',
        'PM', 'PX', 'RI', 'ao', 'br', 'wr',
    ),
    _T_15_00,
)

# Commodity futures products with a night session, mapped to the time at which
# that night session stops.
commodity_night_dict = {
    'sc': _T_02_30, 'bc': _T_01_00, 'lu': _T_23_00, 'nr': _T_23_00,
    'au': _T_02_30, 'ag': _T_02_30,
    'ss': _T_01_00, 'sn': _T_01_00, 'ni': _T_01_00, 'pb': _T_01_00,
    'zn': _T_01_00, 'al': _T_01_00, 'cu': _T_01_00,
    'ru': _T_23_00, 'rb': _T_23_00, 'hc': _T_23_00, 'fu': _T_23_00,
    'bu': _T_23_00, 'sp': _T_23_00,
    'PF': _T_23_00, 'SR': _T_23_00, 'CF': _T_23_00, 'CY': _T_23_00,
    'RM': _T_23_00, 'MA': _T_23_00,
    'TA': _T_23_00, 'ZC': _T_23_00, 'FG': _T_23_00, 'OI': _T_23_00,
    'SA': _T_23_00,
    'p': _T_23_00, 'j': _T_23_00, 'jm': _T_23_00, 'i': _T_23_00,
    'l': _T_23_00, 'v': _T_23_00,
    'pp': _T_23_00, 'eg': _T_23_00, 'c': _T_23_00, 'cs': _T_23_00,
    'y': _T_23_00, 'm': _T_23_00,
    'a': _T_23_00, 'b': _T_23_00, 'rr': _T_23_00, 'eb': _T_23_00,
    'pg': _T_23_00,
}

# Financial futures products.
financial_time_dict = dict.fromkeys(
    ('IH', 'IF', 'IC', 'IM', 'T', 'TS', 'TF', 'TL'),
    _T_15_00,
)

# Every product code known to the filters; on a duplicate key the later table
# (night, then financial) overrides the earlier one.
all_dict = {**commodity_day_dict, **commodity_night_dict, **financial_time_dict}
# def has_common_keys(*dicts):
# keys_union = set().union(*dicts) # 计算所有字典键的并集
# keys_intersection = set().intersection(*dicts) # 计算所有字典键的交集
# return len(keys_intersection) > 0
# has_common_keys(commodity_day_dict, commodity_night_dict,financial_time_dict)
# import chardet
# # 假设file_path是你要读取的文件路径
# with open(file_path, 'rb') as file:
# data = file.read()
# # 使用chardet检测编码
# detected_encoding = chardet.detect(data)['encoding']
# # 如果检测到的编码不是gbk可以尝试转换编码后再读取
# if detected_encoding and detected_encoding != 'gbk':
# with open(file_path, 'rb') as file:
# data = file.read().decode(detected_encoding)
def split_alpha_numeric(string):
    """Separate the letters and the digits of *string*.

    Returns a ``(letters, digits)`` tuple of two strings. Each keeps its
    characters in their original order; characters that are neither
    alphabetic nor digits are discarded.
    """
    letters = "".join(ch for ch in string if ch.isalpha())
    digits = "".join(ch for ch in string if not ch.isalpha() and ch.isdigit())
    return letters, digits
def find_files(all_csv_files,
               old_markers=('_2019', '_2020', '_2021'),
               new_markers=('_2022', '_2023')):
    """Split CSV file names into "old" and "new" groups by substring marker.

    Args:
        all_csv_files: iterable of file names.
        old_markers: substrings identifying the old group. A single string
            is treated as one marker. Defaults to the 2019-2021 year tags.
        new_markers: substrings identifying the new group. A single string
            is treated as one marker. Defaults to the 2022-2023 year tags.

    Returns:
        ``(csv_old_files, csv_new_files)``: two lists, each sorted by file
        name. A name matching markers of both groups appears in both lists;
        a name matching neither appears in none.
    """
    # Guard against a bare string being iterated character by character.
    if isinstance(old_markers, str):
        old_markers = (old_markers,)
    if isinstance(new_markers, str):
        new_markers = (new_markers,)
    old_markers = tuple(old_markers)
    new_markers = tuple(new_markers)

    ordered_names = sorted(all_csv_files)
    csv_old_files = [
        name for name in ordered_names
        if any(marker in name for marker in old_markers)
    ]
    csv_new_files = [
        name for name in ordered_names
        if any(marker in name for marker in new_markers)
    ]
    return csv_old_files, csv_new_files
# Closed (start, end) time-of-day windows that are kept for each product family.
_FINANCIAL_SESSIONS = (
    (s_time(9, 30), s_time(11, 30)),
    (s_time(13, 0), s_time(15, 0)),
)
_COMMODITY_DAY_SESSIONS = (
    (s_time(9, 0), s_time(10, 15)),
    (s_time(10, 30), s_time(11, 30)),
    (s_time(13, 30), s_time(15, 0)),
)
_NIGHT_SESSION_OPEN = s_time(21, 0)
_DAY_SESSION_OPEN = s_time(9, 0)

# Positional columns read from the "new" CSV layout and the names given to them.
_NEW_CSV_USECOLS = [0, 1, 4, 11, 20, 21, 22, 23, 24, 25, 43]
_NEW_CSV_NAMES = [
    "交易日",
    "合约代码",
    "最新价",
    "数量",
    "最后修改时间",
    "最后修改毫秒",
    "申买价一",
    "申买量一",
    "申卖价一",
    "申卖量一",
    "业务日期",
]


def _time_offset(clock):
    """Convert a ``datetime.time`` into a ``pd.Timedelta`` since midnight."""
    return pd.Timedelta(hours=clock.hour, minutes=clock.minute,
                        seconds=clock.second, microseconds=clock.microsecond)


def _session_windows(alpha_chars, code_value):
    """Return the (start, end) offsets-from-midnight to keep for a product.

    The tables are consulted in the order financial -> night commodity ->
    day commodity, and the rule that was selected is printed.

    Raises:
        ValueError: the product is in none of the tables, or its night-session
            end time is not a time at/after 21:00 or before 09:00.
    """
    if alpha_chars in financial_time_dict:
        print("按照中金所交易时间筛选金融期货品种")
        return [(_time_offset(a), _time_offset(b)) for a, b in _FINANCIAL_SESSIONS]

    day_windows = [(_time_offset(a), _time_offset(b)) for a, b in _COMMODITY_DAY_SESSIONS]

    if alpha_chars in commodity_night_dict:
        night_close = commodity_night_dict[alpha_chars]
        same_evening = isinstance(night_close, s_time) and night_close >= _NIGHT_SESSION_OPEN
        after_midnight = isinstance(night_close, s_time) and night_close < _DAY_SESSION_OPEN
        if not (same_evening or after_midnight):
            raise ValueError("夜盘截止交易时间未设置或者设置错误!!!")
        night_open = _time_offset(_NIGHT_SESSION_OPEN)
        if same_evening:
            day_windows.append((night_open, _time_offset(night_close)))
        else:
            # The session wraps past midnight: keep 21:00 to end of day, plus
            # midnight to the configured end time.
            day_windows.append((night_open, pd.Timedelta(days=1)))
            day_windows.append((pd.Timedelta(0), _time_offset(night_close)))
        print("按照夜盘截止交易时间为%d:%02d筛选商品期货品种"
              % (night_close.hour, night_close.minute))
        return day_windows

    if alpha_chars in commodity_day_dict:
        print("按照无夜盘筛选商品期货品种")
        return day_windows

    raise ValueError("%s期货品种未列入筛选条件中!!!" % (code_value))


def _keep_trading_hours(tick_df, alpha_chars, code_value):
    """Drop ticks outside the product's sessions and sort rows by 'datetime'.

    Window boundaries are inclusive. Rows whose 'datetime' is NaT fall in no
    window and are dropped. Whole rows are sorted (stable), so every timestamp
    stays attached to its own price and volume values.
    """
    windows = _session_windows(alpha_chars, code_value)
    stamps = tick_df['datetime']
    since_midnight = stamps - stamps.dt.normalize()
    in_session = pd.Series(False, index=tick_df.index)
    for window_start, window_end in windows:
        in_session |= (since_midnight >= window_start) & (since_midnight <= window_end)
    return tick_df.loc[in_session].sort_values('datetime', kind='mergesort')


def _read_new_tick_csv(csv_name):
    """Read one "new"-layout tick CSV as GBK, retrying with a detected encoding.

    Returns the DataFrame, or None when the file cannot be decoded/parsed even
    with the encoding reported by chardet (the file is then skipped).
    """
    read_kwargs = dict(
        header=0,
        usecols=_NEW_CSV_USECOLS,
        names=_NEW_CSV_NAMES,
        # The millisecond column is numeric and must not be parsed as a date.
        parse_dates=['业务日期', '最后修改时间'],
    )
    try:
        return pd.read_csv(csv_name, encoding='gbk', **read_kwargs)
    except ValueError:
        # UnicodeDecodeError and pandas parser errors are ValueError subclasses.
        with open(csv_name, 'rb') as raw_file:
            detected_encoding = chardet.detect(raw_file.read())['encoding']
        print("当前读取文件读取错误:", csv_name)
        print("当前读取文件正确解码格式", detected_encoding)
    if not detected_encoding:
        return None
    try:
        return pd.read_csv(csv_name, encoding=detected_encoding, **read_kwargs)
    except (ValueError, LookupError):
        return None


def merged_old_tickdata(all_csv_files, sp_char):
    """Build the filtered continuous-contract tick frame from "old"-layout CSVs.

    The files are merged by ``merged_old_unprocessed_tickdata``; its result is
    renamed to the common column set, restricted to the product's trading
    sessions and sorted by time.

    Args:
        all_csv_files: iterable of CSV file names.
        sp_char: substring selecting the files to merge (also used in messages).

    Returns:
        ``(merged_df, code_value)`` where ``merged_df`` has the columns
        main_contract, symbol, datetime, lastprice, volume, bid_p, ask_p,
        bid_v, ask_v and ``code_value`` is the product letters + "888".

    Raises:
        ValueError: no merged data is available, or the product is not listed
            in the session tables.
    """
    raw_df = merged_old_unprocessed_tickdata(all_csv_files, sp_char)
    if raw_df is None or raw_df.empty:
        raise ValueError("merged_old_unprocessed_tickdata未返回%s的tick数据!!!" % (sp_char))

    alpha_chars, _ = split_alpha_numeric(str(raw_df['合约代码'].iloc[0]))
    # Continuous main-contract code: product letters followed by 888.
    code_value = alpha_chars + "888"
    print("code_value characters:", code_value)
    if alpha_chars not in all_dict:
        raise ValueError("%s期货品种未列入所有筛选条件中!!!" % (code_value))

    merged_df = pd.DataFrame({
        'main_contract': code_value,
        'symbol': raw_df['合约代码'],
        'datetime': pd.to_datetime(raw_df['时间']),
        'lastprice': raw_df['最新'],
        'volume': raw_df['成交量'],
        'bid_p': raw_df['买一价'],
        'ask_p': raw_df['卖一价'],
        'bid_v': raw_df['买一量'],
        'ask_v': raw_df['卖一量'],
    })
    # Remove ticks outside trading hours, then order rows by time.
    merged_df = _keep_trading_hours(merged_df, alpha_chars, code_value)
    print("%s%s数据生成成功!" % (code_value, sp_char))
    return merged_df, code_value


def merged_new_tickdata(all_csv_files, sp_char):
    """Build the filtered continuous-contract tick frame from "new"-layout CSVs.

    Every file whose name contains ``sp_char`` is read, de-duplicated and
    concatenated; a timestamp is assembled from the date, time and millisecond
    columns; per-tick volume is derived from the cumulative volume column; and
    the result is restricted to the product's trading sessions and sorted.

    Args:
        all_csv_files: iterable of CSV file names.
        sp_char: substring selecting the files to merge (also used in messages).

    Returns:
        ``(merged_df, code_value)`` with the same layout as
        ``merged_old_tickdata``.

    Raises:
        ValueError: no file could be read, or the product is not listed in the
            session tables.
    """
    csv_files = [name for name in all_csv_files if sp_char in name]
    print("csv_files:", csv_files)

    frames = []
    for csv_name in csv_files:
        frame = _read_new_tick_csv(csv_name)
        if frame is not None:
            frames.append(frame.drop_duplicates())
    if not frames:
        raise ValueError("未读取到包含%s的tick数据文件!!!" % (sp_char))

    # Concatenate once, then drop rows duplicated across files.
    raw_df = pd.concat(frames, ignore_index=True).drop_duplicates().reset_index(drop=True)
    if raw_df.empty:
        raise ValueError("未读取到包含%s的tick数据文件!!!" % (sp_char))

    alpha_chars, _ = split_alpha_numeric(str(raw_df['合约代码'].iloc[0]))
    # Continuous main-contract code: product letters followed by 888.
    code_value = alpha_chars + "888"
    print("code_value characters:", code_value)
    if alpha_chars not in all_dict:
        raise ValueError("%s期货品种未列入所有筛选条件中!!!" % (code_value))

    # Timestamp = calendar date + time of day + milliseconds. Adding the
    # milliseconds as a timedelta avoids the ".5 means 500 ms" ambiguity of
    # concatenating an unpadded number after the decimal point.
    action_day = pd.to_datetime(raw_df['业务日期'], errors='coerce').dt.normalize()
    update_time = pd.to_datetime(raw_df['最后修改时间'], errors='coerce')
    time_of_day = update_time - update_time.dt.normalize()
    millis = pd.to_timedelta(
        pd.to_numeric(raw_df['最后修改毫秒'], errors='coerce'), unit='ms')
    tick_time = action_day + time_of_day + millis

    # Per-tick volume from the cumulative column; the first row gets 0.
    # NOTE(review): the difference is taken in file order across all rows, so
    # it can be negative where the cumulative counter restarts -- confirm
    # whether it should be computed per contract / trading day.
    tick_volume = raw_df['数量'].diff().fillna(0)

    merged_df = pd.DataFrame({
        'main_contract': code_value,
        'symbol': raw_df['合约代码'],
        'datetime': tick_time,
        'lastprice': raw_df['最新价'],
        'volume': tick_volume,
        'bid_p': raw_df['申买价一'],
        'ask_p': raw_df['申卖价一'],
        'bid_v': raw_df['申买量一'],
        'ask_v': raw_df['申卖量一'],
    })
    # Remove ticks outside trading hours, then order rows by time.
    merged_df = _keep_trading_hours(merged_df, alpha_chars, code_value)
    print("%s%s数据生成成功!" % (code_value, sp_char))
    return merged_df, code_value
def _read_old_tick_csv(csv_name):
    """Read one "old"-layout tick CSV as GBK, retrying with a detected encoding.

    Returns the DataFrame, or None when the file cannot be decoded/parsed even
    with the encoding reported by chardet (the file is then skipped).
    """
    try:
        # The first row is the header.
        return pd.read_csv(csv_name, header=0, encoding='gbk')
    except ValueError:
        # UnicodeDecodeError and pandas parser errors are ValueError subclasses.
        with open(csv_name, 'rb') as raw_file:
            detected_encoding = chardet.detect(raw_file.read())['encoding']
        print("当前读取文件读取错误:", csv_name)
        print("当前读取文件正确解码格式", detected_encoding)
    if not detected_encoding:
        return None
    try:
        return pd.read_csv(csv_name, header=0, encoding=detected_encoding)
    except (ValueError, LookupError):
        return None


def merged_old_unprocessed_tickdata(all_csv_files, sp_char):
    """Merge the "old"-layout tick CSVs for ``sp_char`` and save them sorted.

    Every file whose name contains ``sp_char`` is read, de-duplicated and
    concatenated. A "统一代码" column (product letters + "888") is inserted as
    the second column, the "时间" column is converted to datetimes, and the
    rows, sorted by "时间", are written to
    ``合成tick数据2019-2021/<code><sp_char>.csv``.

    Args:
        all_csv_files: iterable of CSV file names.
        sp_char: substring selecting the files to merge; also appended to the
            output file name.

    Returns:
        The merged DataFrame sorted by "时间" (the same data written to disk).

    Raises:
        ValueError: no matching file could be read.
    """
    csv_files = [name for name in all_csv_files if sp_char in name]
    print("csv_files:", csv_files)

    frames = []
    for csv_name in csv_files:
        frame = _read_old_tick_csv(csv_name)
        if frame is not None:
            frames.append(frame.drop_duplicates())
    if not frames:
        raise ValueError("未读取到包含%s的tick数据文件!!!" % (sp_char))

    # Concatenate once, then drop rows duplicated across files.
    merged_df = pd.concat(frames, ignore_index=True).drop_duplicates().reset_index(drop=True)
    if merged_df.empty:
        raise ValueError("未读取到包含%s的tick数据文件!!!" % (sp_char))

    alpha_chars, _ = split_alpha_numeric(str(merged_df['合约代码'].iloc[0]))
    # Continuous main-contract code: product letters followed by 888.
    code_value = alpha_chars + "888"
    print("code_value characters:", code_value)
    merged_df.insert(loc=1, column="统一代码", value=code_value)

    # Save the merged data; create the output folder if it is missing.
    folder_path = "合成tick数据2019-2021"
    os.makedirs(folder_path, exist_ok=True)

    merged_df['时间'] = pd.to_datetime(merged_df['时间'])
    # Stable sort: ticks sharing a timestamp keep their file order.
    sorted_merged_df = merged_df.sort_values(by=['时间'], ascending=True, kind='mergesort')
    sorted_merged_df.to_csv(
        os.path.join(folder_path, '%s%s.csv' % (code_value, sp_char)), index=False)

    print("CSV文件合并成功")
    return sorted_merged_df