import os
from datetime import time as s_time
from datetime import datetime

import numpy as np
import pandas as pd

try:
    import chardet
except ImportError:
    # chardet is only used to enrich the error log below, so a missing
    # install degrades the log message instead of breaking the merge.
    chardet = None


def split_alpha_numeric(string):
    """Split *string* into its alphabetic and its digit characters.

    Args:
        string: Text to scan, e.g. a contract code such as ``"rb2305"``.

    Returns:
        A ``(alpha_chars, numeric_chars)`` tuple of two strings. Each keeps
        the original character order; characters that are neither alphabetic
        (``str.isalpha``) nor digits (``str.isdigit``) are dropped.
    """
    alpha_chars = "".join(char for char in string if char.isalpha())
    # isalpha() and isdigit() are mutually exclusive, so two independent
    # filters give the same result as an if/elif chain.
    numeric_chars = "".join(char for char in string if char.isdigit())
    return alpha_chars, numeric_chars


def _report_unreadable_csv(file_path, error_count):
    """Print and append to ``output_error.txt`` a note about an unreadable CSV.

    The raw bytes of *file_path* are read so that chardet (when installed)
    can guess the real encoding for the message. An ``OSError`` from reading
    the file (e.g. it does not exist) propagates to the caller.
    """
    with open(file_path, 'rb') as raw_file:
        data = raw_file.read()
    detected_encoding = (
        chardet.detect(data)['encoding'] if chardet is not None else None
    )
    # Build the message once so the console and the log file carry the same
    # timestamp.
    message = "%s:%s当前文件不为gbk格式,其文件格式为%s,需要转换为gbk格式,错误总数为%s" % (
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        file_path,
        detected_encoding,
        error_count,
    )
    print(message)
    # Keep the platform's default encoding (so an existing log stays
    # consistent) but never let an unencodable character abort the merge.
    with open('output_error.txt', 'a', errors='backslashreplace') as log_file:
        print(message, file=log_file)


def merged_old_unprocessed_tickdata(all_csv_files, sp_char):
    """Merge every GBK-encoded CSV whose name contains *sp_char*.

    Args:
        all_csv_files: Iterable of CSV file names/paths (relative paths are
            resolved against the current working directory).
        sp_char: Substring a file name must contain to be merged.

    Returns:
        A ``pandas.DataFrame`` with the rows of all readable files, exact
        duplicate rows removed and a fresh ``0..n-1`` index. An empty
        DataFrame is returned when no file could be merged.

    Files that cannot be parsed as GBK CSV are skipped; each one is reported
    on stdout and appended to ``output_error.txt`` in the working directory.
    """
    csv_files = [name for name in all_csv_files if sp_char in name]
    print("csv_files:", csv_files)

    base_dir = os.getcwd()
    error_count = 0
    frames = []

    for csv_name in csv_files:
        try:
            df = pd.read_csv(
                csv_name,
                header=0,
                encoding='gbk',
                low_memory=False,
            )
        except Exception:
            error_count += 1
            _report_unreadable_csv(os.path.join(base_dir, csv_name), error_count)
            # Skip this file: there is no freshly parsed frame to merge, and
            # falling through would reuse the previous file's frame (or hit
            # a NameError on the first file).
            continue

        # Drop duplicate rows within the file before merging.
        frames.append(df.drop_duplicates())

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    merged_up_df = pd.concat(frames, ignore_index=True)
    # Drop rows duplicated across files, then renumber the index.
    merged_up_df.drop_duplicates(inplace=True)
    merged_up_df.reset_index(inplace=True, drop=True)
    return merged_up_df