"""Daily breadth statistics for futures main contracts (JoinQuant research script).

Downloads daily bars through the ``jqdata`` API, computes the per-day average
percentage change across contracts together with counts of rising and falling
contracts, smooths them with 5- and 10-day rolling means, and plots the result.
"""
import pandas as pd
import numpy as np
# Import order is kept as in the original: the star import comes before
# `import datetime`, so if jqdata exports a `datetime` name it cannot shadow
# the standard-library module used below.
from jqdata import *
import datetime
import matplotlib.pyplot as plt


def get_all_key_info(the_type):
    """Return the main-contract rows for one security type.

    Args:
        the_type: Security type passed to ``get_all_securities`` (the driver
            below uses ``'futures'``).

    Returns:
        A tuple ``(the_main, codes)``. ``the_main`` holds the rows whose
        ``display_name`` ends with '主力合约', with the former index exposed as a
        ``code`` column. ``codes`` is the array of unique codes when
        ``the_type == 'futures'`` and ``None`` otherwise.
    """
    main = get_all_securities(types=[the_type]).reset_index()
    # Non-inplace rename: renaming a filtered slice in place triggers pandas'
    # SettingWithCopyWarning.
    the_main = main[main.display_name.str.endswith('主力合约')].rename(
        columns={'index': 'code'}
    )

    # Only the futures type also returns its list of codes.
    if the_type == 'futures':
        return the_main, the_main['code'].unique()
    return the_main, None


def get_the_period_of_year_range(start_year, end_year):
    """Return the first and last trading day between two calendar years.

    The range runs from January 1 of ``start_year`` to December 31 of
    ``end_year``; when ``end_year`` is the current year the end is clamped to
    now.

    Args:
        start_year: First calendar year (int).
        end_year: Last calendar year (int).

    Returns:
        ``(actual_start_date, actual_end_date)``: the first and last entries of
        the ``get_trade_days`` calendar for the range.

    Raises:
        IndexError: If the trading calendar for the range is empty.
    """
    now = datetime.datetime.now()
    start_date = datetime.datetime(start_year, 1, 1)
    end_date = datetime.datetime(end_year, 12, 31)

    # Do not ask for days that have not happened yet.
    if end_year == now.year and now < end_date:
        end_date = now

    trade_days = get_trade_days(start_date=start_date, end_date=end_date)
    actual_start_date = trade_days[0]
    actual_end_date = trade_days[-1]

    print(f'start_date: {actual_start_date}, end_date: {actual_end_date}')
    return actual_start_date, actual_end_date


def get_one_future(future, unit, start_date, end_date, export=False):
    """Fetch bars for one contract, dropping rows that contain NaN.

    Args:
        future: Contract code, e.g. ``'BC9999.XINE'``.
        unit: Bar frequency. Only ``'daily'`` is implemented.
        start_date: First day to request.
        end_date: Last day to request. Must provide ``strftime`` when
            ``export`` is true.
        export: When true, also write the frame to
            ``<code prefix>_<unit>_<end date>.csv`` in the working directory.

    Returns:
        The frame returned by ``get_price`` for the fields close, open, low,
        high, volume and money, after ``dropna()``.

    Raises:
        ValueError: If ``unit`` is not ``'daily'``. (Previously this path
            failed with an unbound-local error on ``data``.)
    """
    if unit != 'daily':
        raise ValueError(f"Unsupported unit {unit!r}; only 'daily' is implemented")

    data = get_price(
        future,
        start_date=start_date,
        end_date=end_date,
        frequency=unit,
        fields=['close', 'open', 'low', 'high', 'volume', 'money'],
        skip_paused=False,
        panel=False,
    ).dropna()

    if export:
        filename = f"{future.split('.')[0]}_{unit}_{end_date.strftime('%Y-%m-%d')}.csv"
        data.to_csv(filename)

    return data


def _combine_frames(frames, filename=None):
    """Stack per-contract frames into one frame indexed by (Code, Date).

    Each frame must already carry a ``Code`` column; the former index column
    (named ``index`` after ``reset_index``) becomes ``Date``. When ``filename``
    is given the combined frame is also written there as CSV.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    if not frames:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError('No future had enough data points to build the input')

    data = pd.concat(frames)
    data.reset_index(inplace=True)
    data.rename(columns={'index': 'Date'}, inplace=True)
    data.set_index(['Code', 'Date'], inplace=True)

    if filename is not None:
        data.to_csv(filename)
    return data


def get_predict_input(key_list, start_day, latest_day, period, export=False):
    """Build the prediction input for a list of contracts.

    Contracts with fewer than ``period + 1`` daily rows between ``start_day``
    and ``latest_day`` are skipped and reported on stdout.

    Args:
        key_list: Iterable of contract codes.
        start_day: First day to request.
        latest_day: Last day to request.
        period: Required history length; a contract needs ``period + 1`` rows.
        export: When true, write the result to
            ``Input_<period>_<latest_day>.csv``.

    Returns:
        A DataFrame indexed by ``('Code', 'Date')``.

    Raises:
        ValueError: If no contract has enough rows.
    """
    all_data = []
    for key in key_list:
        idata = get_one_future(key, 'daily', start_day, latest_day, False)
        if len(idata) >= period + 1:
            idata['Code'] = key
            all_data.append(idata)
        else:
            print(f'The future {key} has only {len(idata)} data points')

    filename = None
    if export:
        filename = f'Input_{period}_{latest_day.strftime("%Y-%m-%d")}.csv'
    return _combine_frames(all_data, filename)


def get_train_input(work_future_list, all_future_df, latest_day, limit, export=False):
    """Build the training input using each contract's full history.

    For every contract the request starts at the ``start_date`` recorded for
    it in ``all_future_df``. Contracts with fewer than ``limit`` rows are
    skipped and reported on stdout.

    Args:
        work_future_list: Iterable of contract codes.
        all_future_df: Frame with ``code`` and ``start_date`` columns, as
            returned by ``get_all_key_info``.
        latest_day: Last day to request.
        limit: Minimum number of rows a contract must have.
        export: When true, write the result to ``TrainData_<latest_day>.csv``.

    Returns:
        A DataFrame indexed by ``('Code', 'Date')``.

    Raises:
        IndexError: If a code is missing from ``all_future_df``.
        ValueError: If no contract has enough rows.
    """
    all_data = []
    for future in work_future_list:
        start_date = all_future_df.loc[
            all_future_df.code == future, 'start_date'
        ].values[0]
        idata = get_one_future(future, 'daily', start_date, latest_day, False)
        if len(idata) >= limit:
            idata['Code'] = future
            all_data.append(idata)
        else:
            print(f'The future {future} has only {len(idata)} data points')

    filename = None
    if export:
        filename = f'TrainData_{latest_day.strftime("%Y-%m-%d")}.csv'
    return _combine_frames(all_data, filename)


def process_data(df, std_dev_multiplier=3):
    """Summarise per-day price changes across contracts and drop outliers.

    Note:
        ``df`` is modified in place: ``change`` and ``openChange`` columns are
        added and rows where either is NaN are dropped.

    Args:
        df: Frame with ``Code`` and ``Date`` available for grouping (index
            levels or columns) and ``open`` / ``close`` columns.
        std_dev_multiplier: Rows whose rolling average lies this many standard
            deviations or more from the mean are removed.

    Returns:
        A per-date summary with columns ``Average Change``, ``Total Count``,
        ``positive_change_count``, ``negative_change_count``,
        ``Average Change_5`` and ``Average Change_10``.
    """
    # Percentage change of the close within each contract.
    df['change'] = df.groupby('Code')['close'].pct_change()

    # Absolute gap between today's open and the previous close of the same
    # contract. groupby().shift() keeps df's index, so no re-alignment is
    # needed (unlike groupby().apply(), whose result index depends on
    # group_keys and on the number of groups).
    previous_close = df.groupby('Code')['close'].shift(1)
    df['openChange'] = (df['open'] - previous_close).abs()

    df.dropna(subset=['change', 'openChange'], inplace=True)

    # Per-day mean change, row count, and counts of rising/falling contracts.
    summary = df.groupby('Date')['change'].agg(
        ['mean', 'count', positive_change_count, negative_change_count]
    )
    summary.rename(
        columns={'mean': 'Average Change', 'count': 'Total Count'}, inplace=True
    )

    for col in ['Average Change']:
        for window in [5, 10]:
            summary[f'{col}_{window}'] = summary[col].rolling(window=window).mean()

    # The first rows have no complete rolling window.
    summary.dropna(inplace=True)

    # Filter column by column; statistics for the second column are computed
    # on the rows that survived the first pass.
    for col in ['Average Change_5', 'Average Change_10']:
        mean = summary[col].mean()
        threshold = std_dev_multiplier * summary[col].std()
        lower, upper = mean - threshold, mean + threshold

        outliers = summary[(summary[col] < lower) | (summary[col] > upper)]
        if not outliers.empty:
            print(f"异常数据 in '{col}':\n{outliers}")

        summary = summary[(summary[col] > lower) & (summary[col] < upper)]

    return summary


def positive_change_count(series):
    """Return how many values in ``series`` are strictly positive."""
    return (series > 0).sum()


def negative_change_count(series):
    """Return how many values in ``series`` are strictly negative."""
    return (series < 0).sum()


def plot_data(data, columns, title='Trends Over Time'):
    """Plot the given columns against the index with a dashed zero line.

    Args:
        data: DataFrame whose index is used as the x axis.
        columns: Column names to draw, one line each.
        title: Figure title.
    """
    plt.figure(figsize=(10, 6))
    for col in columns:
        plt.plot(data.index, data[col], label=col)
    plt.axhline(0, color='red', linestyle='--', label='Zero')
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.title(title)
    plt.legend()
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.show()


if __name__ == '__main__':
    # Plain statements (not a main() function) so every variable stays a
    # module-level global, as it was when this ran as notebook cells.

    # ---- Shared setup (previously duplicated and used before assignment) ----
    all_future_df, all_future_list = get_all_key_info('futures')
    # Contracts for which no model exists.
    except_futures = [
        'BC9999.XINE', 'CJ9999.XZCE', 'EB9999.XDCE', 'EG9999.XDCE',
        'ER9999.XZCE', 'GN9999.XZCE', 'LH9999.XDCE', 'LU9999.XINE',
        'ME9999.XZCE', 'NR9999.XINE', 'PF9999.XZCE', 'PG9999.XDCE',
        'PK9999.XZCE', 'RO9999.XZCE', 'RR9999.XDCE', 'SA9999.XZCE',
        'SC9999.XINE', 'SP9999.XSGE', 'SS9999.XSGE', 'T9999.CCFX',
        'TC9999.XZCE', 'TF9999.CCFX', 'TS9999.CCFX', 'UR9999.XZCE',
        'WS9999.XZCE', 'WT9999.XZCE',
    ]
    excluded_codes = set(except_futures)
    # Contracts that do have a model.
    work_future_list = [
        code for code in all_future_list if code not in excluded_codes
    ]
    period = 130  # required history length, in rows

    # ---- Calendar year 2023, all main contracts ----
    the_start_day, latest_day = get_the_period_of_year_range(2023, 2023)
    df = get_predict_input(all_future_list, the_start_day, latest_day, period)
    # Dump the raw input; reset_index() returns a copy, so df keeps its
    # (Code, Date) index for process_data.
    df.reset_index().to_csv('test.csv', index=False)

    processed_data = process_data(df, 4)
    print(f'Processed Data:\n{processed_data.head()}')
    plot_data(processed_data, ['Average Change_5', 'Average Change_10'])

    # ---- Current year, modelled contracts only ----
    # NOTE(review): get_the_period_of_this_year is not defined in this file;
    # it must be provided elsewhere before this section can run.
    the_start_day, latest_day = get_the_period_of_this_year(period)
    print(f'开始时间:{the_start_day}, 结束时间:{latest_day}')
    df = get_predict_input(work_future_list, the_start_day, latest_day, period)
    # DataFrame.info() prints its report and returns None, so it must not be
    # embedded in an f-string.
    print('初始数据结构为:')
    df.info()

    # Percentage change per contract (use .diff() instead for absolute change).
    df['change'] = df.groupby('Code')['close'].pct_change()
    df = df.dropna(subset=['change']).copy()

    # get_predict_input indexes by (Code, Date); group per day on 'Date'.
    average_change_per_day = df.groupby('Date')['change'].mean()

    # 1/0 flags so that a per-day sum gives the number of risers/fallers.
    df['positive_change'] = (df['change'] > 0).astype(int)
    df['negative_change'] = (df['change'] < 0).astype(int)
    positive_change_count_per_day = df.groupby('Date')['positive_change'].sum()
    negative_change_count_per_day = df.groupby('Date')['negative_change'].sum()

    # Mean change among risers and among fallers, per day.
    positive_changes_df = df[df['change'] > 0]
    negative_changes_df = df[df['change'] < 0]
    average_positive_change_per_day = (
        positive_changes_df.groupby('Date')['change'].mean()
    )
    average_negative_change_per_day = (
        negative_changes_df.groupby('Date')['change'].mean()
    )

    data = pd.concat(
        [
            average_change_per_day,
            positive_change_count_per_day,
            negative_change_count_per_day,
            average_positive_change_per_day,
            average_negative_change_per_day,
        ],
        axis=1,
    )
    data.columns = [
        'Average Change',
        'Positive Change Count',
        'Negative Change Count',
        'Average Positive Change',
        'Average Negative Change',
    ]

    # 5- and 10-day rolling means of the average change and of both counts.
    for col in ['Average Change', 'Positive Change Count', 'Negative Change Count']:
        for window in (5, 10):
            data[f'{col}_{window}'] = data[col].rolling(window=window).mean()

    data = data.dropna()
    print(f'最后完成的数据:{data.head()}')

    plot_data(
        data,
        ['Average Change_5', 'Average Change_10'],
        title='Average Change Comparison',
    )