| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275 |
- import pandas as pd
- import numpy as np
- from jqdata import *
- import datetime
- # from datetime import timedelta, datetime
- import matplotlib.pyplot as plt
def get_all_key_info(the_type):
    """Return the rows of one security type whose name marks a main contract.

    Queries ``get_all_securities`` for ``the_type``, keeps only the rows whose
    ``display_name`` ends with '主力合约', and renames the ``index`` column
    produced by ``reset_index()`` to ``code``.

    Args:
        the_type: Security type string forwarded to ``get_all_securities``
            (the callers in this file pass ``'futures'``).

    Returns:
        A ``(frame, codes)`` tuple. ``frame`` is the filtered DataFrame.
        ``codes`` holds the unique values of its ``code`` column when
        ``the_type`` is ``'futures'`` and is ``None`` for any other type.
    """
    securities = get_all_securities(types=[the_type]).reset_index()
    is_main = securities.display_name.str.endswith('主力合约')
    # rename() without inplace builds a fresh frame. Renaming the filtered
    # slice in place is chained assignment and makes pandas emit
    # SettingWithCopyWarning.
    the_main = securities[is_main].rename(columns={'index': 'code'})

    # Only the futures type also yields the list of contract codes.
    if the_type != 'futures':
        return the_main, None
    return the_main, the_main['code'].unique()
-
def get_the_period_of_year_range(start_year, end_year):
    """Find the first and last trading days inside a span of calendar years.

    The span runs from 1 January of ``start_year`` to 31 December of
    ``end_year``. When ``end_year`` is the current year and that 31 December
    is still in the future, the span ends at the current moment instead.

    Args:
        start_year: First calendar year of the span.
        end_year: Last calendar year of the span.

    Returns:
        A ``(first_trade_day, last_trade_day)`` tuple taken from the first and
        last entries of ``get_trade_days`` for the span. The pair is also
        printed.
    """
    today = datetime.datetime.now()
    span_start = datetime.datetime(start_year, 1, 1)
    year_end = datetime.datetime(end_year, 12, 31)

    # Do not ask the calendar for days that have not happened yet.
    ends_in_future = end_year == today.year and today < year_end
    span_end = today if ends_in_future else year_end

    # The trading calendar decides the real boundaries of the span.
    trade_days = get_trade_days(start_date=span_start, end_date=span_end)
    actual_start_date, actual_end_date = trade_days[0], trade_days[-1]
    print(f'start_date: {actual_start_date}, end_date: {actual_end_date}')
    return actual_start_date, actual_end_date
def get_one_future(future, unit, start_date, end_date, export=False):
    """Fetch the price history of one contract, optionally saving it as CSV.

    Args:
        future: Contract code passed to ``get_price``. The part before the
            first '.' is used in the export file name.
        unit: Bar frequency. Only ``'daily'`` is implemented.
        start_date: Start of the requested range, forwarded to ``get_price``.
        end_date: End of the requested range, forwarded to ``get_price``. It
            must provide ``strftime`` when ``export`` is true.
        export: When true, write the result to
            ``<code>_<unit>_<end date>.csv`` in the working directory.

    Returns:
        The ``get_price`` result for the fields close, open, low, high, volume
        and money, with rows containing missing values dropped.

    Raises:
        ValueError: If ``unit`` is not ``'daily'``.
    """
    # Fail early and clearly: without this guard an unsupported unit left
    # ``data`` unassigned and surfaced as an unrelated UnboundLocalError.
    if unit != 'daily':
        raise ValueError(f"Unsupported unit {unit!r}; only 'daily' is implemented")

    data = get_price(
        future,
        start_date=start_date,
        end_date=end_date,
        frequency=unit,
        fields=['close', 'open', 'low', 'high', 'volume', 'money'],
        skip_paused=False,
        panel=False,
    ).dropna()

    if export:
        filename = f"{future.split('.')[0]}_{unit}_{end_date.strftime('%Y-%m-%d')}.csv"
        data.to_csv(filename)
    return data
def get_predict_input(key_list, start_day, latest_day, period, export=False):
    """Collect daily prices for several contracts into one indexed frame.

    Each code in ``key_list`` is fetched with ``get_one_future`` for the range
    ``start_day``..``latest_day``. A contract is kept only when it has at
    least ``period + 1`` rows; otherwise a message is printed and it is left
    out.

    Args:
        key_list: Iterable of contract codes.
        start_day: Start of the range passed to ``get_one_future``.
        latest_day: End of the range; also used in the export file name.
        period: Minimum history length; ``period + 1`` rows are required.
        export: When true, write the result to ``Input_<period>_<date>.csv``.

    Returns:
        The concatenated frame indexed by ``['Code', 'Date']``.
    """
    all_data = []
    required_rows = period + 1
    for key in key_list:
        idata = get_one_future(key, 'daily', start_day, latest_day, False)
        if len(idata) < required_rows:
            print(f'The future {key} has only {len(idata)} data points')
            continue
        idata['Code'] = key
        all_data.append(idata)

    # Stack the per-contract frames, then expose the former index as 'Date'
    # and key every row by (Code, Date).
    data = (
        pd.concat(all_data)
        .reset_index()
        .rename(columns={'index': 'Date'})
        .set_index(['Code', 'Date'])
    )

    if export:
        filename = f'Input_{period}_{latest_day.strftime("%Y-%m-%d")}.csv'
        data.to_csv(filename)
    return data
def get_train_input(work_future_list, all_future_df, latest_day, limit, export=False):
    """Collect the full daily history of several contracts for training.

    For each code the start date is read from the ``start_date`` column of the
    ``all_future_df`` row whose ``code`` matches, and prices are fetched with
    ``get_one_future`` from that date to ``latest_day``. A contract is kept
    only when it has at least ``limit`` rows; otherwise a message is printed
    and it is left out.

    Args:
        work_future_list: Iterable of contract codes.
        all_future_df: Frame with ``code`` and ``start_date`` columns.
        latest_day: End of the range; also used in the export file name.
        limit: Minimum number of rows a contract needs to be included.
        export: When true, write the result to ``TrainData_<date>.csv``.

    Returns:
        The concatenated frame indexed by ``['Code', 'Date']``.
    """
    all_data = []
    for future in work_future_list:
        matches_code = all_future_df.code == future
        start_date = all_future_df.loc[matches_code, 'start_date'].values[0]
        idata = get_one_future(future, 'daily', start_date, latest_day, False)
        if len(idata) < limit:
            print(f'The future {future} has only {len(idata)} data points')
            continue
        idata['Code'] = future
        all_data.append(idata)

    # Stack the per-contract frames, then expose the former index as 'Date'
    # and key every row by (Code, Date).
    data = (
        pd.concat(all_data)
        .reset_index()
        .rename(columns={'index': 'Date'})
        .set_index(['Code', 'Date'])
    )

    if export:
        filename = f'TrainData_{latest_day.strftime("%Y-%m-%d")}.csv'
        data.to_csv(filename)
    return data
def process_data(df, std_dev_multiplier=3):
    """Summarise per-contract price changes into a daily table without outliers.

    Note that ``df`` is modified in place: the ``change`` and ``openChange``
    columns are added and rows where either is missing are dropped.

    Args:
        df: Frame with ``close`` and ``open`` columns in which ``Code`` and
            ``Date`` can be used as group keys (the callers in this file pass
            a frame indexed by ``['Code', 'Date']``).
        std_dev_multiplier: A day is treated as an outlier when one of its
            rolling averages lies more than this many standard deviations
            away from the mean of that rolling average.

    Returns:
        A frame with one row per date and the columns ``Average Change``,
        ``Total Count``, ``positive_change_count``, ``negative_change_count``,
        ``Average Change_5`` and ``Average Change_10``. Days without a full
        10-day window and outlier days are removed.
    """
    # Percentage change of the close, computed separately for each contract.
    df['change'] = df.groupby('Code')['close'].pct_change()
    # Absolute gap between today's open and the same contract's previous
    # close. shift() keeps the original index, so the result aligns with df
    # for any number of contracts, unlike a groupby.apply round trip.
    previous_close = df.groupby('Code')['close'].shift(1)
    df['openChange'] = (df['open'] - previous_close).abs()
    # The first row of every contract has no previous close.
    df.dropna(subset=['change', 'openChange'], inplace=True)

    # Per-day mean change, number of contracts, and number of gainers/losers.
    summary = df.groupby('Date')['change'].agg(
        ['mean', 'count', positive_change_count, negative_change_count]
    )
    summary.rename(columns={'mean': 'Average Change', 'count': 'Total Count'}, inplace=True)

    # Rolling averages of the daily mean change.
    for col in ['Average Change']:
        for window in [5, 10]:
            summary[f'{col}_{window}'] = summary[col].rolling(window=window).mean()
    # Drop the leading days that do not have a full window yet.
    summary.dropna(inplace=True)

    # Report and remove outlier days, one rolling column at a time.
    for col in ['Average Change_5', 'Average Change_10']:
        mean = summary[col].mean()
        threshold = std_dev_multiplier * summary[col].std()
        is_outlier = (summary[col] < mean - threshold) | (summary[col] > mean + threshold)
        if is_outlier.any():
            print(f"异常数据 in '{col}':\n{summary[is_outlier]}")
        # Keep exactly the rows that were not reported. A separate strict
        # "inside the band" test would also discard boundary values, and with
        # zero variance it would discard every row.
        summary = summary[~is_outlier]
    return summary


# Helper aggregations for counting positive and negative changes.
def positive_change_count(series):
    """Return how many values in ``series`` are strictly greater than zero."""
    return (series > 0).sum()


def negative_change_count(series):
    """Return how many values in ``series`` are strictly less than zero."""
    return (series < 0).sum()
def plot_data(data, columns):
    """Draw the given columns of ``data`` as lines against its index.

    Args:
        data: Frame whose index supplies the x values.
        columns: Names of the columns to draw, one line per column.

    A dashed red horizontal line marks zero. The figure is shown with
    ``plt.show()`` and nothing is returned.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    x_values = data.index
    for col in columns:
        ax.plot(x_values, data[col], label=col)
    # Reference line separating positive from negative values.
    ax.axhline(0, color='red', linestyle='--', label='Zero')
    ax.set_xlabel('Date')
    ax.set_ylabel('Value')
    ax.set_title('Trends Over Time')
    ax.legend()
    plt.xticks(rotation=45)
    ax.grid(True)
    plt.show()
# ---- Fixed-year analysis using the helper functions above ----
# The contract universe and ``period`` are set up first: fetching data before
# ``all_future_list`` and ``period`` exist fails with a NameError.
all_future_df, all_future_list = get_all_key_info('futures')
# Contracts for which no model exists.
except_futures = ['BC9999.XINE', 'CJ9999.XZCE', 'EB9999.XDCE', 'EG9999.XDCE', 'ER9999.XZCE', 'GN9999.XZCE', 'LH9999.XDCE',
                  'LU9999.XINE', 'ME9999.XZCE', 'NR9999.XINE', 'PF9999.XZCE', 'PG9999.XDCE', 'PK9999.XZCE', 'RO9999.XZCE',
                  'RR9999.XDCE', 'SA9999.XZCE', 'SC9999.XINE', 'SP9999.XSGE', 'SS9999.XSGE', 'T9999.CCFX', 'TC9999.XZCE',
                  'TF9999.CCFX', 'TS9999.CCFX', 'UR9999.XZCE', 'WS9999.XZCE', 'WT9999.XZCE']
# Codes that do have a model.
work_future_list = [i for i in all_future_list if i not in except_futures]
period = 130  # Required length of history.

the_start_day, latest_day = get_the_period_of_year_range(2023, 2023)
# Fetch once for every contract; pass work_future_list instead to restrict the
# run to the contracts that have a model.
df = get_predict_input(all_future_list, the_start_day, latest_day, period)
# Export a flat copy. ``df`` itself keeps its (Code, Date) index, which
# process_data relies on for grouping.
df.reset_index().to_csv('test.csv', index=False)

processed_data = process_data(df, 4)
# Show the processed data.
print(f'Processed Data:\n{processed_data.head()}')
# Plot the rolling averages.
plot_data(processed_data, ['Average Change_5', 'Average Change_10'])
# ---- Analysis for the current year ----
all_future_df, all_future_list = get_all_key_info('futures')
# Contracts for which no model exists.
except_futures = ['BC9999.XINE', 'CJ9999.XZCE', 'EB9999.XDCE', 'EG9999.XDCE', 'ER9999.XZCE', 'GN9999.XZCE', 'LH9999.XDCE',
                  'LU9999.XINE', 'ME9999.XZCE', 'NR9999.XINE', 'PF9999.XZCE', 'PG9999.XDCE', 'PK9999.XZCE', 'RO9999.XZCE',
                  'RR9999.XDCE', 'SA9999.XZCE', 'SC9999.XINE', 'SP9999.XSGE', 'SS9999.XSGE', 'T9999.CCFX', 'TC9999.XZCE',
                  'TF9999.CCFX', 'TS9999.CCFX', 'UR9999.XZCE', 'WS9999.XZCE', 'WT9999.XZCE']
# Codes that do have a model.
work_future_list = [i for i in all_future_list if i not in except_futures]
period = 130  # Required length of history.

# Start and end dates used for the prediction input.
# NOTE(review): get_the_period_of_this_year is not defined in the visible part
# of this file - confirm it is provided elsewhere before running this section.
the_start_day, latest_day = get_the_period_of_this_year(period)
print(f'开始时间:{the_start_day}, 结束时间:{latest_day}')
df = get_predict_input(work_future_list, the_start_day, latest_day, period)
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own line and not embedded in the f-string.
print('初始数据结构为:')
df.info()

# Percentage change of the close, computed separately for each contract.
# pct_change() on the groupby keeps the original index, so the result aligns
# with df.
df['change'] = df.groupby('Code')['close'].pct_change()
# The first row of every contract has no previous close. copy() makes the
# result an independent frame so the columns added below do not write into a
# slice.
df = df.dropna(subset=['change']).copy()

# Mean change per day. get_predict_input names the date level 'Date'.
average_change_per_day = df.groupby('Date')['change'].mean()

# Number of contracts that rose and fell on each day.
df['positive_change'] = (df['change'] > 0).astype(int)
df['negative_change'] = (df['change'] < 0).astype(int)
positive_change_count_per_day = df.groupby('Date')['positive_change'].sum()
negative_change_count_per_day = df.groupby('Date')['negative_change'].sum()

# Mean size of the rises and of the falls on each day.
positive_changes_df = df[df['change'] > 0]
negative_changes_df = df[df['change'] < 0]
average_positive_change_per_day = positive_changes_df.groupby('Date')['change'].mean()
average_negative_change_per_day = negative_changes_df.groupby('Date')['change'].mean()

data = pd.concat([
    average_change_per_day,
    positive_change_count_per_day,
    negative_change_count_per_day,
    average_positive_change_per_day,
    average_negative_change_per_day,
], axis=1)
data.columns = [
    'Average Change',
    'Positive Change Count',
    'Negative Change Count',
    'Average Positive Change',
    'Average Negative Change',
]

# 5-day and 10-day rolling means of the daily average and of both counts.
for col in ['Average Change', 'Positive Change Count', 'Negative Change Count']:
    for window in (5, 10):
        data[f'{col}_{window}'] = data[col].rolling(window=window).mean()
# Drop days with any missing value (incomplete windows, or days with no rise
# or no fall).
data = data.dropna()

# Show the finished table.
print(f'最后完成的数据:{data.head()}')

# Line chart comparing the two rolling averages of the daily mean change.
plt.figure(figsize=(10, 6))
plt.plot(data.index, data['Average Change_5'], label='Average Change_5')
plt.plot(data.index, data['Average Change_10'], label='Average Change_10')
# Dashed red reference line at zero.
plt.axhline(0, color='red', linestyle='--', label='Zero')
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Average Change Comparison')
plt.legend()
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
|