# future_all_change_trend.py
  1. import pandas as pd
  2. import numpy as np
  3. from jqdata import *
  4. import datetime
  5. # from datetime import timedelta, datetime
  6. import matplotlib.pyplot as plt
  7. def get_all_key_info(the_type):
  8. main = get_all_securities(types=[the_type]).reset_index()
  9. the_main = main[main.display_name.str.endswith('主力合约')]
  10. the_main.rename(columns={'index': 'code'}, inplace=True)
  11. # 若类型是期货,提取期货代码列表
  12. if the_type == 'futures':
  13. all_future_list = the_main['code'].unique()
  14. return the_main, all_future_list
  15. else:
  16. return the_main, None
  17. def get_the_period_of_year_range(start_year, end_year):
  18. now = datetime.datetime.now()
  19. start_date = datetime.datetime(start_year, 1, 1)
  20. end_date = datetime.datetime(end_year, 12, 31)
  21. # 如果结束年份是当前年份,则将结束日期调整为当前日期
  22. if end_year == now.year and now < end_date:
  23. end_date = now
  24. # 获取交易日历
  25. trade_days = get_trade_days(start_date=start_date, end_date=end_date)
  26. # 获取实际的开始和结束日期
  27. actual_start_date = trade_days[0]
  28. actual_end_date = trade_days[-1]
  29. print(f'start_date: {actual_start_date}, end_date: {actual_end_date}')
  30. return actual_start_date, actual_end_date
  31. def get_one_future(future, unit, start_date, end_date, export=False):
  32. if unit == 'daily':
  33. data = get_price(future, start_date=start_date, end_date=end_date, frequency=unit,
  34. fields=['close','open','low','high','volume','money'], skip_paused=False,
  35. panel=False).dropna()
  36. if export:
  37. filename = f"{future.split('.')[0]}_{unit}_{end_date.strftime('%Y-%m-%d')}.csv"
  38. data.to_csv(filename)
  39. return data
  40. def get_predict_input(key_list, start_day, latest_day, period, export=False):
  41. all_data = []
  42. for key in key_list:
  43. idata = get_one_future(key, 'daily', start_day, latest_day, False)
  44. if len(idata) >= period + 1:
  45. idata['Code'] = key
  46. all_data.append(idata)
  47. else:
  48. print(f'The future {key} has only {len(idata)} data points')
  49. data = pd.concat(all_data)
  50. data.reset_index(inplace=True)
  51. data.rename(columns={'index': 'Date'}, inplace=True)
  52. data.set_index(['Code', 'Date'], inplace=True)
  53. if export:
  54. filename = f'Input_{period}_{latest_day.strftime("%Y-%m-%d")}.csv'
  55. data.to_csv(filename)
  56. return data
  57. def get_train_input(work_future_list, all_future_df, latest_day, limit, export=False):
  58. all_data = []
  59. for future in work_future_list:
  60. start_date = all_future_df.loc[all_future_df.code == future, 'start_date'].values[0]
  61. idata = get_one_future(future, 'daily', start_date, latest_day, False)
  62. if len(idata) >= limit:
  63. idata['Code'] = future
  64. all_data.append(idata)
  65. else:
  66. print(f'The future {future} has only {len(idata)} data points')
  67. data = pd.concat(all_data)
  68. data.reset_index(inplace=True)
  69. data.rename(columns={'index': 'Date'}, inplace=True)
  70. data.set_index(['Code', 'Date'], inplace=True)
  71. if export:
  72. filename = f'TrainData_{latest_day.strftime("%Y-%m-%d")}.csv'
  73. data.to_csv(filename)
  74. return data
  75. def process_data(df, std_dev_multiplier=3):
  76. # 计算价格变化百分比
  77. df['change'] = df.groupby('Code')['close'].pct_change()
  78. # 计算开盘价变化的绝对值
  79. df['openChange'] = df.groupby('Code').apply(lambda x: (x['open'] - x['close'].shift(1)).abs()).reset_index(level=0, drop=True)
  80. # 去除 NaN 值
  81. df.dropna(subset=['change', 'openChange'], inplace=True)
  82. # 计算每日的平均变化、正负变化数量
  83. grouped = df.groupby('Date')
  84. summary = grouped['change'].agg(['mean', 'count', positive_change_count, negative_change_count])
  85. summary.rename(columns={'mean': 'Average Change', 'count': 'Total Count'}, inplace=True)
  86. # 计算滚动平均值
  87. for col in ['Average Change']:
  88. for window in [5, 10]:
  89. summary[f'{col}_{window}'] = summary[col].rolling(window=window).mean()
  90. # 去除滚动平均值的 NaN 值
  91. summary.dropna(inplace=True)
  92. # 过滤异常值
  93. for col in ['Average Change_5', 'Average Change_10']:
  94. mean = summary[col].mean()
  95. std = summary[col].std()
  96. threshold = std_dev_multiplier * std
  97. outliers = summary[(summary[col] < mean - threshold) | (summary[col] > mean + threshold)]
  98. if not outliers.empty:
  99. print(f"异常数据 in '{col}':\n{outliers}")
  100. summary = summary[(summary[col] > mean - threshold) & (summary[col] < mean + threshold)]
  101. return summary
  102. # 正负变化计数的辅助函数
  103. def positive_change_count(series):
  104. return (series > 0).sum()
  105. def negative_change_count(series):
  106. return (series < 0).sum()
  107. def plot_data(data, columns):
  108. plt.figure(figsize=(10, 6))
  109. for col in columns:
  110. plt.plot(data.index, data[col], label=col)
  111. plt.axhline(0, color='red', linestyle='--', label='Zero')
  112. plt.xlabel('Date')
  113. plt.ylabel('Value')
  114. plt.title('Trends Over Time')
  115. plt.legend()
  116. plt.xticks(rotation=45)
  117. plt.grid(True)
  118. plt.show()
  119. # 使用新的函数处理数据
  120. the_start_day, latest_day = get_the_period_of_year_range(2023, 2023)
  121. # df = get_predict_input(work_future_list, the_start_day, latest_day, period)
  122. df = get_predict_input(all_future_list, the_start_day, latest_day, period)
  123. df.reset_index(inplace=True)
  124. df.to_csv('test.csv', index=False)
  125. all_future_df, all_future_list = get_all_key_info('futures')
  126. except_futures = ['BC9999.XINE', 'CJ9999.XZCE', 'EB9999.XDCE', 'EG9999.XDCE', 'ER9999.XZCE', 'GN9999.XZCE', 'LH9999.XDCE',
  127. 'LU9999.XINE', 'ME9999.XZCE', 'NR9999.XINE', 'PF9999.XZCE', 'PG9999.XDCE', 'PK9999.XZCE', 'RO9999.XZCE',
  128. 'RR9999.XDCE', 'SA9999.XZCE', 'SC9999.XINE', 'SP9999.XSGE', 'SS9999.XSGE', 'T9999.CCFX', 'TC9999.XZCE',
  129. 'TF9999.CCFX', 'TS9999.CCFX', 'UR9999.XZCE', 'WS9999.XZCE', 'WT9999.XZCE'] # 这部分是没有模型的标的
  130. work_future_list = [i for i in all_future_list if not i in except_futures] # 获取有模型的代码
  131. period = 130 # 设定需要的数据周期
  132. # 使用新的函数处理数据
  133. the_start_day, latest_day = get_the_period_of_year_range(2023, 2023)
  134. # df = get_predict_input(work_future_list, the_start_day, latest_day, period)
  135. df = get_predict_input(all_future_list, the_start_day, latest_day, period)
  136. processed_data = process_data(df, 4)
  137. # 打印处理后的数据
  138. print(f'Processed Data:\n{processed_data.head()}')
  139. # 绘制图表
  140. plot_data(processed_data, ['Average Change_5', 'Average Change_10'])
  141. # 针对当前年的数据
  142. all_future_df, all_future_list = get_all_key_info('futures')
  143. except_futures = ['BC9999.XINE', 'CJ9999.XZCE', 'EB9999.XDCE', 'EG9999.XDCE', 'ER9999.XZCE', 'GN9999.XZCE', 'LH9999.XDCE',
  144. 'LU9999.XINE', 'ME9999.XZCE', 'NR9999.XINE', 'PF9999.XZCE', 'PG9999.XDCE', 'PK9999.XZCE', 'RO9999.XZCE',
  145. 'RR9999.XDCE', 'SA9999.XZCE', 'SC9999.XINE', 'SP9999.XSGE', 'SS9999.XSGE', 'T9999.CCFX', 'TC9999.XZCE',
  146. 'TF9999.CCFX', 'TS9999.CCFX', 'UR9999.XZCE', 'WS9999.XZCE', 'WT9999.XZCE'] # 这部分是没有模型的标的
  147. work_future_list = [i for i in all_future_list if not i in except_futures] # 获取有模型的代码
  148. period = 130 # 设定需要的数据周期
  149. the_start_day, latest_day = get_the_period_of_this_year(period) # 获取预测用的开始和结束时间
  150. print(f'开始时间:{the_start_day}, 结束时间:{latest_day}')
  151. df = get_predict_input(work_future_list, the_start_day, latest_day, period)
  152. print(f'初始数据结构为:{df.info()}')
  153. # 将"index"列设置为索引
  154. # df['change'] = df.groupby('Code')['close'].apply(lambda x: x - x.shift(1)) # 变化绝对值
  155. df['change'] = df.groupby('Code')['close'].apply(lambda x: x.pct_change()) # 变化百分比
  156. df = df.dropna(subset=['change'])
  157. # 每天change的平均值
  158. average_change_per_day = df.groupby('index')['change'].mean()
  159. # 每天change为正和负的数量
  160. df['positive_change'] = df['change'].apply(lambda x: 1 if x > 0 else 0)
  161. df['negative_change'] = df['change'].apply(lambda x: 1 if x < 0 else 0)
  162. positive_change_count_per_day = df.groupby('index')['positive_change'].sum()
  163. negative_change_count_per_day = df.groupby('index')['negative_change'].sum()
  164. # 正和负的各自平均值
  165. positive_changes_df = df[df['change'] > 0]
  166. negative_changes_df = df[df['change'] < 0]
  167. average_positive_change_per_day = positive_changes_df.groupby('index')['change'].mean()
  168. average_negative_change_per_day = negative_changes_df.groupby('index')['change'].mean()
  169. data = pd.concat([
  170. average_change_per_day,
  171. positive_change_count_per_day,
  172. negative_change_count_per_day,
  173. average_positive_change_per_day,
  174. average_negative_change_per_day
  175. ], axis=1)
  176. data.columns = [
  177. 'Average Change',
  178. 'Positive Change Count',
  179. 'Negative Change Count',
  180. 'Average Positive Change',
  181. 'Average Negative Change'
  182. ]
  183. # 计算'Average Change'的5日滚动平均值
  184. data['Average Change_5'] = data['Average Change'].rolling(window=5).mean()
  185. # 计算'Average Change'的10日滚动平均值
  186. data['Average Change_10'] = data['Average Change'].rolling(window=10).mean()
  187. # 计算'Positive Change Count'的5日滚动平均值
  188. data['Positive Change Count_5'] = data['Positive Change Count'].rolling(window=5).mean()
  189. # 计算'Positive Change Count'的10日滚动平均值
  190. data['Positive Change Count_10'] = data['Positive Change Count'].rolling(window=10).mean()
  191. # 计算'Negative Change Count'的5日滚动平均值
  192. data['Negative Change Count_5'] = data['Negative Change Count'].rolling(window=5).mean()
  193. # 计算'Negative Change Count'的10日滚动平均值
  194. data['Negative Change Count_10'] = data['Negative Change Count'].rolling(window=10).mean()
  195. data = data.dropna()
  196. # 打印修改后的dataFrame
  197. print(f'最后完成的数据:{data.head()}')
  198. # 绘制第一张折线图
  199. plt.figure(figsize=(10, 6))
  200. # plt.plot(data.index, data['Average Change'], label='Average Change')
  201. plt.plot(data.index, data['Average Change_5'], label='Average Change_5')
  202. plt.plot(data.index, data['Average Change_10'], label='Average Change_10')
  203. # 绘制纵轴上值为0的红色线段
  204. plt.axhline(0, color='red', linestyle='--', label='Zero')
  205. plt.xlabel('Date')
  206. plt.ylabel('Value')
  207. plt.title('Average Change Comparison')
  208. plt.legend()
  209. plt.xticks(rotation=45)
  210. plt.grid(True)
  211. plt.show()