# max / JUKUAN
							import pandas as pd
import numpy as np
from jqdata import *
import datetime
# from datetime import timedelta, datetime
import matplotlib.pyplot as plt

def get_all_key_info(the_type):
    """Fetch the main-contract ('主力合约') securities of one type.

    Args:
        the_type: Security type, passed to ``get_all_securities`` as a
            one-element ``types`` list (the script uses ``'futures'``).

    Returns:
        A ``(the_main, all_future_list)`` tuple. ``the_main`` holds the rows
        whose ``display_name`` ends with '主力合约', with the former index
        exposed as a ``code`` column. ``all_future_list`` is the array of
        unique codes when ``the_type == 'futures'`` and ``None`` otherwise.
    """
    main = get_all_securities(types=[the_type]).reset_index()
    is_main_contract = main.display_name.str.endswith('主力合约')
    # Rename onto a new frame rather than in place: an in-place rename of a
    # filtered selection triggers pandas' SettingWithCopyWarning.
    the_main = main[is_main_contract].rename(columns={'index': 'code'})

    # Only the futures type exposes a code list to the caller.
    all_future_list = the_main['code'].unique() if the_type == 'futures' else None
    return the_main, all_future_list

    
def get_the_period_of_year_range(start_year, end_year):
    """Return the first and last trade day inside a range of calendar years.

    The range runs from 1 January of ``start_year`` to 31 December of
    ``end_year``, but never past the current moment, so a range that reaches
    into the current or a future year ends at "now".

    Args:
        start_year: First calendar year of the range.
        end_year: Last calendar year of the range.

    Returns:
        ``(actual_start_date, actual_end_date)``: the first and last entries
        of the calendar returned by ``get_trade_days`` for the range.

    Raises:
        ValueError: If the range is empty (it starts after it ends, or starts
            in the future) or contains no trade day.
    """
    now = datetime.datetime.now()
    start_date = datetime.datetime(start_year, 1, 1)
    # Clamp to the present whenever the nominal end lies in the future, not
    # only when end_year happens to be the current year.
    end_date = min(datetime.datetime(end_year, 12, 31), now)

    if start_date > end_date:
        raise ValueError(
            f'empty date range: {start_date:%Y-%m-%d} is after {end_date:%Y-%m-%d}'
        )

    # Fetch the trading calendar for the range.
    trade_days = get_trade_days(start_date=start_date, end_date=end_date)
    # len() rather than truthiness: the calendar may be an array, whose
    # truth value is ambiguous.
    if len(trade_days) == 0:
        raise ValueError(
            f'no trade days between {start_date:%Y-%m-%d} and {end_date:%Y-%m-%d}'
        )

    # The actual bounds are the first and last trade day found.
    actual_start_date = trade_days[0]
    actual_end_date = trade_days[-1]
    print(f'start_date: {actual_start_date}, end_date: {actual_end_date}')

    return actual_start_date, actual_end_date


def get_one_future(future, unit, start_date, end_date, export=False):
    """Fetch the price history of one contract, optionally saving it as CSV.

    Args:
        future: Security code passed to ``get_price``.
        unit: Bar frequency. Only ``'daily'`` is supported.
        start_date: Start of the requested range.
        end_date: End of the requested range. It must provide ``strftime``
            when ``export`` is true, because it is part of the file name.
        export: When true, write the data to
            ``<code prefix>_<unit>_<end date>.csv`` in the working directory.

    Returns:
        The frame returned by ``get_price`` for the fields close, open, low,
        high, volume and money, with rows containing NaN removed.

    Raises:
        ValueError: If ``unit`` is not ``'daily'``.
    """
    # Reject unsupported frequencies up front; previously they fell through
    # to the return statement with no data bound and crashed there.
    if unit != 'daily':
        raise ValueError(f"unsupported unit {unit!r}; only 'daily' is supported")

    data = get_price(future, start_date=start_date, end_date=end_date, frequency=unit,
                     fields=['close', 'open', 'low', 'high', 'volume', 'money'],
                     skip_paused=False, panel=False).dropna()
    if export:
        filename = f"{future.split('.')[0]}_{unit}_{end_date.strftime('%Y-%m-%d')}.csv"
        data.to_csv(filename)
    return data

def get_predict_input(key_list, start_day, latest_day, period, export=False):
    """Collect daily data for several contracts into one (Code, Date) frame.

    Args:
        key_list: Iterable of security codes to fetch.
        start_day: Start of the requested range.
        latest_day: End of the requested range. It must provide ``strftime``
            when ``export`` is true.
        period: Required history length; a contract is kept only when it has
            at least ``period + 1`` rows.
        export: When true, write the result to
            ``Input_<period>_<latest_day>.csv``.

    Returns:
        The concatenated daily data of all kept contracts, indexed by
        ``Code`` and ``Date``.
    """
    min_rows = period + 1
    frames = []

    for code in key_list:
        history = get_one_future(code, 'daily', start_day, latest_day, False)
        if len(history) < min_rows:
            # Too little history: report the contract and leave it out.
            print(f'The future {code} has only {len(history)} data points')
            continue
        history['Code'] = code
        frames.append(history)

    # Stack the per-contract frames, expose the former index as 'Date' and
    # index the result by (Code, Date).
    data = (
        pd.concat(frames)
        .reset_index()
        .rename(columns={'index': 'Date'})
        .set_index(['Code', 'Date'])
    )

    if export:
        data.to_csv(f'Input_{period}_{latest_day.strftime("%Y-%m-%d")}.csv')

    return data


def get_train_input(work_future_list, all_future_df, latest_day, limit, export=False):
    """Collect the daily history of several contracts for training.

    Each contract is fetched from its own ``start_date`` (looked up in
    ``all_future_df``) up to ``latest_day``.

    Args:
        work_future_list: Iterable of security codes to fetch.
        all_future_df: Frame with ``code`` and ``start_date`` columns; the
            first matching row supplies the start date of each contract.
        latest_day: End of the requested range. It must provide ``strftime``
            when ``export`` is true.
        limit: Minimum number of rows a contract needs in order to be kept.
        export: When true, write the result to ``TrainData_<latest_day>.csv``.

    Returns:
        The concatenated daily data of all kept contracts, indexed by
        ``Code`` and ``Date``.
    """
    frames = []

    for code in work_future_list:
        listed_since = all_future_df.loc[all_future_df.code == code, 'start_date'].values[0]
        history = get_one_future(code, 'daily', listed_since, latest_day, False)
        if len(history) < limit:
            # Too little history: report the contract and leave it out.
            print(f'The future {code} has only {len(history)} data points')
            continue
        history['Code'] = code
        frames.append(history)

    # Stack the per-contract frames, expose the former index as 'Date' and
    # index the result by (Code, Date).
    data = (
        pd.concat(frames)
        .reset_index()
        .rename(columns={'index': 'Date'})
        .set_index(['Code', 'Date'])
    )

    if export:
        data.to_csv(f'TrainData_{latest_day.strftime("%Y-%m-%d")}.csv')

    return data

def process_data(df, std_dev_multiplier=3):
    """Summarise daily cross-contract price changes and drop outlier days.

    Args:
        df: Price data with ``open`` and ``close`` columns, in which ``Code``
            and ``Date`` can be used as ``groupby`` keys (index levels or
            columns). The frame is left unmodified.
            NOTE(review): the change columns follow row order within each
            Code, so rows are presumably expected in chronological order.
        std_dev_multiplier: Half-width, in standard deviations, of the band
            around the mean that a rolling average must stay strictly inside
            for its day to be kept.

    Returns:
        A frame with one row per ``Date`` and the columns ``Average Change``,
        ``Total Count``, ``positive_change_count``, ``negative_change_count``,
        ``Average Change_5`` and ``Average Change_10``. Days without a full
        10-day window and days filtered out as outliers are absent.
    """
    # Work on a copy so the caller's frame is neither given extra columns
    # nor stripped of rows as a side effect.
    df = df.copy()

    # Percentage change of the close within each contract.
    df['change'] = df.groupby('Code')['close'].pct_change()

    # Absolute gap between the open and the previous close of the same
    # contract. A grouped shift keeps the result aligned with df's own index
    # regardless of how groupby.apply labels its output.
    previous_close = df.groupby('Code')['close'].shift(1)
    df['openChange'] = (df['open'] - previous_close).abs()

    # Drop the rows that have no previous row to compare with. openChange is
    # not used in the summary; it only takes part in this filter.
    df.dropna(subset=['change', 'openChange'], inplace=True)

    # Per-day mean change, row count, and counts of rising/falling contracts.
    summary = df.groupby('Date')['change'].agg(
        ['mean', 'count', positive_change_count, negative_change_count]
    )
    summary = summary.rename(columns={'mean': 'Average Change', 'count': 'Total Count'})

    # Rolling means of the daily average change.
    for col in ['Average Change']:
        for window in [5, 10]:
            summary[f'{col}_{window}'] = summary[col].rolling(window=window).mean()

    # Remove the leading days whose rolling windows are incomplete.
    summary.dropna(inplace=True)

    # Filter outliers one column at a time; the second pass works on the
    # rows that survived the first.
    for col in ['Average Change_5', 'Average Change_10']:
        mean = summary[col].mean()
        threshold = std_dev_multiplier * summary[col].std()
        within_band = (summary[col] > mean - threshold) & (summary[col] < mean + threshold)

        # Report exactly the rows that are about to be removed.
        outliers = summary[~within_band]
        if not outliers.empty:
            print(f"异常数据 in '{col}':\n{outliers}")

        summary = summary[within_band]

    return summary


# Helper functions that count positive and negative changes.
def positive_change_count(series):
    """Return how many values in ``series`` are strictly greater than zero."""
    is_positive = series > 0
    return is_positive.sum()

def negative_change_count(series):
    """Return how many values in ``series`` are strictly less than zero."""
    is_negative = series < 0
    return is_negative.sum()

def plot_data(data, columns):
    """Plot the selected columns of ``data`` as lines against its index.

    Args:
        data: Frame whose index supplies the x axis.
        columns: Names of the columns to draw, one line per column.
    """
    plt.figure(figsize=(10, 6))
    for name in columns:
        plt.plot(data.index, data[name], label=name)
    # Dashed red reference line at zero.
    plt.axhline(0, color='red', linestyle='--', label='Zero')

    plt.title('Trends Over Time')
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.legend()
    plt.show()


# Set up the contract lists and the required history length first: the fetch
# below needs ``all_future_list`` and ``period`` to exist before it runs.
all_future_df, all_future_list = get_all_key_info('futures')
except_futures = ['BC9999.XINE', 'CJ9999.XZCE', 'EB9999.XDCE', 'EG9999.XDCE', 'ER9999.XZCE', 'GN9999.XZCE', 'LH9999.XDCE',
                  'LU9999.XINE', 'ME9999.XZCE', 'NR9999.XINE', 'PF9999.XZCE', 'PG9999.XDCE', 'PK9999.XZCE', 'RO9999.XZCE',
                  'RR9999.XDCE', 'SA9999.XZCE', 'SC9999.XINE', 'SP9999.XSGE', 'SS9999.XSGE', 'T9999.CCFX', 'TC9999.XZCE',
                  'TF9999.CCFX', 'TS9999.CCFX', 'UR9999.XZCE', 'WS9999.XZCE', 'WT9999.XZCE']  # contracts that have no model
work_future_list = [i for i in all_future_list if i not in except_futures]  # codes that do have a model
period = 130  # required data period; get_predict_input keeps contracts with at least period + 1 rows

# Process the data with the new helper functions.
the_start_day, latest_day = get_the_period_of_year_range(2023, 2023)

# All contracts are used here; pass work_future_list instead to restrict the
# run to the contracts that have a model.
df = get_predict_input(all_future_list, the_start_day, latest_day, period)

# Dump the raw input with Code/Date as ordinary columns. df itself keeps its
# (Code, Date) index for process_data, so one fetch serves both steps.
df.reset_index().to_csv('test.csv', index=False)

processed_data = process_data(df, 4)

# Print the processed data.
print(f'Processed Data:\n{processed_data.head()}')

# Draw the chart.
plot_data(processed_data, ['Average Change_5', 'Average Change_10'])

# Data for the current year.
all_future_df, all_future_list = get_all_key_info('futures')
except_futures = ['BC9999.XINE', 'CJ9999.XZCE', 'EB9999.XDCE', 'EG9999.XDCE', 'ER9999.XZCE', 'GN9999.XZCE', 'LH9999.XDCE',
                  'LU9999.XINE', 'ME9999.XZCE', 'NR9999.XINE', 'PF9999.XZCE', 'PG9999.XDCE', 'PK9999.XZCE', 'RO9999.XZCE',
                  'RR9999.XDCE', 'SA9999.XZCE', 'SC9999.XINE', 'SP9999.XSGE', 'SS9999.XSGE', 'T9999.CCFX', 'TC9999.XZCE',
                  'TF9999.CCFX', 'TS9999.CCFX', 'UR9999.XZCE', 'WS9999.XZCE', 'WT9999.XZCE']  # contracts that have no model
work_future_list = [i for i in all_future_list if i not in except_futures]  # codes that do have a model
period = 130  # required data period
# NOTE(review): get_the_period_of_this_year is not defined in the visible
# part of this file -- confirm it is provided elsewhere before running.
the_start_day, latest_day = get_the_period_of_this_year(period)  # start and end dates used for prediction
print(f'开始时间：{the_start_day}, 结束时间：{latest_day}')
df = get_predict_input(work_future_list, the_start_day, latest_day, period)
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own instead of being interpolated into the message.
print('初始数据结构为：')
df.info()

# Percentage change of the close within each contract (use .diff() instead
# for the absolute change). The grouped pct_change keeps df's own index.
df['change'] = df.groupby('Code')['close'].pct_change()
# Copy after filtering so the column assignments below write to an
# independent frame.
df = df.dropna(subset=['change']).copy()

# get_predict_input indexes its result by (Code, Date), so every per-day
# statistic groups on 'Date'.

# Mean change per day.
average_change_per_day = df.groupby('Date')['change'].mean()

# Number of positive and negative changes per day.
df['positive_change'] = (df['change'] > 0).astype(int)
df['negative_change'] = (df['change'] < 0).astype(int)

positive_change_count_per_day = df.groupby('Date')['positive_change'].sum()
negative_change_count_per_day = df.groupby('Date')['negative_change'].sum()

# Mean of the positive and of the negative changes per day.
positive_changes_df = df[df['change'] > 0]
negative_changes_df = df[df['change'] < 0]

average_positive_change_per_day = positive_changes_df.groupby('Date')['change'].mean()
average_negative_change_per_day = negative_changes_df.groupby('Date')['change'].mean()

data = pd.concat([
    average_change_per_day,
    positive_change_count_per_day,
    negative_change_count_per_day,
    average_positive_change_per_day,
    average_negative_change_per_day,
], axis=1)

data.columns = [
    'Average Change',
    'Positive Change Count',
    'Negative Change Count',
    'Average Positive Change',
    'Average Negative Change',
]

# 5-day and 10-day rolling means of the daily average and of both counts.
for column in ('Average Change', 'Positive Change Count', 'Negative Change Count'):
    for window in (5, 10):
        data[f'{column}_{window}'] = data[column].rolling(window=window).mean()

# Drops the days without a full rolling window, and any day that has no
# positive or no negative change (its per-sign mean is NaN).
data = data.dropna()

# Print the finished frame.
print(f'最后完成的数据：{data.head()}')

# Line chart of the two rolling averages.
plt.figure(figsize=(10, 6))
plt.plot(data.index, data['Average Change_5'], label='Average Change_5')
plt.plot(data.index, data['Average Change_10'], label='Average Change_10')

# Dashed red reference line at zero on the vertical axis.
plt.axhline(0, color='red', linestyle='--', label='Zero')
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Average Change Comparison')
plt.legend()
plt.xticks(rotation=45)
plt.grid(True)
plt.show()