#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Futures margin crawler and updater.

Features:
1. Crawl futures margin data from a configured website.
2. Update the margin configuration inside strategy source files.
3. Back up both the data and the source file before anything is modified.
"""

import os
import re
import shutil
from abc import ABC, abstractmethod
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from urllib.parse import urljoin

import pandas as pd

# The scraping stack is only needed by HuaAnFuturesCrawler.  Guarding the
# import keeps the file-rewriting classes usable (and importable) on machines
# where requests / beautifulsoup4 are not installed; the crawler itself raises
# ImportError on construction in that case.
try:
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    requests = None
    BeautifulSoup = None


# Column names shared by every crawler, updater and CSV backup.
CODE_COLUMN = '合约代码'
RATE_COLUMN = '投机%'

# Contract codes are one or two ASCII letters.  A regex is used instead of
# str.isalpha() because isalpha() also accepts non-ASCII letters such as CJK
# characters found in table headers.
_CONTRACT_CODE_RE = re.compile(r'[A-Za-z]{1,2}')

# Start of the ``g.futures_config = {`` assignment in a strategy file.
_CONFIG_HEAD_RE = re.compile(r'g\.futures_config\s*=\s*\{')

# One contract entry inside the dict, e.g.
# 'AU': {'has_night_session': True, 'margin_rate': {'long': 0.14, 'short': 0.14}, ...}
_CONTRACT_ENTRY_RE = re.compile(
    r"'([A-Z]+)':\s*\{[^}]*'margin_rate':\s*\{'long':\s*([\d.]+),\s*'short':\s*([\d.]+)\}"
)

# Two rates closer than this are treated as equal.
_RATE_TOLERANCE = 0.0001


def _format_rate(rate: float) -> str:
    """Render a margin rate with at most 3 decimals and no trailing zeros."""
    return f"{round(rate, 3):.3f}".rstrip('0').rstrip('.')


# ============================================================================
# Abstract base classes
# ============================================================================

class WebCrawler(ABC):
    """Base class for web crawlers."""

    @abstractmethod
    def crawl(self) -> pd.DataFrame:
        """
        Crawl the data source.

        Returns:
            pd.DataFrame: frame with the columns ['合约代码', '投机%'].
        """
        pass


class CodeUpdater(ABC):
    """Base class for source-file updaters."""

    @abstractmethod
    def read_config(self, file_path: str) -> Dict:
        """
        Read the existing configuration from a source file.

        Args:
            file_path: path of the source file.

        Returns:
            dict: the parsed configuration.
        """
        pass

    @abstractmethod
    def update_config(self, file_path: str, margin_data: pd.DataFrame) -> List[str]:
        """
        Update the margin configuration.

        Args:
            file_path: path of the source file.
            margin_data: margin data frame.

        Returns:
            list: human-readable change records.
        """
        pass

    @abstractmethod
    def add_new_contracts(self, file_path: str, new_contracts: List[str],
                          margin_data: pd.DataFrame) -> List[str]:
        """
        Add configuration for new contracts (format-specific, hence abstract).

        Args:
            file_path: path of the source file.
            new_contracts: codes of the contracts to add.
            margin_data: margin data frame.

        Returns:
            list: human-readable records of what was added.
        """
        pass

    def backup_file(self, file_path: str) -> str:
        """
        Create a ``.bak`` copy next to the file.

        Args:
            file_path: path of the file to back up.

        Returns:
            str: path of the backup file.
        """
        backup_path = f"{file_path}.bak"
        shutil.copy2(file_path, backup_path)
        print(f"[备份] 已创建文件备份: {backup_path}")
        return backup_path


# ============================================================================
# Concrete implementation - crawler
# ============================================================================

class HuaAnFuturesCrawler(WebCrawler):
    """Crawler for the HuaAn Futures website."""

    def __init__(self, base_url: str, timeout: float = 30):
        """
        Initialise the crawler.

        Args:
            base_url: URL of the HuaAn Futures margin list page.
            timeout: per-request timeout in seconds.

        Raises:
            ImportError: if requests / beautifulsoup4 are not installed.
        """
        if requests is None or BeautifulSoup is None:
            raise ImportError(
                "HuaAnFuturesCrawler requires the 'requests' and "
                "'beautifulsoup4' packages"
            )
        self.base_url = base_url
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def crawl(self) -> pd.DataFrame:
        """
        Crawl the HuaAn Futures margin table.

        Returns:
            pd.DataFrame: frame with the columns ['合约代码', '投机%']; the
            columns are present even when no row could be parsed.

        Raises:
            ValueError: if the margin link or the data table cannot be found.
            requests.HTTPError: if either page answers with an error status.
        """
        print(f"[爬取] 开始访问华安期货网站: {self.base_url}")

        # 1. Load the list page and locate the "margin standard" link.
        list_page = self._fetch_soup(self.base_url)
        margin_link = self._find_margin_link(list_page)
        if not margin_link:
            raise ValueError("未找到保证金标准链接")

        # urljoin leaves absolute URLs untouched and resolves relative ones.
        margin_link = urljoin(self.base_url, margin_link)
        print(f"[爬取] 找到保证金标准链接: {margin_link}")

        # 2. Load the detail page and take its first <tbody>.
        detail_page = self._fetch_soup(margin_link)
        tbody = detail_page.find('tbody')
        if tbody is None:
            raise ValueError("未找到数据表格")

        print("[爬取] 开始解析保证金数据表格")

        # 3. Parse the rows.  Passing the columns explicitly keeps them in
        #    the frame even when the record list is empty.
        records = self._parse_rows(tbody)
        df = pd.DataFrame(records, columns=[CODE_COLUMN, RATE_COLUMN])
        print(f"[爬取] 成功爬取 {len(df)} 个合约的保证金数据")
        return df

    def _fetch_soup(self, url: str):
        """Download a page as UTF-8 and return it parsed by BeautifulSoup."""
        response = self.session.get(url, timeout=self.timeout)
        # Fail loudly instead of parsing an error page as if it were data.
        response.raise_for_status()
        response.encoding = 'utf-8'
        return BeautifulSoup(response.text, 'html.parser')

    @staticmethod
    def _find_margin_link(soup) -> Optional[str]:
        """
        Pick the href of the margin link.

        The first link whose text mentions the margin together with
        "标准" or "比例" wins; otherwise the last link that merely mentions
        the margin is used.  Anchors without an href are ignored.
        """
        fallback = None
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if not href:
                continue
            text = anchor.text.strip() if anchor.text else ''
            if '保证金' not in text:
                continue
            if '标准' in text or '比例' in text:
                return href
            fallback = href
        return fallback

    @staticmethod
    def _parse_rows(tbody) -> List[Dict]:
        """Turn the table rows into ``{'合约代码': ..., '投机%': ...}`` records."""
        records = []
        for row in tbody.find_all('tr'):
            cols = row.find_all('td')
            # At least 7 columns are needed: exchange, product, contract code,
            # client speculation, client hedge, exchange speculation,
            # exchange hedge.
            if len(cols) < 7:
                continue

            # Contract code is the 3rd column; keep only 1-2 ASCII letters.
            contract_code = cols[2].text.strip()
            if not _CONTRACT_CODE_RE.fullmatch(contract_code):
                continue
            # The strategy files use upper-case codes throughout.
            contract_code = contract_code.upper()

            # Client speculation percentage is the 4th column.
            speculation_text = cols[3].text.strip()
            try:
                speculation_rate = float(speculation_text.replace('%', ''))
            except ValueError:
                print(f"[警告] 无法解析合约 {contract_code} 的投机比例: {speculation_text}")
                continue

            records.append({
                CODE_COLUMN: contract_code,
                RATE_COLUMN: speculation_rate,
            })
        return records


# ============================================================================
# Concrete implementation - code updater
# ============================================================================

class FuturesConfigUpdater(CodeUpdater):
    """Updater for the ``g.futures_config`` dictionary of a strategy file."""

    @staticmethod
    def _locate_config_block(content: str) -> Optional[Tuple[int, int, int]]:
        """
        Locate the ``g.futures_config = {...}`` literal by brace matching.

        Braces inside single-line string literals and ``#`` comments are
        ignored, so the result does not depend on how the dictionary is
        indented.  Triple-quoted strings are not handled.

        Returns:
            ``(open_idx, close_idx, last_code_idx)`` where the first two are
            the offsets of the outer braces and ``last_code_idx`` is the
            offset of the last non-blank, non-comment character before the
            closing brace (``open_idx`` for an empty dict); ``None`` when the
            assignment or its closing brace is missing.
        """
        head = _CONFIG_HEAD_RE.search(content)
        if head is None:
            return None

        open_idx = head.end() - 1
        last_code_idx = open_idx
        depth = 0
        quote = ''
        pos = open_idx
        size = len(content)
        while pos < size:
            char = content[pos]
            if quote:
                if char == '\\':
                    # Skip the escaped character.
                    pos += 2
                    continue
                if char == quote:
                    quote = ''
                    last_code_idx = pos
            elif char == '#':
                # Jump to the end of the comment.
                newline = content.find('\n', pos)
                if newline == -1:
                    return None
                pos = newline
                continue
            elif char in '\'"':
                quote = char
                last_code_idx = pos
            elif char == '{':
                depth += 1
                last_code_idx = pos
            elif char == '}':
                depth -= 1
                if depth == 0:
                    return open_idx, pos, last_code_idx
                last_code_idx = pos
            elif not char.isspace():
                last_code_idx = pos
            pos += 1
        return None

    def read_config(self, file_path: str) -> Dict:
        """
        Read the ``g.futures_config`` margin rates from a source file.

        Args:
            file_path: path of the source file.

        Returns:
            dict: ``{contract_code: {'long': rate, 'short': rate}}``.

        Raises:
            ValueError: if the file has no ``g.futures_config`` dictionary.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        span = self._locate_config_block(content)
        if span is None:
            raise ValueError("未找到g.futures_config配置")
        open_idx, close_idx, _ = span
        config_block = content[open_idx + 1:close_idx]

        configs = {}
        for entry in _CONTRACT_ENTRY_RE.finditer(config_block):
            configs[entry.group(1)] = {
                'long': float(entry.group(2)),
                'short': float(entry.group(3)),
            }

        print(f"[读取] 从 {file_path} 读取到 {len(configs)} 个合约配置")
        return configs

    def update_config(self, file_path: str, margin_data: pd.DataFrame) -> List[str]:
        """
        Rewrite the margin rates of the contracts already present in the file.

        Rates are written with at most 3 decimals.  A contract counts as
        unchanged when its stored rates equal either the exact crawled rate
        or the 3-decimal value that would be written, so re-running the
        update on its own output reports no changes.

        Args:
            file_path: path of the source file.
            margin_data: frame with the columns ['合约代码', '投机%'].

        Returns:
            list: one record per contract found in the file.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        changes = []
        modified = False

        for _, row in margin_data.iterrows():
            contract_code = row[CODE_COLUMN]
            exact_rate = row[RATE_COLUMN] / 100
            new_rate_str = _format_rate(exact_rate)
            new_rate = float(new_rate_str)

            # Anchor on the quoted contract code so the right entry is hit.
            pattern = (
                r"('" + re.escape(contract_code)
                + r"':[^}]*'margin_rate':\s*\{'long':\s*)"
                + r"([\d.]+)(,\s*'short':\s*)([\d.]+)(\})"
            )
            match = re.search(pattern, content)
            if not match:
                continue

            old_long = float(match.group(2))
            old_short = float(match.group(4))

            def is_current(old_rate: float) -> bool:
                return (abs(old_rate - exact_rate) < _RATE_TOLERANCE
                        or abs(old_rate - new_rate) < _RATE_TOLERANCE)

            if is_current(old_long) and is_current(old_short):
                changes.append(f" {contract_code}: {round(new_rate, 3)} (不变)")
                continue

            replacement = (
                f"{match.group(1)}{new_rate_str}{match.group(3)}"
                f"{new_rate_str}{match.group(5)}"
            )
            # Splice by offset so exactly the matched entry is replaced.
            content = content[:match.start()] + replacement + content[match.end():]
            modified = True
            changes.append(f" {contract_code}: {round(old_long, 3)} -> {round(new_rate, 3)}")

        if modified:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(content)

        return changes

    def add_new_contracts(self, file_path: str, new_contracts: List[str],
                          margin_data: pd.DataFrame) -> List[str]:
        """
        Append entries for new contracts to ``g.futures_config``.

        New entries are inserted just before the closing brace, indented like
        the existing entries, using a default template whose ``multiplier``
        has to be adjusted by hand afterwards.

        Args:
            file_path: path of the source file.
            new_contracts: codes of the contracts to add.
            margin_data: frame with the columns ['合约代码', '投机%'].

        Returns:
            list: one record per contract that was added.

        Raises:
            ValueError: if the file has no ``g.futures_config`` dictionary.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        span = self._locate_config_block(content)
        if span is None:
            raise ValueError("未找到g.futures_config配置块")
        open_idx, close_idx, last_code_idx = span

        # Whitespace/comments between the last code character and the brace.
        gap = content[last_code_idx + 1:close_idx]
        if '\n' in gap:
            trailing, closing_indent = gap.rsplit('\n', 1)
        else:
            # Brace sits on the same line as the last entry: move it to its
            # own line, aligned with the assignment.
            line_start = content.rfind('\n', 0, open_idx) + 1
            closing_indent = re.match(r'[ \t]*', content[line_start:]).group(0)
            trailing = gap.rstrip()

        # Reuse the indentation of the first existing entry when there is one.
        first_entry = re.search(r"\n([ \t]*)['\"]", content[open_idx + 1:close_idx])
        entry_indent = first_entry.group(1) if first_entry else closing_indent + '    '

        additions = []
        new_lines = []
        for contract_code in new_contracts:
            rate_row = margin_data[margin_data[CODE_COLUMN] == contract_code]
            if rate_row.empty:
                continue

            rate = rate_row.iloc[0][RATE_COLUMN] / 100
            rate_str = _format_rate(rate)

            # Default template; multiplier must be reviewed manually.
            new_lines.append(
                f"{entry_indent}'{contract_code}': {{'has_night_session': True, 'margin_rate': {{'long': {rate_str}, 'short': {rate_str}}}, 'multiplier': 1, 'trading_start_time': '21:00'}},"
            )
            additions.append(f" 新增 {contract_code}: 保证金率={round(rate, 3)}, multiplier=1 (需手动调整)")

        if new_lines:
            # A separator is needed only after a real entry that lacks one;
            # it goes right after the code, before any trailing comment.
            is_empty_dict = last_code_idx == open_idx
            has_comma = content[last_code_idx] == ','
            separator = '' if is_empty_dict or has_comma else ','

            content = (
                content[:last_code_idx + 1] + separator + trailing + '\n'
                + '\n'.join(new_lines) + '\n'
                + closing_indent + content[close_idx:]
            )
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(content)

        return additions


# ============================================================================
# Manager
# ============================================================================

class MarginCrawlerManager:
    """Orchestrates crawling, backing up and updating."""

    def __init__(self, workspace_root: str = '/Users/maxfeng/Documents/GitHub/jukuan'):
        """
        Initialise the manager.

        Args:
            workspace_root: root directory of the workspace.
        """
        self.workspace_root = workspace_root
        self.backup_dir = os.path.join(workspace_root, 'data/future_margin')

        # Make sure the backup directory exists.
        os.makedirs(self.backup_dir, exist_ok=True)

        # Crawler registry.
        self.crawler_configs = {
            'hua_future': {
                'name': '华安期货',
                'url': 'https://www.haqh.com/index.php?m=content&c=index&a=lists&catid=167',
                'crawler_class': HuaAnFuturesCrawler
            }
        }

        # Updater registry.
        self.updater_configs = {
            'MAPatternStrategy_v001': {
                'name': 'MA形态策略v001',
                'file': os.path.join(workspace_root, 'Lib/future/MAPatternStrategy_v001.py'),
                'updater_class': FuturesConfigUpdater
            },
            'MAPatternStrategy_v002': {
                'name': 'MA形态策略v002',
                'file': os.path.join(workspace_root, 'Lib/future/MAPatternStrategy_v002.py'),
                'updater_class': FuturesConfigUpdater
            }
        }

    @staticmethod
    def _print_step(title: str) -> None:
        """Print a step title framed by two separator lines."""
        print(f"\n{'=' * 80}")
        print(title)
        print(f"{'=' * 80}")

    def run(self, crawler_key: str, updater_key: str):
        """
        Run the complete crawl-and-update workflow.

        Args:
            crawler_key: key into ``crawler_configs``.
            updater_key: key into ``updater_configs``.

        Raises:
            ValueError: if either key is unknown.
        """
        print("=" * 80)
        print("期货保证金爬取和更新工具")
        print("=" * 80)

        # 1. Resolve the configuration.
        if crawler_key not in self.crawler_configs:
            raise ValueError(f"未找到爬虫配置: {crawler_key}")
        if updater_key not in self.updater_configs:
            raise ValueError(f"未找到更新器配置: {updater_key}")

        crawler_config = self.crawler_configs[crawler_key]
        updater_config = self.updater_configs[updater_key]

        print(f"\n[配置] 数据源: {crawler_config['name']}")
        print(f"[配置] 目标文件: {updater_config['name']}")

        # 2. Crawl.
        self._print_step("步骤1: 爬取保证金数据")
        crawler = crawler_config['crawler_class'](crawler_config['url'])
        margin_data = crawler.crawl()

        # Sanity check against known values (informational only).
        self._validate_test_data(margin_data)

        # 3. Read the existing configuration.
        self._print_step("步骤2: 读取现有配置")
        updater = updater_config['updater_class']()
        file_path = updater_config['file']
        existing_config = updater.read_config(file_path)

        # 4. Compare contract codes.
        self._print_step("步骤3: 比对合约代码")
        existing_contracts = set(existing_config.keys())
        crawled_contracts = set(margin_data[CODE_COLUMN].tolist())

        missing_in_crawl = existing_contracts - crawled_contracts
        new_in_crawl = crawled_contracts - existing_contracts

        if missing_in_crawl:
            print("[警告] 以下合约在代码文件中存在,但爬取结果中没有:")
            for contract in sorted(missing_in_crawl):
                print(f" - {contract}")

        if new_in_crawl:
            print("[发现] 以下合约在爬取结果中存在,但代码文件中没有:")
            for contract in sorted(new_in_crawl):
                rate = margin_data[margin_data[CODE_COLUMN] == contract].iloc[0][RATE_COLUMN]
                print(f" - {contract} (保证金率: {rate}%)")

        if not missing_in_crawl and not new_in_crawl:
            print("[信息] 合约代码完全一致")

        # 5. Back up the data.
        self._print_step("步骤4: 备份数据")
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        origin_data = [
            {CODE_COLUMN: contract, 'long': rates['long'], 'short': rates['short']}
            for contract, rates in existing_config.items()
        ]
        origin_df = pd.DataFrame(origin_data)
        origin_file = os.path.join(self.backup_dir, f"origin_{updater_key}_{timestamp}.csv")
        origin_df.to_csv(origin_file, index=False, encoding='utf-8')
        print(f"[备份] 原始配置已保存: {origin_file}")

        update_file = os.path.join(self.backup_dir, f"update_{crawler_key}_{timestamp}.csv")
        margin_data.to_csv(update_file, index=False, encoding='utf-8')
        print(f"[备份] 爬取数据已保存: {update_file}")

        # 6. Back up the source file.
        self._print_step("步骤5: 备份代码文件")
        updater.backup_file(file_path)

        # 7. Update the existing entries.
        self._print_step("步骤6: 更新保证金配置")
        changes = updater.update_config(file_path, margin_data)

        print("[更新] 保证金配置变更记录:")
        for change in changes:
            print(change)

        # 8. Add new contracts, sorted so the inserted order is deterministic.
        if new_in_crawl:
            self._print_step("步骤7: 新增合约配置")
            additions = updater.add_new_contracts(file_path, sorted(new_in_crawl), margin_data)

            print("[新增] 合约配置新增记录:")
            for addition in additions:
                print(addition)

        self._print_step("完成!")

    def _validate_test_data(self, margin_data: pd.DataFrame) -> bool:
        """
        Compare a few crawled rates with hard-coded reference values.

        The outcome is only printed; ``run`` continues either way.

        Args:
            margin_data: crawled margin data.

        Returns:
            bool: True when every reference contract has the expected rate.
        """
        print("\n[验证] 检查测试数据...")

        test_cases = [
            ('A', 16),
            ('CJ', 14)
        ]

        all_passed = True
        for contract_code, expected_rate in test_cases:
            row = margin_data[margin_data[CODE_COLUMN] == contract_code]
            if row.empty:
                print(f" ✗ {contract_code}: 未找到数据")
                all_passed = False
                continue

            actual_rate = row.iloc[0][RATE_COLUMN]
            if abs(actual_rate - expected_rate) < 0.01:
                print(f" ✓ {contract_code}: {actual_rate}% (预期: {expected_rate}%)")
            else:
                print(f" ✗ {contract_code}: {actual_rate}% (预期: {expected_rate}%)")
                all_passed = False

        if all_passed:
            print("[验证] 测试数据验证通过!")
        else:
            print("[验证] 测试数据验证失败!")
        return all_passed


# ============================================================================
# Entry point
# ============================================================================

def main():
    """Crawl the margins and update the default strategy file."""
    manager = MarginCrawlerManager()

    manager.run(
        crawler_key='hua_future',
        updater_key='MAPatternStrategy_v001'
    )


if __name__ == '__main__':
    main()