| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
- 期货保证金爬取和更新工具
- 功能:
- 1. 从指定网站爬取期货保证金数据
- 2. 更新策略代码文件中的保证金配置
- 3. 支持数据备份和文件备份
- """
- import os
- import re
- import shutil
- from abc import ABC, abstractmethod
- from datetime import datetime
- from typing import Dict, List, Tuple
- import pandas as pd
- import requests
- from bs4 import BeautifulSoup
- # ============================================================================
- # 抽象基类
- # ============================================================================
class WebCrawler(ABC):
    """Abstract interface that every margin-data web crawler implements."""

    @abstractmethod
    def crawl(self) -> pd.DataFrame:
        """Fetch the margin data from the remote source.

        Returns:
            pd.DataFrame: A frame with the columns ['合约代码', '投机%']
            (contract code, speculation margin in percent).
        """
        return None
class CodeUpdater(ABC):
    """Abstract interface for objects that rewrite margin settings in a code file.

    Subclasses supply the format-specific reading, updating and inserting;
    the file-backup helper is shared.
    """

    @abstractmethod
    def read_config(self, file_path: str) -> Dict:
        """Read the configuration currently stored in a code file.

        Args:
            file_path: Path of the code file.

        Returns:
            dict: The configuration found in the file.
        """
        return None

    @abstractmethod
    def update_config(self, file_path: str, margin_data: pd.DataFrame) -> List[str]:
        """Write new margin rates into a code file.

        Args:
            file_path: Path of the code file.
            margin_data: DataFrame holding the margin data.

        Returns:
            list: One human-readable record per change.
        """
        return None

    @abstractmethod
    def add_new_contracts(self, file_path: str, new_contracts: List[str],
                          margin_data: pd.DataFrame) -> List[str]:
        """Insert configuration for contracts the file does not contain yet.

        How an entry is written depends on the file format, so every
        subclass provides its own implementation.

        Args:
            file_path: Path of the code file.
            new_contracts: Codes of the contracts to add.
            margin_data: DataFrame holding the margin data.

        Returns:
            list: One human-readable record per added contract.
        """
        return None

    def backup_file(self, file_path: str) -> str:
        """Copy a file next to itself with a ``.bak`` suffix.

        An existing backup at the same path is overwritten.

        Args:
            file_path: Path of the file to back up.

        Returns:
            str: Path of the backup copy.
        """
        destination = "{}.bak".format(file_path)
        # copy2 also carries the file metadata over to the copy.
        shutil.copy2(file_path, destination)
        print(f"[备份] 已创建文件备份: {destination}")
        return destination
- # ============================================================================
- # 具体实现类 - 爬虫
- # ============================================================================
class HuaAnFuturesCrawler(WebCrawler):
    """Crawler for the margin table published on the HuaAn Futures website.

    The crawl is a two-step walk: the listing page is searched for the link
    to the margin-standard article, then the first ``<tbody>`` of that
    article is parsed into (contract code, speculation margin %) rows.
    """

    # Only bare product codes of one or two ASCII letters are kept.
    # ``str.isalpha`` alone is not enough: it is also true for CJK text, so a
    # two-character Chinese header cell would slip through the filter.
    _CONTRACT_CODE_RE = re.compile(r'[A-Za-z]{1,2}')

    # Columns of the DataFrame returned by crawl(); passed explicitly so that
    # an empty result still has them.
    _COLUMNS = ['合约代码', '投机%']

    def __init__(self, base_url: str):
        """Create the crawler and its HTTP session.

        Args:
            base_url: URL of the HuaAn Futures margin listing page.
        """
        self.base_url = base_url
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def crawl(self) -> pd.DataFrame:
        """Download and parse the HuaAn Futures margin table.

        Returns:
            pd.DataFrame: Columns ['合约代码', '投机%']; contract codes are
            upper-cased and the margin is the percentage as a float.  The
            columns are present even when no row could be parsed.

        Raises:
            requests.HTTPError: If either page answers with an error status.
            ValueError: If the margin link or the data table cannot be found.
        """
        print(f"[爬取] 开始访问华安期货网站: {self.base_url}")

        listing = self._fetch_soup(self.base_url)
        margin_link = self._find_margin_link(listing)
        print(f"[爬取] 找到保证金标准链接: {margin_link}")

        detail = self._fetch_soup(margin_link)
        # Only the first <tbody> of the article is parsed.
        tbody = detail.find('tbody')
        if tbody is None:
            raise ValueError("未找到数据表格")

        print("[爬取] 开始解析保证金数据表格")
        records = self._parse_rows(tbody.find_all('tr'))

        df = pd.DataFrame(records, columns=self._COLUMNS)
        print(f"[爬取] 成功爬取 {len(df)} 个合约的保证金数据")
        return df

    def _fetch_soup(self, url: str):
        """Return the parsed HTML of *url*, failing loudly on HTTP errors."""
        response = self.session.get(url, timeout=30)
        # Without this an error page would be parsed as if it were the data.
        response.raise_for_status()
        response.encoding = 'utf-8'
        return BeautifulSoup(response.text, 'html.parser')

    def _find_margin_link(self, soup) -> str:
        """Return the absolute URL of the margin-standard article.

        The first anchor whose text contains '保证金' together with '标准' or
        '比例' wins; failing that, the last anchor mentioning '保证金' is used.
        """
        from urllib.parse import urljoin

        fallback = None
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                # An anchor without a target can never be followed.
                continue
            link_text = link.text.strip() if link.text else ''
            if '保证金' not in link_text:
                continue
            fallback = href
            if '标准' in link_text or '比例' in link_text:
                break

        if not fallback:
            raise ValueError("未找到保证金标准链接")

        # urljoin resolves relative paths and leaves absolute URLs untouched.
        return urljoin(self.base_url, fallback)

    def _parse_rows(self, rows) -> List[Dict]:
        """Turn table rows into ``{'合约代码': ..., '投机%': ...}`` records."""
        records = []
        for row in rows:
            cols = row.find_all('td')
            # A data row has at least 7 cells: exchange, product, contract
            # code, client speculation, client hedge, exchange speculation,
            # exchange hedge.
            if len(cols) < 7:
                continue

            # Contract code sits in the third cell.
            contract_code = cols[2].text.strip()
            if not self._CONTRACT_CODE_RE.fullmatch(contract_code):
                continue
            # The strategy files use upper-case codes throughout.
            contract_code = contract_code.upper()

            # Client speculation margin sits in the fourth cell.
            speculation_text = cols[3].text.strip()
            try:
                speculation_rate = float(
                    speculation_text.replace('%', '').replace('%', ''))
            except ValueError:
                print(f"[警告] 无法解析合约 {contract_code} 的投机比例: {speculation_text}")
                continue

            records.append({
                '合约代码': contract_code,
                '投机%': speculation_rate
            })
        return records
- # ============================================================================
- # 具体实现类 - 代码更新器
- # ============================================================================
class FuturesConfigUpdater(CodeUpdater):
    """Updater for the ``g.futures_config`` dict literal of a strategy file.

    The file is edited as text with regular expressions; it is never
    imported or executed.
    """

    # The whole ``g.futures_config = { ... }`` literal.  The closing brace is
    # recognised as the first ``}`` that stands at the start of a line with
    # the same indentation as the assignment itself, so the match does not
    # depend on how deeply the statement is indented in the target file.
    _CONFIG_BLOCK_RE = re.compile(
        r'^(?P<indent>[ \t]*)(?P<head>g\.futures_config\s*=\s*\{)'
        r'(?P<body>.*?)(?P<tail>\n(?P=indent)\})',
        re.DOTALL | re.MULTILINE,
    )

    # One contract entry, e.g.
    # 'AU': {'has_night_session': True, 'margin_rate': {'long': 0.14, 'short': 0.14}, ...}
    _ENTRY_RE = re.compile(
        r"'([A-Z]+)':\s*\{[^}]*'margin_rate':\s*"
        r"\{'long':\s*([\d.]+),\s*'short':\s*([\d.]+)\}"
    )

    @staticmethod
    def _format_rate(rate) -> str:
        """Format a rate with at most 3 decimals and no trailing zeros."""
        return f"{round(rate, 3):.3f}".rstrip('0').rstrip('.')

    @staticmethod
    def _rate_pattern(contract_code) -> str:
        """Build the regex that isolates one contract's long/short rates.

        Groups 2 and 4 are the long and short values; groups 1, 3 and 5 are
        the surrounding text, kept so the entry can be rebuilt unchanged.
        """
        return (
            "('" + re.escape(str(contract_code)) + "':[^}]*'margin_rate':"
            r"\s*\{'long':\s*)([\d.]+)(,\s*'short':\s*)([\d.]+)(\})"
        )

    def read_config(self, file_path: str) -> Dict:
        """Read the margin rates stored in ``g.futures_config``.

        Args:
            file_path: Path of the code file.

        Returns:
            dict: ``{contract code: {'long': rate, 'short': rate}}``.

        Raises:
            ValueError: If the file has no ``g.futures_config`` literal whose
                closing brace is aligned with the assignment.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        block = self._CONFIG_BLOCK_RE.search(content)
        if block is None:
            raise ValueError("未找到g.futures_config配置")

        configs = {}
        for entry in self._ENTRY_RE.finditer(block.group('body')):
            configs[entry.group(1)] = {
                'long': float(entry.group(2)),
                'short': float(entry.group(3)),
            }

        print(f"[读取] 从 {file_path} 读取到 {len(configs)} 个合约配置")
        return configs

    def update_config(self, file_path: str, margin_data: pd.DataFrame) -> List[str]:
        """Overwrite the long/short margin rate of every known contract.

        Contracts from *margin_data* that the file does not contain are
        skipped.  The file is only rewritten when a value changed.

        Args:
            file_path: Path of the code file.
            margin_data: DataFrame with the columns '合约代码' and '投机%'
                (margin in percent).

        Returns:
            list: One record per contract found in the file, stating either
            the old and new rate or that the rate is unchanged.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            original = f.read()

        content = original
        changes = []

        for _, row in margin_data.iterrows():
            contract_code = row['合约代码']
            # The crawler delivers percentages; the file stores fractions.
            new_rate = row['投机%'] / 100

            match = re.search(self._rate_pattern(contract_code), content)
            if match is None:
                continue

            old_long = float(match.group(2))
            old_short = float(match.group(4))

            if abs(old_long - new_rate) < 0.0001 and abs(old_short - new_rate) < 0.0001:
                changes.append(f" {contract_code}: {round(new_rate, 3)} (不变)")
                continue

            new_rate_str = self._format_rate(new_rate)
            replacement = (
                f"{match.group(1)}{new_rate_str}{match.group(3)}"
                f"{new_rate_str}{match.group(5)}"
            )
            # Splice at the matched span so exactly this entry is rewritten.
            content = content[:match.start()] + replacement + content[match.end():]
            changes.append(f" {contract_code}: {round(old_long, 3)} -> {round(new_rate, 3)}")

        if content != original:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(content)

        return changes

    def add_new_contracts(self, file_path: str, new_contracts: List[str],
                          margin_data: pd.DataFrame) -> List[str]:
        """Append entries for new contracts to ``g.futures_config``.

        Every new entry is generated from a default template (night session
        enabled, multiplier 1, trading start 21:00) and has to be reviewed by
        hand afterwards.  Contracts without a row in *margin_data* are
        skipped.  The file is only rewritten when at least one entry is added.

        NOTE: the comma check below looks at the raw text of the last line, so
        a trailing comment after the last entry is not recognised as such.

        Args:
            file_path: Path of the code file.
            new_contracts: Codes of the contracts to add.
            margin_data: DataFrame with the columns '合约代码' and '投机%'.

        Returns:
            list: One record per added contract.

        Raises:
            ValueError: If the file has no ``g.futures_config`` literal whose
                closing brace is aligned with the assignment.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        block = self._CONFIG_BLOCK_RE.search(content)
        if block is None:
            raise ValueError("未找到g.futures_config配置块")

        # Everything from ``g.futures_config = {`` up to the closing brace.
        existing = content[block.start('head'):block.end('body')]
        stripped = existing.rstrip()
        last_line = stripped.rsplit('\n', 1)[-1]

        # The previous entry needs a separating comma unless it already has
        # one, or unless the dict is still empty (text ends with the brace).
        needs_comma = not stripped.endswith((',', '{'))

        # New entries are aligned with the last existing entry line; for a
        # dict without entry lines, one level deeper than the assignment.
        if '\n' in stripped:
            entry_indent = last_line[:len(last_line) - len(last_line.lstrip())]
        else:
            entry_indent = block.group('indent') + '    '

        additions = []
        new_lines = []
        for contract_code in new_contracts:
            rate_row = margin_data[margin_data['合约代码'] == contract_code]
            if rate_row.empty:
                continue

            rate = rate_row.iloc[0]['投机%'] / 100
            rate_str = self._format_rate(rate)

            new_lines.append(
                f"{entry_indent}'{contract_code}': {{'has_night_session': True, "
                f"'margin_rate': {{'long': {rate_str}, 'short': {rate_str}}}, "
                f"'multiplier': 1, 'trading_start_time': '21:00'}},"
            )
            additions.append(f" 新增 {contract_code}: 保证金率={round(rate, 3)}, multiplier=1 (需手动调整)")

        if not new_lines:
            return additions

        prefix = stripped + ',' if needs_comma else existing
        updated_block = prefix + '\n' + '\n'.join(new_lines) + block.group('tail')
        content = content[:block.start('head')] + updated_block + content[block.end():]

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

        return additions
- # ============================================================================
- # 管理类
- # ============================================================================
class MarginCrawlerManager:
    """Orchestrates crawling margin data and writing it into a strategy file."""

    # Reference values (contract code, expected speculation margin in %) that
    # a crawl result is checked against.  A mismatch is reported but does not
    # stop the run.
    _REFERENCE_RATES = (
        ('A', 16),
        ('CJ', 14),
    )

    def __init__(self, workspace_root: str = '/Users/maxfeng/Documents/GitHub/jukuan'):
        """Set up paths and the crawler/updater registries.

        The data backup directory ``<workspace_root>/data/future_margin`` is
        created if it does not exist yet.

        Args:
            workspace_root: Root directory of the workspace.
        """
        self.workspace_root = workspace_root
        self.backup_dir = os.path.join(workspace_root, 'data/future_margin')

        os.makedirs(self.backup_dir, exist_ok=True)

        # Available data sources, keyed by the name passed to run().
        self.crawler_configs = {
            'hua_future': {
                'name': '华安期货',
                'url': 'https://www.haqh.com/index.php?m=content&c=index&a=lists&catid=167',
                'crawler_class': HuaAnFuturesCrawler
            }
        }

        # Available target files, keyed by the name passed to run().
        self.updater_configs = {
            'MAPatternStrategy_v001': {
                'name': 'MA形态策略v001',
                'file': os.path.join(workspace_root, 'Lib/future/MAPatternStrategy_v001.py'),
                'updater_class': FuturesConfigUpdater
            },
            'MAPatternStrategy_v002': {
                'name': 'MA形态策略v002',
                'file': os.path.join(workspace_root, 'Lib/future/MAPatternStrategy_v002.py'),
                'updater_class': FuturesConfigUpdater
            }
        }

    @staticmethod
    def _print_step(title: str) -> None:
        """Print *title* framed by two 80-character rules, after a blank line."""
        rule = '=' * 80
        print(f"\n{rule}")
        print(title)
        print(rule)

    def run(self, crawler_key: str, updater_key: str):
        """Run the complete crawl, back-up and update workflow.

        Args:
            crawler_key: Key into ``crawler_configs``.
            updater_key: Key into ``updater_configs``.

        Raises:
            ValueError: If either key is unknown, or if the crawl returned no
                rows (nothing is backed up or rewritten in that case).
        """
        print("=" * 80)
        print("期货保证金爬取和更新工具")
        print("=" * 80)

        # 1. Resolve the configuration.
        if crawler_key not in self.crawler_configs:
            raise ValueError(f"未找到爬虫配置: {crawler_key}")
        if updater_key not in self.updater_configs:
            raise ValueError(f"未找到更新器配置: {updater_key}")

        crawler_config = self.crawler_configs[crawler_key]
        updater_config = self.updater_configs[updater_key]

        print(f"\n[配置] 数据源: {crawler_config['name']}")
        print(f"[配置] 目标文件: {updater_config['name']}")

        # 2. Crawl.
        self._print_step("步骤1: 爬取保证金数据")
        crawler = crawler_config['crawler_class'](crawler_config['url'])
        margin_data = crawler.crawl()

        # An empty result means the page layout changed or the request went
        # wrong; stop before any file is touched.
        if margin_data.empty:
            raise ValueError("爬取结果为空,已中止更新")

        self._validate_test_data(margin_data)

        # 3. Read what the target file currently contains.
        self._print_step("步骤2: 读取现有配置")
        updater = updater_config['updater_class']()
        file_path = updater_config['file']
        existing_config = updater.read_config(file_path)

        # 4. Compare the two sets of contract codes.
        self._print_step("步骤3: 比对合约代码")
        existing_contracts = set(existing_config.keys())
        crawled_contracts = set(margin_data['合约代码'].tolist())

        missing_in_crawl = existing_contracts - crawled_contracts
        new_in_crawl = crawled_contracts - existing_contracts

        if missing_in_crawl:
            print("[警告] 以下合约在代码文件中存在,但爬取结果中没有:")
            for contract in sorted(missing_in_crawl):
                print(f" - {contract}")

        if new_in_crawl:
            print("[发现] 以下合约在爬取结果中存在,但代码文件中没有:")
            for contract in sorted(new_in_crawl):
                rate = margin_data[margin_data['合约代码'] == contract].iloc[0]['投机%']
                print(f" - {contract} (保证金率: {rate}%)")

        if not missing_in_crawl and not new_in_crawl:
            print("[信息] 合约代码完全一致")

        # 5. Save both data sets as timestamped CSV files.
        self._print_step("步骤4: 备份数据")
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        origin_df = pd.DataFrame([
            {'合约代码': contract, 'long': rates['long'], 'short': rates['short']}
            for contract, rates in existing_config.items()
        ])
        origin_file = os.path.join(self.backup_dir, f"origin_{updater_key}_{timestamp}.csv")
        origin_df.to_csv(origin_file, index=False, encoding='utf-8')
        print(f"[备份] 原始配置已保存: {origin_file}")

        update_file = os.path.join(self.backup_dir, f"update_{crawler_key}_{timestamp}.csv")
        margin_data.to_csv(update_file, index=False, encoding='utf-8')
        print(f"[备份] 爬取数据已保存: {update_file}")

        # 6. Back up the code file itself.
        self._print_step("步骤5: 备份代码文件")
        updater.backup_file(file_path)

        # 7. Update existing entries.
        self._print_step("步骤6: 更新保证金配置")
        changes = updater.update_config(file_path, margin_data)
        print("[更新] 保证金配置变更记录:")
        for change in changes:
            print(change)

        # 8. Add entries for contracts the file did not know yet.  Sorted, so
        # the entries land in the file in the same order on every run.
        if new_in_crawl:
            self._print_step("步骤7: 新增合约配置")
            additions = updater.add_new_contracts(file_path, sorted(new_in_crawl), margin_data)
            print("[新增] 合约配置新增记录:")
            for addition in additions:
                print(addition)

        self._print_step("完成!")

    def _validate_test_data(self, margin_data: pd.DataFrame):
        """Check the crawl result against the hard-coded reference rates.

        The outcome is printed per contract.

        Args:
            margin_data: Crawled margin data with the columns '合约代码' and
                '投机%'.

        Returns:
            bool: True if every reference contract is present with the
            expected rate (tolerance 0.01), otherwise False.
        """
        print("\n[验证] 检查测试数据...")

        all_passed = True
        for contract_code, expected_rate in self._REFERENCE_RATES:
            row = margin_data[margin_data['合约代码'] == contract_code]
            if row.empty:
                print(f" ✗ {contract_code}: 未找到数据")
                all_passed = False
                continue

            actual_rate = row.iloc[0]['投机%']
            if abs(actual_rate - expected_rate) < 0.01:
                print(f" ✓ {contract_code}: {actual_rate}% (预期: {expected_rate}%)")
            else:
                print(f" ✗ {contract_code}: {actual_rate}% (预期: {expected_rate}%)")
                all_passed = False

        if all_passed:
            print("[验证] 测试数据验证通过!")
        else:
            print("[验证] 测试数据验证失败!")
        return all_passed
- # ============================================================================
- # 主程序入口
- # ============================================================================
def main():
    """Script entry point.

    Builds a manager with its default workspace root and refreshes the
    margins of the 'MAPatternStrategy_v001' target from the 'hua_future'
    data source.
    """
    manager = MarginCrawlerManager()
    manager.run(crawler_key='hua_future', updater_key='MAPatternStrategy_v001')


if __name__ == '__main__':
    main()
|