
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Futures margin crawler and updater.

Features:
1. Crawl futures margin data from a configured website.
2. Update the margin configuration in strategy code files.
3. Back up both the crawled data and the target files.
"""
import os
import re
import shutil
from abc import ABC, abstractmethod
from datetime import datetime
from typing import Dict, List
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


# ============================================================================
# Abstract base classes
# ============================================================================
class WebCrawler(ABC):
    """Base class for web crawlers."""

    @abstractmethod
    def crawl(self) -> pd.DataFrame:
        """
        Crawl margin data.

        Returns:
            pd.DataFrame: DataFrame with columns ['合约代码', '投机%']
                (contract code, speculative margin %).
        """
        pass

class CodeUpdater(ABC):
    """Base class for code-file updaters."""

    @abstractmethod
    def read_config(self, file_path: str) -> Dict:
        """
        Read the existing configuration from a code file.

        Args:
            file_path: Path to the code file.

        Returns:
            dict: Configuration dictionary.
        """
        pass

    @abstractmethod
    def update_config(self, file_path: str, margin_data: pd.DataFrame) -> List[str]:
        """
        Update the margin configuration.

        Args:
            file_path: Path to the code file.
            margin_data: DataFrame of margin data.

        Returns:
            list: List of change records.
        """
        pass

    @abstractmethod
    def add_new_contracts(self, file_path: str, new_contracts: List[str],
                          margin_data: pd.DataFrame) -> List[str]:
        """
        Add configuration entries for new contracts (abstract: the
        implementation depends on the target file format).

        Args:
            file_path: Path to the code file.
            new_contracts: List of new contract codes.
            margin_data: DataFrame of margin data.

        Returns:
            list: List of addition records.
        """
        pass

    def backup_file(self, file_path: str) -> str:
        """
        Create a backup copy of a file.

        Args:
            file_path: Path of the file to back up.

        Returns:
            str: Path of the backup file.
        """
        backup_path = f"{file_path}.bak"
        shutil.copy2(file_path, backup_path)
        print(f"[Backup] Created file backup: {backup_path}")
        return backup_path

# ============================================================================
# Concrete implementations - crawlers
# ============================================================================
class HuaAnFuturesCrawler(WebCrawler):
    """Crawler for the Hua'an Futures website."""

    def __init__(self, base_url: str):
        """
        Initialize the crawler.

        Args:
            base_url: URL of the Hua'an Futures margin listing page.
        """
        self.base_url = base_url
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def crawl(self) -> pd.DataFrame:
        """
        Crawl Hua'an Futures margin data.

        Returns:
            pd.DataFrame: DataFrame with columns ['合约代码', '投机%'].
        """
        print(f"[Crawl] Visiting Hua'an Futures website: {self.base_url}")
        # 1. Fetch the listing page.
        response = self.session.get(self.base_url, timeout=30)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        # 2. Find the margin standard link. The keywords are matched against
        #    the Chinese page text (保证金 = margin, 标准 = standard,
        #    比例 = ratio). Any margin link is kept as a fallback; the loop
        #    stops early at a link that also mentions standard/ratio.
        margin_link = None
        for link in soup.find_all('a'):
            link_text = link.text.strip() if link.text else ''
            if '保证金' in link_text:
                margin_link = link.get('href')
                if '标准' in link_text or '比例' in link_text:
                    break
        if not margin_link:
            raise ValueError("Margin standard link not found")
        # Resolve relative URLs against the listing page.
        if not margin_link.startswith('http'):
            margin_link = urljoin(self.base_url, margin_link)
        print(f"[Crawl] Found margin standard link: {margin_link}")
        # 3. Fetch the margin detail page.
        response = self.session.get(margin_link, timeout=30)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        # 4. Locate the first tbody.
        tbody = soup.find('tbody')
        if not tbody:
            raise ValueError("Data table not found")
        print("[Crawl] Parsing margin data table")
        # 5. Parse the table rows.
        data = []
        rows = tbody.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            # Expect at least 7 columns: exchange, product, contract code,
            # client speculative, client hedging, exchange speculative,
            # exchange hedging.
            if len(cols) < 7:
                continue
            # Contract code is in the 3rd column (index 2).
            contract_code = cols[2].text.strip()
            # Keep only purely alphabetic codes of at most 2 letters.
            if not contract_code.isalpha() or len(contract_code) > 2:
                continue
            # Normalize to upper case (the code files use upper case).
            contract_code = contract_code.upper()
            # Client speculative margin % is in the 4th column (index 3).
            speculation_text = cols[3].text.strip()
            # Parse the percentage value.
            try:
                speculation_rate = float(speculation_text.replace('%', ''))
            except ValueError:
                print(f"[Warning] Cannot parse speculative rate for contract {contract_code}: {speculation_text}")
                continue
            data.append({
                '合约代码': contract_code,
                '投机%': speculation_rate
            })
        df = pd.DataFrame(data)
        print(f"[Crawl] Fetched margin data for {len(df)} contracts")
        return df
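
    # For reference, a sketch of the row -> record mapping performed above
    # (hypothetical HTML row for illustration; the site's actual markup may
    # differ):
    #   <td>DCE</td><td>豆一</td><td>A</td><td>16%</td>...
    #     -> {'合约代码': 'A', '投机%': 16.0}
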
# ============================================================================
# Concrete implementations - code updaters
# ============================================================================
class FuturesConfigUpdater(CodeUpdater):
    """Futures config updater targeting the g.futures_config dictionary."""

    def read_config(self, file_path: str) -> Dict:
        """
        Read the g.futures_config configuration from a code file.

        Args:
            file_path: Path to the code file.

        Returns:
            dict: {contract code: {'long': rate, 'short': rate}}
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Match the g.futures_config dictionary up to its indented closing
        # brace.
        pattern = r'g\.futures_config\s*=\s*\{(.*?)\n \}'
        match = re.search(pattern, content, re.DOTALL)
        if not match:
            raise ValueError("g.futures_config configuration not found")
        config_block = match.group(1)
        # Parse each contract's margin configuration. A config line looks like:
        # 'AU': {'has_night_session': True, 'margin_rate': {'long': 0.14, 'short': 0.14}, ...}
        configs = {}
        contract_pattern = r"'([A-Z]+)':\s*\{[^}]*'margin_rate':\s*\{'long':\s*([\d.]+),\s*'short':\s*([\d.]+)\}"
        for match in re.finditer(contract_pattern, config_block):
            contract_code = match.group(1)
            long_rate = float(match.group(2))
            short_rate = float(match.group(3))
            configs[contract_code] = {'long': long_rate, 'short': short_rate}
        print(f"[Read] Read {len(configs)} contract configs from {file_path}")
        return configs
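
    # A minimal sketch of the block read_config() expects to find in the
    # strategy file (hypothetical entries; the real strategy file defines its
    # own contracts and values):
    #
    #   g.futures_config = {
    #       'AU': {'has_night_session': True, 'margin_rate': {'long': 0.14, 'short': 0.14}, 'multiplier': 1000, 'trading_start_time': '21:00'},
    #       'CJ': {'has_night_session': False, 'margin_rate': {'long': 0.14, 'short': 0.14}, 'multiplier': 5, 'trading_start_time': '09:00'},
    #   }
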
    def update_config(self, file_path: str, margin_data: pd.DataFrame) -> List[str]:
        """
        Update the margin configuration.

        Args:
            file_path: Path to the code file.
            margin_data: DataFrame of margin data.

        Returns:
            list: List of change records.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        changes = []
        for _, row in margin_data.iterrows():
            contract_code = row['合约代码']
            new_rate = row['投机%'] / 100
            # Match the whole config line for this contract (anchored on the
            # contract code, so the correct line is replaced).
            pattern = f"('{contract_code}':[^}}]*'margin_rate':\\s*\\{{'long':\\s*)([\\d.]+)(,\\s*'short':\\s*)([\\d.]+)(\\}})"
            match = re.search(pattern, content)
            if not match:
                continue
            old_long = float(match.group(2))
            old_short = float(match.group(4))
            # Skip if the rates are already up to date.
            if abs(old_long - new_rate) < 0.0001 and abs(old_short - new_rate) < 0.0001:
                changes.append(f"    {contract_code}: {round(new_rate, 3)} (unchanged)")
            else:
                # Replace the margin values using the full match for precision.
                old_full_str = match.group(0)
                # Keep 3 decimal places, trimming trailing zeros.
                new_rate_str = f"{round(new_rate, 3):.3f}".rstrip('0').rstrip('.')
                new_full_str = f"{match.group(1)}{new_rate_str}{match.group(3)}{new_rate_str}{match.group(5)}"
                content = content.replace(old_full_str, new_full_str, 1)
                changes.append(f"    {contract_code}: {round(old_long, 3)} -> {round(new_rate, 3)}")
        # Write the updated content back.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
        return changes
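
    # A worked example of one replacement (hypothetical values): with a
    # crawled rate of 15 for 'AU', the matched line
    #   'AU': {..., 'margin_rate': {'long': 0.14, 'short': 0.14}, ...}
    # becomes
    #   'AU': {..., 'margin_rate': {'long': 0.15, 'short': 0.15}, ...}
    # Note the trimming step: 15 / 100 -> "0.150" -> "0.15".
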
    def add_new_contracts(self, file_path: str, new_contracts: List[str],
                          margin_data: pd.DataFrame) -> List[str]:
        """
        Add new contract entries to g.futures_config.

        Args:
            file_path: Path to the code file.
            new_contracts: List of new contract codes.
            margin_data: DataFrame of margin data.

        Returns:
            list: List of addition records.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        additions = []
        # Locate the end of the g.futures_config dictionary.
        pattern = r'(g\.futures_config\s*=\s*\{.*?)(\n \})'
        match = re.search(pattern, content, re.DOTALL)
        if not match:
            raise ValueError("g.futures_config block not found")
        config_block = match.group(1)
        config_end = match.group(2)
        # Check whether the last line already ends with a comma.
        config_lines = config_block.rstrip().split('\n')
        last_line = config_lines[-1] if config_lines else ''
        needs_comma = last_line.strip() and not last_line.rstrip().endswith(',')
        # Append a comma to the last line if needed.
        if needs_comma:
            config_block = config_block.rstrip() + ','
        # Build the new configuration lines.
        new_lines = []
        for contract_code in new_contracts:
            # Look up the margin rate in margin_data.
            rate_row = margin_data[margin_data['合约代码'] == contract_code]
            if rate_row.empty:
                continue
            rate = rate_row.iloc[0]['投机%'] / 100
            # Keep 3 decimal places, trimming trailing zeros.
            rate_str = f"{round(rate, 3):.3f}".rstrip('0').rstrip('.')
            # Generate the new config line from a default template.
            new_config = f"    '{contract_code}': {{'has_night_session': True, 'margin_rate': {{'long': {rate_str}, 'short': {rate_str}}}, 'multiplier': 1, 'trading_start_time': '21:00'}},"
            new_lines.append(new_config)
            additions.append(f"    Added {contract_code}: margin rate={round(rate, 3)}, multiplier=1 (adjust manually)")
        # Insert the new lines just before the dictionary's closing brace.
        if new_lines:
            new_content = config_block + '\n' + '\n'.join(new_lines) + config_end
            content = content.replace(match.group(0), new_content)
        # Write the updated content back.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
        return additions
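
    # Example of a line the template above renders (hypothetical new contract
    # 'SC' with a crawled rate of 15%):
    #   'SC': {'has_night_session': True, 'margin_rate': {'long': 0.15, 'short': 0.15}, 'multiplier': 1, 'trading_start_time': '21:00'},
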
# ============================================================================
# Manager
# ============================================================================
class MarginCrawlerManager:
    """Margin crawling manager."""

    def __init__(self, workspace_root: str = '/Users/maxfeng/Documents/GitHub/jukuan'):
        """
        Initialize the manager.

        Args:
            workspace_root: Workspace root directory.
        """
        self.workspace_root = workspace_root
        self.backup_dir = os.path.join(workspace_root, 'data/future_margin')
        # Make sure the backup directory exists.
        os.makedirs(self.backup_dir, exist_ok=True)
        # Crawler registry.
        self.crawler_configs = {
            'hua_future': {
                'name': "Hua'an Futures",
                'url': 'https://www.haqh.com/index.php?m=content&c=index&a=lists&catid=167',
                'crawler_class': HuaAnFuturesCrawler
            }
        }
        # Updater registry.
        self.updater_configs = {
            'MAPatternStrategy_v001': {
                'name': 'MA pattern strategy v001',
                'file': os.path.join(workspace_root, 'Lib/future/MAPatternStrategy_v001.py'),
                'updater_class': FuturesConfigUpdater
            },
            'MAPatternStrategy_v002': {
                'name': 'MA pattern strategy v002',
                'file': os.path.join(workspace_root, 'Lib/future/MAPatternStrategy_v002.py'),
                'updater_class': FuturesConfigUpdater
            }
        }
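
    # To support another broker site or strategy file, register a new entry in
    # the registries above. A minimal sketch (hypothetical names and URL):
    #
    #   self.crawler_configs['other_broker'] = {
    #       'name': 'Other Broker',
    #       'url': 'https://example.com/margin-list',
    #       'crawler_class': OtherBrokerCrawler,  # a WebCrawler subclass
    #   }
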
    def run(self, crawler_key: str, updater_key: str):
        """
        Execute the full crawl-and-update workflow.

        Args:
            crawler_key: Key of the crawler config.
            updater_key: Key of the updater config.
        """
        print("=" * 80)
        print("Futures margin crawler and updater")
        print("=" * 80)
        # 1. Resolve the configurations.
        if crawler_key not in self.crawler_configs:
            raise ValueError(f"Crawler config not found: {crawler_key}")
        if updater_key not in self.updater_configs:
            raise ValueError(f"Updater config not found: {updater_key}")
        crawler_config = self.crawler_configs[crawler_key]
        updater_config = self.updater_configs[updater_key]
        print(f"\n[Config] Data source: {crawler_config['name']}")
        print(f"[Config] Target file: {updater_config['name']}")
        # 2. Crawl the data.
        print(f"\n{'=' * 80}")
        print("Step 1: Crawl margin data")
        print(f"{'=' * 80}")
        crawler = crawler_config['crawler_class'](crawler_config['url'])
        margin_data = crawler.crawl()
        # Sanity-check against known test values.
        self._validate_test_data(margin_data)
        # 3. Read the existing configuration.
        print(f"\n{'=' * 80}")
        print("Step 2: Read existing configuration")
        print(f"{'=' * 80}")
        updater = updater_config['updater_class']()
        file_path = updater_config['file']
        existing_config = updater.read_config(file_path)
        # 4. Compare contract codes.
        print(f"\n{'=' * 80}")
        print("Step 3: Compare contract codes")
        print(f"{'=' * 80}")
        existing_contracts = set(existing_config.keys())
        crawled_contracts = set(margin_data['合约代码'].tolist())
        missing_in_crawl = existing_contracts - crawled_contracts
        new_in_crawl = crawled_contracts - existing_contracts
        if missing_in_crawl:
            print("[Warning] Contracts present in the code file but missing from the crawl:")
            for contract in sorted(missing_in_crawl):
                print(f"  - {contract}")
        if new_in_crawl:
            print("[Found] Contracts present in the crawl but missing from the code file:")
            for contract in sorted(new_in_crawl):
                rate = margin_data[margin_data['合约代码'] == contract].iloc[0]['投机%']
                print(f"  - {contract} (margin rate: {rate}%)")
        if not missing_in_crawl and not new_in_crawl:
            print("[Info] Contract codes match exactly")
        # 5. Back up the data.
        print(f"\n{'=' * 80}")
        print("Step 4: Back up data")
        print(f"{'=' * 80}")
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # Back up the original configuration.
        origin_data = []
        for contract, rates in existing_config.items():
            origin_data.append({
                '合约代码': contract,
                'long': rates['long'],
                'short': rates['short']
            })
        origin_df = pd.DataFrame(origin_data)
        origin_file = os.path.join(self.backup_dir, f"origin_{updater_key}_{timestamp}.csv")
        origin_df.to_csv(origin_file, index=False, encoding='utf-8')
        print(f"[Backup] Original config saved: {origin_file}")
        # Back up the crawled data.
        update_file = os.path.join(self.backup_dir, f"update_{crawler_key}_{timestamp}.csv")
        margin_data.to_csv(update_file, index=False, encoding='utf-8')
        print(f"[Backup] Crawled data saved: {update_file}")
        # 6. Back up the code file.
        print(f"\n{'=' * 80}")
        print("Step 5: Back up code file")
        print(f"{'=' * 80}")
        updater.backup_file(file_path)
        # 7. Update the configuration.
        print(f"\n{'=' * 80}")
        print("Step 6: Update margin configuration")
        print(f"{'=' * 80}")
        changes = updater.update_config(file_path, margin_data)
        print("[Update] Margin configuration changes:")
        for change in changes:
            print(change)
        # 8. Add new contracts.
        if new_in_crawl:
            print(f"\n{'=' * 80}")
            print("Step 7: Add new contract configs")
            print(f"{'=' * 80}")
            additions = updater.add_new_contracts(file_path, list(new_in_crawl), margin_data)
            print("[Add] New contract config records:")
            for addition in additions:
                print(addition)
        print(f"\n{'=' * 80}")
        print("Done!")
        print(f"{'=' * 80}")
    def _validate_test_data(self, margin_data: pd.DataFrame):
        """
        Validate the crawled data against known test values.

        Args:
            margin_data: Crawled margin data.
        """
        print("\n[Validate] Checking test data...")
        test_cases = [
            ('A', 16),
            ('CJ', 14)
        ]
        all_passed = True
        for contract_code, expected_rate in test_cases:
            row = margin_data[margin_data['合约代码'] == contract_code]
            if row.empty:
                print(f"  ✗ {contract_code}: no data found")
                all_passed = False
            else:
                actual_rate = row.iloc[0]['投机%']
                if abs(actual_rate - expected_rate) < 0.01:
                    print(f"  ✓ {contract_code}: {actual_rate}% (expected: {expected_rate}%)")
                else:
                    print(f"  ✗ {contract_code}: {actual_rate}% (expected: {expected_rate}%)")
                    all_passed = False
        if all_passed:
            print("[Validate] Test data validation passed!")
        else:
            print("[Validate] Test data validation failed!")

# ============================================================================
# Entry point
# ============================================================================
def main():
    """Main function."""
    # Create the manager.
    manager = MarginCrawlerManager()
    # Run the crawl-and-update workflow.
    manager.run(
        crawler_key='hua_future',
        updater_key='MAPatternStrategy_v001'
    )


if __name__ == '__main__':
    main()
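
# Run directly (python margin_crawler.py) to update MAPatternStrategy_v001.
# To target the other registered strategy file, pass its key instead, e.g.:
#   manager.run(crawler_key='hua_future', updater_key='MAPatternStrategy_v002')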