hace 6 meses · 9f9e20250c
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,6 +6,8 @@ readme = "README.md"
 
				 requires-python = ">=3.10"
			
 
				 dependencies = [
			
 
				     "paddleocr>=2.10.0",
			
 
				+    "paddlepaddle>=3.0.0",
			
 
				     "pdf2image>=1.17.0",
			
 
				     "pyyaml>=6.0.2",
			
 
				+    "setuptools>=79.0.1",
			
 
				 ]
			
--- a/src/main.py
+++ b/src/main.py
@@ -1,10 +1,18 @@
 
				 import os
			
 
				 import glob
			
 
				+import logging
			
 
				+import json # 添加 json 库
			
 
				+from datetime import date # 添加 datetime.date
			
 
				 from config_loader import load_config
			
 
				 from pdf_processor import PdfProcessor
			
 
				 from trading_calculator import TradingCalculator
			
 
				 from data_exporter import export_data
			
 
				 
			
 
				+# 在所有导入之后，强制设置日志配置
			
 
				+logging.basicConfig(level=logging.INFO, 
			
 
				+                    format='%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s', 
			
 
				+                    force=True) # 使用 force=True (Python 3.8+)
			
 
				+
			
 
				 def main():
			
 
				     """
			
 
				     主函数，协调整个处理流程。
			
@@ -13,52 +21,138 @@ def main():
 
				     config_path = 'config/config.yaml'
			
 
				     config = load_config(config_path)
			
 
				     if not config:
			
 
				-        print("无法加载配置，程序退出。")
			
 
				+        logging.error("无法加载配置，程序退出。")
			
 
				         return
			
 
				 
			
 
				     pdf_directory = config.get('pdf_directory')
			
 
				     output_directory = config.get('output_directory')
			
 
				 
			
 
				     if not pdf_directory or not output_directory:
			
 
				-        print("错误：配置文件中缺少 pdf_directory 或 output_directory。")
			
 
				+        logging.error("错误：配置文件中缺少 pdf_directory 或 output_directory。")
			
 
				         return
			
 
				 
			
 
				-    # 2. 获取 PDF 文件列表
			
 
				-    pdf_files = glob.glob(os.path.join(pdf_directory, '*.pdf'))
			
 
				-    if not pdf_files:
			
 
				-        print(f"在目录 {pdf_directory} 未找到 PDF 文件。")
			
 
				+    # --- 缓存逻辑开始 ---
			
 
				+    cache_filename = 'pdf_cache.json'
			
 
				+    cache_path = os.path.join(output_directory, cache_filename)
			
 
				+    cached_pdf_files_info = None # 初始化为 None，用于存储从缓存加载的数据
			
 
				+
			
 
				+    # 尝试加载缓存
			
 
				+    if os.path.exists(cache_path):
			
 
				+        try:
			
 
				+            with open(cache_path, 'r', encoding='utf-8') as f:
			
 
				+                loaded_cache_data = json.load(f)
			
 
				+            # 验证并转换日期
			
 
				+            if isinstance(loaded_cache_data, list):
			
 
				+                validated_cache_data = []
			
 
				+                valid_cache = True
			
 
				+                for item in loaded_cache_data:
			
 
				+                    try:
			
 
				+                        # 转换日期字符串回 date 对象
			
 
				+                        item['record_date'] = date.fromisoformat(item['record_date'])
			
 
				+                        validated_cache_data.append(item)
			
 
				+                    except (TypeError, ValueError, KeyError) as e:
			
 
				+                        logging.warning(f"缓存项格式错误或日期转换失败: {item}, 错误: {e}。将忽略此项并继续。")
			
 
				+                        # 如果希望严格模式，任何一个错误都导致重新处理，可以在这里 valid_cache = False; break
			
 
				+                if validated_cache_data: # 确保列表不为空且有效记录存在
			
 
				+                    cached_pdf_files_info = validated_cache_data
			
 
				+                    logging.info(f"从缓存文件 {cache_path} 加载并转换了 {len(cached_pdf_files_info)} 条记录。")
			
 
				+                else:
			
 
				+                    logging.warning(f"缓存文件 {cache_path} 中没有有效的记录，将重新处理 PDF。")
			
 
				+            else:
			
 
				+                logging.warning(f"缓存文件 {cache_path} 格式不正确（不是列表），将重新处理 PDF。")
			
 
				+        except (json.JSONDecodeError, IOError) as e:
			
 
				+            logging.warning(f"加载缓存文件 {cache_path} 失败: {e}，将重新处理 PDF。")
			
 
				+    else:
			
 
				+        logging.info(f"缓存文件 {cache_path} 不存在。")
			
 
				+
			
 
				+    # 2. 获取当前目录的 PDF 文件列表
			
 
				+    pdf_files_in_directory = glob.glob(os.path.join(pdf_directory, '*.pdf'))
			
 
				+    if not pdf_files_in_directory:
			
 
				+        logging.error(f"在目录 {pdf_directory} 未找到 PDF 文件。")
			
 
				         return
			
 
				-    print(f"找到 {len(pdf_files)} 个 PDF 文件。")
			
 
				-
			
 
				-    # 3. 处理每个 PDF 文件，提取信息
			
 
				-    pdf_files_info = []
			
 
				-    for pdf_path in pdf_files:
			
 
				-        processor = PdfProcessor(pdf_path)
			
 
				-        record_date = processor.record_date
			
 
				-        if record_date: # 只处理能成功提取日期的文件
			
 
				-             holdings = processor.extract_holdings()
			
 
				-             pdf_files_info.append({'record_date': record_date, 'codes': holdings, 'path': pdf_path})
			
 
				+
			
 
				+    logging.info(f"当前目录中找到 {len(pdf_files_in_directory)} 个 PDF 文件。")
			
 
				+
			
 
				+    # 检查缓存是否可以直接使用或需要更新
			
 
				+    should_process_all_pdfs = True # 默认需要处理所有 PDF (如果缓存不存在或无效)
			
 
				+    pdf_files_to_process = [] # 需要处理的 PDF 文件列表
			
 
				+    final_pdf_files_info = [] # 最终用于计算的数据列表
			
 
				+
			
 
				+    if cached_pdf_files_info is not None:
			
 
				+        cached_paths = {item['path'] for item in cached_pdf_files_info}
			
 
				+        # 找出在目录中但不在缓存中的 PDF 文件
			
 
				+        new_pdf_files = [p for p in pdf_files_in_directory if p not in cached_paths]
			
 
				+
			
 
				+        if len(cached_pdf_files_info) == len(pdf_files_in_directory) and not new_pdf_files:
			
 
				+            # 情况 1: 缓存记录数量与 PDF 文件数量一致，且没有新的 PDF
			
 
				+            logging.info("缓存记录数量与目录 PDF 文件数量一致，且没有新增文件，直接使用缓存数据。")
			
 
				+            should_process_all_pdfs = False
			
 
				+            final_pdf_files_info = cached_pdf_files_info # 直接使用缓存数据
			
 
				         else:
			
 
				-            print(f"跳过文件 {pdf_path}，无法提取记录日期。")
			
 
				+            # 情况 2: 缓存记录数量与 PDF 文件数量不一致，或有新的 PDF
			
 
				+            logging.warning("缓存与目录 PDF 文件不一致，将处理新增或有差异的 PDF。")
			
 
				+            should_process_all_pdfs = False # 不处理所有，只处理新增的
			
 
				+            pdf_files_to_process = new_pdf_files # 需要处理的是新文件
			
 
				+            final_pdf_files_info.extend(cached_pdf_files_info) # 将现有缓存数据加入最终列表
			
 
				+
			
 
				+    if should_process_all_pdfs:
			
 
				+         # 如果没有有效的缓存，处理目录中的所有 PDF
			
 
				+        logging.info("没有可用的缓存，将处理目录中的所有 PDF 文件。")
			
 
				+        pdf_files_to_process = pdf_files_in_directory # 需要处理所有 PDF
			
 
				+
			
 
				+    if pdf_files_to_process:
			
 
				+        logging.info(f"开始处理 {len(pdf_files_to_process)} 个 PDF 文件...")
			
 
				+        processed_new_files_info = []
			
 
				+        for pdf_path in pdf_files_to_process:
			
 
				+            processor = PdfProcessor(pdf_path)
			
 
				+            record_date = processor.record_date
			
 
				+            if record_date:
			
 
				+                 holdings = processor.extract_holdings()
			
 
				+                 processed_new_files_info.append({'record_date': record_date, 'codes': holdings, 'path': pdf_path})
			
 
				+            else:
			
 
				+                logging.error(f"跳过文件 {pdf_path}，无法提取记录日期。")
			
 
				 
			
 
				+        # 将新处理的数据添加到最终列表中
			
 
				+        final_pdf_files_info.extend(processed_new_files_info)
			
 
				 
			
 
				-    if not pdf_files_info:
			
 
				-        print("没有成功处理任何 PDF 文件以提取信息。")
			
 
				+        # --- 保存/更新缓存 ---
			
 
				+        if final_pdf_files_info: # 仅在最终数据非空时保存
			
 
				+             # 按 record_date 排序
			
 
				+            final_pdf_files_info.sort(key=lambda x: x['record_date'])
			
 
				+            try:
			
 
				+                # 确保输出目录存在
			
 
				+                os.makedirs(output_directory, exist_ok=True)
			
 
				+                with open(cache_path, 'w', encoding='utf-8') as f:
			
 
				+                    # 在保存前转换 date 对象为 ISO 格式字符串
			
 
				+                    serializable_data = [
			
 
				+                        {**item, 'record_date': item['record_date'].isoformat()}
			
 
				+                        for item in final_pdf_files_info
			
 
				+                    ]
			
 
				+                    json.dump(serializable_data, f, ensure_ascii=False, indent=4)
			
 
				+                logging.info(f"PDF 处理结果已保存/更新到缓存文件 {cache_path}。")
			
 
				+            except IOError as e:
			
 
				+                logging.error(f"保存缓存文件 {cache_path} 失败: {e}")
			
 
				+    elif not final_pdf_files_info and cached_pdf_files_info is not None:
			
 
				+        # 这种情况是缓存存在，但没有新的或不一致的文件需要处理，且缓存本身也可能为空（虽然上面有检查）
			
 
				+        # 或者缓存加载成功但校验后发现没有有效记录，然后也没有新文件...
			
 
				+        # 如果 final_pdf_files_info 是空的，说明没有数据可以用于后续计算
			
 
				+        logging.warning("没有需要处理的新 PDF 文件，且缓存中没有有效数据。")
			
 
				+        # 这里不保存缓存，因为它没有有效数据
			
 
				+
			
 
				+    # 使用 final_pdf_files_info 进行后续计算
			
 
				+    if not final_pdf_files_info:
			
 
				+        logging.error("没有成功获取任何 PDF 文件信息以进行计算。")
			
 
				         return
			
 
				 
			
 
				-    # 确保按记录日期排序
			
 
				-    pdf_files_info.sort(key=lambda x: x['record_date'])
			
 
				+    # --- 缓存逻辑结束 ---
			
 
				 
			
 
				     # 4. 计算交易日期和买卖点
			
 
				-    calculator = TradingCalculator(pdf_files_info)
			
 
				-    trading_dates = calculator.calculate_trading_dates()
			
 
				-    if not trading_dates:
			
 
				-        print("未能计算出任何交易日期。")
			
 
				-        return
			
 
				+    # 现在 calculator 应该使用 final_pdf_files_info
			
 
				+    calculator = TradingCalculator(final_pdf_files_info)
			
 
				 
			
 
				-    buy_sell_data = calculator.determine_buy_sell_dates(trading_dates)
			
 
				+    buy_sell_data = calculator.determine_buy_sell_dates()
			
 
				     if not buy_sell_data:
			
 
				-        print("未能确定任何买卖交易。")
			
 
				+        logging.warning("未能确定任何有明确买卖日期的交易。")
			
 
				         # return # 可能没有交易是正常情况，所以不一定退出
			
 
				 
			
 
				     # 5. 导出数据
			
@@ -66,7 +160,7 @@ def main():
 
				     output_path = os.path.join(output_directory, output_filename)
			
 
				     export_data(buy_sell_data, output_path)
			
 
				 
			
 
				-    print("处理完成。")
			
 
				+    logging.info("处理完成。")
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				     main() 
			
--- a/src/pdf_processor.py
+++ b/src/pdf_processor.py
@@ -1,24 +1,35 @@
 
				 import os
			
 
				 from datetime import datetime
			
 
				-from pdf2image import convert_from_path
			
 
				+from pdf2image import convert_from_path, pdfinfo_from_path
			
 
				 from paddleocr import PaddleOCR
			
 
				+from paddleocr import PPStructure
			
 
				 import re
			
 
				 import numpy as np
			
 
				 import logging
			
 
				 
			
 
				 # 配置日志记录
			
 
				-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
			
 
				+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(filename)s - %(message)s')
			
 
				 
			
 
				-# 初始化 PaddleOCR，指定使用中文和英文模型，并开启表格识别
			
 
				-# 注意：首次运行时会自动下载模型文件
			
 
				+# 初始化 PaddleOCR，指定使用中文和英文模型
			
 
				+# 主要用于非表格文本识别或作为备用
			
 
				 try:
			
 
				-    # 尝试CPU版本，如果需要GPU加速且环境支持，可改为 use_gpu=True
			
 
				-    ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=False, show_log=False, use_space_char=True, table=True)
			
 
				-    logging.info("PaddleOCR 初始化成功 (CPU)。")
			
 
				+    # 尝试CPU版本
			
 
				+    ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=False, show_log=False, use_space_char=True)
			
 
				+    logging.info("PaddleOCR (基础 OCR) 初始化成功 (CPU)。")
			
 
				 except Exception as e:
			
 
				     logging.error(f"PaddleOCR 初始化失败: {e}")
			
 
				     ocr = None # 标记初始化失败
			
 
				 
			
 
				+# 初始化 PPStructure，专门用于表格识别
			
 
				+try:
			
 
				+    # layout=False 表示我们主要关心表格内容识别，而不是完整的文档布局分析
			
 
				+    # 如果需要更复杂的文档结构分析，可以将 layout 设置为 True 或使用默认值
			
 
				+    table_engine = PPStructure(lang='ch', show_log=False, layout=False)
			
 
				+    logging.info("PPStructure (表格引擎) 初始化成功。")
			
 
				+except Exception as e:
			
 
				+    logging.error(f"PPStructure 初始化失败: {e}")
			
 
				+    table_engine = None # 标记初始化失败
			
 
				+
			
 
				 class PdfProcessor:
			
 
				     """
			
 
				     处理单个 PDF 文件，提取记录日期和持仓信息。
			
@@ -33,7 +44,9 @@ class PdfProcessor:
 
				         self.pdf_path = pdf_path
			
 
				         self.record_date = self.extract_record_date()
			
 
				         if not ocr:
			
 
				-             logging.warning("PaddleOCR 未成功初始化，extract_holdings 将无法工作。")
			
 
				+             logging.warning("PaddleOCR 未成功初始化，通用 OCR 功能可能受限。")
			
 
				+        if not table_engine:
			
 
				+             logging.warning("PPStructure 未成功初始化，extract_holdings 将无法进行表格识别。")
			
 
				 
			
 
				 
			
 
				     def extract_record_date(self):
			
@@ -64,144 +77,163 @@ class PdfProcessor:
 
				     def _extract_codes_from_table(self, table_html):
			
 
				         """ 从识别出的表格HTML中提取股票代码 """
			
 
				         codes = set() # 使用集合去重
			
 
				-        # 简单的HTML解析，提取<td>标签内容
			
 
				-        # 注意：PaddleOCR返回的HTML结构可能变化，这里需要健壮性处理
			
 
				-        # 找到<tbody>开始的部分
			
 
				-        tbody_match = re.search(r'<tbody.*?>(.*?)</tbody>', table_html, re.DOTALL | re.IGNORECASE)
			
 
				-        if not tbody_match:
			
 
				-            logging.warning("未能在表格HTML中找到 tbody。")
			
 
				-            return list(codes)
			
 
				-
			
 
				-        tbody_content = tbody_match.group(1)
			
 
				-
			
 
				         # 查找所有行 <tr>
			
 
				-        rows = re.findall(r'<tr.*?>(.*?)</tr>', tbody_content, re.DOTALL | re.IGNORECASE)
			
 
				-        if not rows:
			
 
				-             logging.warning("未能在表格 tbody 中找到任何行 <tr>。")
			
 
				+        rows_html = re.findall(r'<tr.*?>(.*?)</tr>', table_html, re.DOTALL | re.IGNORECASE)
			
 
				+        if not rows_html:
			
 
				+             logging.warning("未能在表格 HTML 中找到任何行 <tr>。")
			
 
				              return list(codes)
			
 
				 
			
 
				-        header_cells_text = []
			
 
				-        if rows:
			
 
				-            header_cells = re.findall(r'<td.*?>(.*?)</td>', rows[0], re.DOTALL | re.IGNORECASE)
			
 
				-            header_cells_text = [re.sub('<[^<]+?>', '', cell).strip() for cell in header_cells] # 清理HTML标签
			
 
				-
			
 
				-        code_col_idx = self._find_code_column_index(header_cells_text)
			
 
				-
			
 
				-        if code_col_idx == -1:
			
 
				-            logging.warning(f"未能在表头 {header_cells_text} 中找到'代码'列。")
			
 
				+        header_row_idx = -1
			
 
				+        code_col_idx = -1
			
 
				+
			
 
				+        # 遍历所有行以查找表头和代码列
			
 
				+        for idx, row_content in enumerate(rows_html):
			
 
				+            cells = re.findall(r'<td.*?>(.*?)</td>', row_content, re.DOTALL | re.IGNORECASE)
			
 
				+            cells_text = [re.sub('<[^<]+?>', '', cell).strip() for cell in cells]
			
 
				+            
			
 
				+            # 尝试在该行中查找代码列
			
 
				+            current_code_col_idx = self._find_code_column_index(cells_text)
			
 
				+            if current_code_col_idx != -1:
			
 
				+                header_row_idx = idx
			
 
				+                code_col_idx = current_code_col_idx
			
 
				+                logging.info(f"在第 {idx+1} 行找到表头，'代码'列索引: {code_col_idx}")
			
 
				+                break # 找到第一个包含代码列的行就认为是表头
			
 
				+
			
 
				+        # 如果未找到表头行或代码列
			
 
				+        if header_row_idx == -1:
			
 
				+            logging.warning("未能在任何行中找到包含'代码'列的表头。")
			
 
				             return list(codes)
			
 
				-        logging.info(f"找到'代码'列索引: {code_col_idx}")
			
 
				-
			
 
				 
			
 
				-        # 从第二行开始（跳过表头）处理数据行
			
 
				-        for row_html in rows[1:]:
			
 
				-            cells = re.findall(r'<td.*?>(.*?)</td>', row_html, re.DOTALL | re.IGNORECASE)
			
 
				+        # 从表头的下一行开始处理数据行
			
 
				+        for row_html_content in rows_html[header_row_idx + 1:]:
			
 
				+            cells = re.findall(r'<td.*?>(.*?)</td>', row_html_content, re.DOTALL | re.IGNORECASE)
			
 
				             if len(cells) > code_col_idx:
			
 
				                 # 提取目标列单元格内容并清理HTML标签
			
 
				                 cell_content_raw = cells[code_col_idx]
			
 
				                 cell_content = re.sub('<[^<]+?>', '', cell_content_raw).strip()
			
 
				-                 # 清理常见的OCR错误，例如 'B' -> '8', 'O'/'o' -> '0'
			
 
				+                # 清理常见的OCR错误，例如 'B' -> '8', 'O'/'o' -> '0'
			
 
				                 cell_content = cell_content.replace('B', '8').replace('O', '0').replace('o', '0')
			
 
				                 # 提取所有连续的6位数字作为潜在代码
			
 
				                 potential_codes = re.findall(r'\b(\d{6})\b', cell_content)
			
 
				                 for code in potential_codes:
			
 
				                      # 验证是否是常见的A股代码开头
			
 
				-                     if code.startswith(('0', '3', '6', '8')): # 增加科创板'8'开头
			
 
				+                     if code.startswith(('0', '1', '3', '5', '6', '8')): # '8' 开头为北交所（科创板 688 已由 '6' 覆盖），'1' 开头为转债，'5' 为ETF
			
 
				                          codes.add(code)
			
 
				                      else:
			
 
				                          logging.debug(f"忽略无效代码格式: {code} in cell '{cell_content}'")
			
 
				-
			
 
				             else:
			
 
				-                logging.warning(f"行数据单元格数量 ({len(cells)}) 少于代码列索引 ({code_col_idx})，跳过此行: {row_html}")
			
 
				+                # 如果行单元格数不足，记录警告，但继续处理下一行，以防表格结构不规则
			
 
				+                logging.warning(f"行数据单元格数量 ({len(cells)}) 少于代码列索引 ({code_col_idx})，尝试跳过此行。 行内容: {row_html_content[:100]}...")
			
 
				 
			
 
				         return list(codes)
			
 
				 
			
 
				 
			
 
				     def extract_holdings(self):
			
 
				         """
			
 
				-        解析 PDF 文件（特别是最后一页），提取持仓表格中的股票代码。
			
 
				-        使用 pdf2image 和 PaddleOCR。
			
 
				+        解析 PDF 文件，从最后一个包含股票代码的表格里提取持仓代码。
			
 
				+        使用 pdf2image 转换页面，PPStructure 进行表格识别。
			
 
				 
			
 
				         Returns:
			
 
				             list: 包含股票代码的列表。如果解析失败或找不到表格，返回空列表。
			
 
				         """
			
 
				-        if not ocr:
			
 
				-            logging.error("PaddleOCR 未初始化，无法提取持仓。")
			
 
				+        if not table_engine:
			
 
				+            logging.error("PPStructure 未初始化，无法提取表格持仓。")
			
 
				             return []
			
 
				         if not self.pdf_path or not os.path.exists(self.pdf_path):
			
 
				              logging.error(f"PDF 文件路径无效或文件不存在: {self.pdf_path}")
			
 
				              return []
			
 
				 
			
 
				         logging.info(f"开始处理文件: {self.pdf_path}")
			
 
				-        extracted_codes = []
			
 
				 
			
 
				         try:
			
 
				-            # 1. PDF 转图片 (尝试只转最后一页以提高效率)
			
 
				+            # 1. PDF 转图片 (处理后3页以提高找到表格的可能性)
			
 
				             # 注意：poppler路径可能需要根据系统配置调整
			
 
				             # 在Windows上，可能需要指定 poppler_path='C:/path/to/poppler/bin'
			
 
				             try:
			
 
				-                 images = convert_from_path(self.pdf_path, first_page=None, last_page=1, dpi=300) # 先尝试只转第一页，如果需要可以改回最后一页
			
 
				-                 if not images:
			
 
				-                      images = convert_from_path(self.pdf_path, dpi=300) # 如果第一页失败，尝试转所有页
			
 
				-                 # target_image = images[-1] # 假设表格在最后一页
			
 
				-                 target_image = images[0] # 改为处理第一页，根据实际pdf调整
			
 
				-                 logging.info(f"PDF 页面成功转换为图片。")
			
 
				+                # 使用 pdfinfo_from_path 高效获取总页数
			
 
				+                info = pdfinfo_from_path(self.pdf_path)
			
 
				+                total_pages = info["Pages"]
			
 
				+                # total_pages = len(convert_from_path(self.pdf_path, first_page=1, last_page=1, dpi=72))  # 低分辨率只获取页数
			
 
				+                logging.info(f"PDF总页数: {total_pages}")
			
 
				+                
			
 
				+                # 处理最后3页或全部页面(如果总页数少于3)
			
 
				+                start_page = max(1, total_pages - 2)  # 确保起始页至少是1
			
 
				+                logging.info(f"开始处理PDF从第{start_page}页到第{total_pages}页")
			
 
				+                
			
 
				+                images = convert_from_path(self.pdf_path, first_page=start_page, last_page=total_pages, dpi=300)
			
 
				+                if not images:
			
 
				+                    logging.warning("无法转换后几页，尝试转换全部页面")
			
 
				+                    images = convert_from_path(self.pdf_path, dpi=300)
			
 
				+                
			
 
				+                logging.info(f"成功转换了 {len(images)} 页PDF为图片")
			
 
				             except Exception as convert_err:
			
 
				-                 logging.error(f"使用 pdf2image 转换 PDF 失败: {convert_err}")
			
 
				-                 # 尝试直接用 PaddleOCR 处理 PDF 路径 (如果支持) - 通常不直接支持PDF
			
 
				-                 # result = ocr.ocr(self.pdf_path, cls=True) # 这行可能无效，PaddleOCR主要处理图片
			
 
				-                 return [] # 转换失败则无法继续
			
 
				-
			
 
				-            # 将 PIL Image 转换为 numpy array (PaddleOCR 需要)
			
 
				-            img_np = np.array(target_image)
			
 
				-
			
 
				-            # 2. 使用 PaddleOCR 进行识别 (包括表格)
			
 
				-            logging.info("开始使用 PaddleOCR 进行 OCR 和表格识别...")
			
 
				-            # result 是一个列表，每个元素对应一个检测到的文本框或表格
			
 
				-            # 对于表格，结构信息在 result[i][1][1] 中（通常是html格式）
			
 
				-            result = ocr.ocr(img_np, cls=True) # 尝试不开启表格识别，看文本识别效果
			
 
				-            logging.info(f"PaddleOCR 处理完成，获得 {len(result[0]) if result else 0} 个结果块。")
			
 
				-
			
 
				-
			
 
				-            # 3 & 4. 查找表格并提取代码
			
 
				-            found_codes_in_table = False
			
 
				-            if result and result[0]: # 检查结果是否为空
			
 
				-                # 优先查找PaddleOCR直接识别出的表格结构
			
 
				-                table_html_content = None
			
 
				-                for i, block in enumerate(result[0]):
			
 
				-                    # block 结构: [[box], (text, confidence)] or for table [[box], ('html', table_html)]
			
 
				-                    # 检查是否是表格识别结果
			
 
				-                     # 新版PaddleOCR可能直接返回表格html在特定位置
			
 
				-                     # 需要根据实际返回结果调整查找逻辑
			
 
				-                     # 这里假设表格结果的文本部分是'<table>...'的html字符串
			
 
				-
			
 
				-                    # 尝试从文本块中查找可能的表格标记或内容
			
 
				-                    text_content = block[1][0]
			
 
				-                    if isinstance(text_content, str) and '<table' in text_content and '<td' in text_content:
			
 
				-                         logging.info(f"在结果块 {i} 中找到疑似表格HTML。")
			
 
				-                         table_html_content = text_content
			
 
				-                         codes_from_html = self._extract_codes_from_table(table_html_content)
			
 
				-                         if codes_from_html:
			
 
				-                             extracted_codes.extend(codes_from_html)
			
 
				-                             found_codes_in_table = True
			
 
				-                             logging.info(f"从表格HTML中提取到 {len(codes_from_html)} 个代码。")
			
 
				-                             # break # 找到一个表格就处理，假设只有一个目标表格
			
 
				-
			
 
				-
			
 
				-                # 如果没有直接找到表格HTML，尝试从所有识别文本中提取6位数字代码
			
 
				-                if not found_codes_in_table:
			
 
				-                    logging.warning("未直接找到表格结构，尝试从所有文本中提取6位数字代码。")
			
 
				-                    all_text = " ".join([block[1][0] for block in result[0] if isinstance(block[1][0], str)])
			
 
				-                     # 清理常见的OCR错误
			
 
				-                    all_text = all_text.replace('B', '8').replace('O', '0').replace('o', '0')
			
 
				-                    potential_codes = re.findall(r'\b([0368]\d{5})\b', all_text) # 匹配以0,3,6,8开头的6位数字
			
 
				-                    if potential_codes:
			
 
				-                        extracted_codes.extend(list(set(potential_codes))) # 去重后添加
			
 
				-                        logging.info(f"从文本中提取到 {len(set(potential_codes))} 个潜在代码。")
			
 
				-
			
 
				-            if not extracted_codes:
			
 
				-                 logging.warning(f"未能从文件 {os.path.basename(self.pdf_path)} 中提取到任何股票代码。")
			
 
				-
			
 
				+                logging.error(f"使用 pdf2image 转换 PDF 失败: {convert_err}")
			
 
				+                return []  # 转换失败则无法继续
			
 
				+
			
 
				+            # 从后向前处理图片，优先处理最后一页
			
 
				+            for page_idx, page_image in enumerate(reversed(images)):
			
 
				+                page_num = total_pages - page_idx
			
 
				+                logging.info(f"正在处理第 {page_num} 页")
			
 
				+
			
 
				+                # 将 PIL Image 转换为 numpy array (PPStructure/PaddleOCR 需要)
			
 
				+                img_np = np.array(page_image)
			
 
				+
			
 
				+                # 2. 使用 PPStructure 进行表格识别
			
 
				+                logging.info(f"开始对第 {page_num} 页使用 PPStructure 进行表格识别...")
			
 
				+                try:
			
 
				+                    structure_result = table_engine(img_np)
			
 
				+                    logging.info(f"第 {page_num} 页 PPStructure 处理完成，获得 {len(structure_result)} 个结构元素。")
			
 
				+                except Exception as struct_err:
			
 
				+                    logging.error(f"第 {page_num} 页 PPStructure 处理失败: {struct_err}")
			
 
				+                    structure_result = [] # 出错则结果为空
			
 
				+
			
 
				+                if structure_result:
			
 
				+                    page_tables_html = [] # 存储当前页面找到的所有表格HTML
			
 
				+                    # 遍历 PPStructure 返回的所有结构元素
			
 
				+                    for item in structure_result:
			
 
				+                        # 检查元素类型是否为 'table' (忽略大小写)
			
 
				+                        if item.get('type', '').lower() == 'table':
			
 
				+                            logging.info(f"在第 {page_num} 页找到一个 '{item['type']}' 类型的结构")
			
 
				+                            # 检查是否有识别结果 'res' 和 HTML内容 'html'
			
 
				+                            if 'res' in item and isinstance(item['res'], dict) and 'html' in item['res']:
			
 
				+                                table_html = item['res']['html']
			
 
				+
			
 
				+                                # 新增检查
			
 
				+                                if isinstance(table_html, str) and "群内禁止任何形式的广告" in table_html:
			
 
				+                                    logging.info(f"在第 {page_num} 页检测到包含特定广告语的假表格，跳过。")
			
 
				+                                    continue # 跳过这个假表格
			
 
				+
			
 
				+                                # 有些情况下可能缺少外层<table>标签，补上以兼容解析逻辑
			
 
				+                                if isinstance(table_html, str) and not table_html.strip().startswith('<table'):
			
 
				+                                    table_html = f'<table>{table_html}</table>'
			
 
				+                                    logging.debug("为表格HTML添加了外层 <table> 标签。")
			
 
				+                                page_tables_html.append(table_html)
			
 
				+                            else:
			
 
				+                                logging.warning(f"表格结构检测到，但缺少 'res' 或 'res.html' 内容: {item}")
			
 
				+
			
 
				+                    # 如果当前页面通过 PPStructure 找到了表格
			
 
				+                    if page_tables_html:
			
 
				+                        logging.info(f"在第 {page_num} 页通过 PPStructure 找到 {len(page_tables_html)} 个表格结构。尝试提取代码...")
			
 
				+                        # 从最后一个检测到的表格开始尝试提取代码 (符合优先处理页面末尾表格的逻辑)
			
 
				+                        for table_html_content in reversed(page_tables_html):
			
 
				+                            codes_from_html = self._extract_codes_from_table(table_html_content)
			
 
				+                            if codes_from_html:
			
 
				+                                # 成功提取到代码，立即去重、排序并返回
			
 
				+                                final_codes = sorted(list(set(codes_from_html)))
			
 
				+                                logging.info(f"在第 {page_num} 页的表格中成功提取到 {len(final_codes)} 个代码，停止处理。代码: {final_codes if len(final_codes) < 10 else final_codes[:10] + ['...']}")
			
 
				+                                return final_codes
			
 
				+                            # 如果当前表格未提取到代码，继续尝试页面上的下一个表格（顺序向前）
			
 
				+                        # 如果当前页面的所有表格都尝试完仍未找到代码，则记录信息
			
 
				+                        logging.info(f"第 {page_num} 页检测到的表格均未成功提取到有效代码。")
			
 
				+                    else:
			
 
				+                         logging.info(f"第 {page_num} 页未通过 PPStructure 找到明确的表格结构。")
			
 
				+                         # 可选：在这里添加使用基础 OCR 结果进行关键词搜索的备用逻辑
			
 
				+
			
 
				+                # 循环继续处理上一页 (即 page_num 减小的方向)
			
 
				+
			
 
				+            # 将最终的返回和日志移到循环外部
			
 
				+            logging.warning(f"在检查的页面范围内 ({start_page}-{total_pages})，未能从任何表格中提取到有效股票代码: {os.path.basename(self.pdf_path)}")
			
 
				+            return [] # 返回空列表
			
 
				 
			
 
				         except ImportError as ie:
			
 
				              logging.error(f"缺少必要的库: {ie}。请安装 paddleocr, paddlepaddle, pdf2image。")
			
@@ -210,11 +242,6 @@ class PdfProcessor:
 
				             logging.error(f"处理文件 {self.pdf_path} 时发生错误: {e}", exc_info=True) # exc_info=True 打印堆栈信息
			
 
				             return []
			
 
				 
			
 
				-        # 返回去重后的列表
			
 
				-        final_codes = sorted(list(set(extracted_codes)))
			
 
				-        logging.info(f"文件 {os.path.basename(self.pdf_path)} 处理完毕，提取到 {len(final_codes)} 个唯一代码: {final_codes if len(final_codes) < 10 else final_codes[:10] + ['...']}")
			
 
				-        return final_codes
			
 
				-
			
 
				 # # 本地测试 (取消注释以进行测试)
			
 
				 # if __name__ == '__main__':
			
 
				 #      # 确保测试PDF文件存在于指定路径
			
--- a/src/trading_calculator.py
+++ b/src/trading_calculator.py
@@ -1,5 +1,9 @@
 
				 from datetime import timedelta, date
			
 
				 from collections import defaultdict
			
 
				+import logging
			
 
				+
			
 
				+# 配置日志记录
			
 
				+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(filename)s - %(message)s')
			
 
				 
			
 
				 class TradingCalculator:
			
 
				     """
			
@@ -15,106 +19,136 @@ class TradingCalculator:
 
				                                    需要确保列表按 record_date 升序排列。
			
 
				         """
			
 
				         self.pdf_files_info = pdf_files_info
			
 
				-        self.holdings_by_date = {info['record_date']: set(info['codes']) for info in pdf_files_info}
			
 
				-        self.record_dates = sorted(self.holdings_by_date.keys())
			
 
				-
			
 
				-    def calculate_trading_dates(self):
			
 
				-        """
			
 
				-        根据记录日期计算实际交易日期列表。
			
 
				-        交易日期是上一个记录日期的日期。
			
 
				-        需要处理非交易日（周末和节假日），这里暂时只考虑记录日期本身。
			
 
				-        注意：prd 中提到交易日是上一个记录日，这里需要精确实现。
			
 
				-
			
 
				-        Returns:
			
 
				-            list: 包含所有有效交易日期的列表 (datetime.date对象)，按升序排列。
			
 
				-        """
			
 
				-        if not self.record_dates:
			
 
				-            return []
			
 
				+        # 记录日期到持仓代码集合的映射
			
 
				+        self.holdings_by_record_date = {info['record_date']: set(info['codes']) for info in pdf_files_info}
			
 
				+        # 原始记录日期列表，已排序
			
 
				+        self.record_dates = sorted(self.holdings_by_record_date.keys())
			
 
				+        print(f"初始化 TradingCalculator，共有 {len(self.record_dates)} 个记录日期。")
			
 
				 
			
 
				-        # 交易日是上一个记录日，所以第一个记录日没有对应的交易日
			
 
				-        # 交易日 = 记录日i-1 (日期对象)
			
 
				-        # 持仓 = 记录日i 的持仓
			
 
				-        # 因此，我们将 holdings_by_date 的键调整为对应的交易日
			
 
				-        self.holdings_by_trading_date = {}
			
 
				-        trading_dates = []
			
 
				-        for i in range(1, len(self.record_dates)):
			
 
				-            trading_date = self.record_dates[i-1] # 上一个记录日是当前持仓对应的交易日
			
 
				-            current_record_date = self.record_dates[i]
			
 
				-            self.holdings_by_trading_date[trading_date] = self.holdings_by_date[current_record_date]
			
 
				-            trading_dates.append(trading_date)
			
 
				-
			
 
				-        # 还需要一个方法去获取真实的交易日历，以排除周末和节假日，这里暂时省略
			
 
				-        # simplified_trading_dates = [d for d in trading_dates if d.weekday() < 5] # 简单排除周末
			
 
				-        print(f"计算得到的潜在交易日期数量: {len(trading_dates)}")
			
 
				-        # print(f"（未过滤节假日）过滤周末后的交易日期数量: {len(simplified_trading_dates)}")
			
 
				-
			
 
				-        # 注意：prd 第6点提到中断要和日期列表对比，暗示需要一个完整的交易日历
			
 
				-        # 目前只使用从文件名解析出的日期作为潜在交易日
			
 
				-        self.valid_trading_dates_set = set(trading_dates) # 存储所有有效的交易日，用于后续判断中断
			
 
				-        return trading_dates
			
 
				-
			
 
				-    def determine_buy_sell_dates(self, trading_dates):
			
 
				+    def determine_buy_sell_dates(self):
			
 
				         """
			
 
				-        根据每日持仓和交易日历，计算每个股票的买入卖出日期。
			
 
				-
			
 
				-        Args:
			
 
				-            trading_dates (list): 有效的交易日期列表 (datetime.date 对象)，按升序排列。
			
 
				+        根据记录日期和每日持仓，计算每个股票的买入卖出日期。
			
 
				 
			
 
				         Returns:
			
 
				             list: 包含交易记录的列表。
			
 
				-                  格式: [{'code': str, 'buy_date': date, 'sell_date': date}, ...]
			
 
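				+                  Format: [{'code': str, 'buy_date': date, 'sell_date': date}, ...]
			
 
				+                  Only segments whose buy and sell dates are both known are returned; a date outside the record range is None internally and that segment is filtered out.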
			
 
				         """
			
 
				-        if not trading_dates:
			
 
				+        if not self.record_dates:
			
 
				+            logging.warning("没有记录日期，无法计算买卖点。")
			
 
				             return []
			
 
				 
			
 
				-        # 按股票代码收集其出现的所有交易日期
			
 
				-        stock_holding_dates = defaultdict(list)
			
 
				-        for trade_date in trading_dates:
			
 
				-            holdings = self.holdings_by_trading_date.get(trade_date, set())
			
 
				+        # 按股票代码收集其出现的所有记录日期
			
 
				+        stock_record_dates = defaultdict(list)
			
 
				+        for record_date in self.record_dates:
			
 
				+            holdings = self.holdings_by_record_date.get(record_date, set())
			
 
				             for code in holdings:
			
 
				-                stock_holding_dates[code].append(trade_date)
			
 
				+                stock_record_dates[code].append(record_date)
			
 
				 
			
 
				         transactions = []
			
 
				-        for code, dates in stock_holding_dates.items():
			
 
				+        # record_dates 已经是排序好的完整日期列表
			
 
				+        all_available_dates = self.record_dates
			
 
				+
			
 
				+        first_overall_record_date = all_available_dates[0]
			
 
				+        last_overall_record_date = all_available_dates[-1]
			
 
				+
			
 
				+        for code, dates in stock_record_dates.items():
			
 
				             if not dates:
			
 
				                 continue
			
 
				 
			
 
				-            dates.sort() # 确保日期有序
			
 
				-            start_date = dates[0]
			
 
				-            last_date = dates[0]
			
 
				+            # dates 已经是 stock_record_dates 中按 key (record_date) 排序后append的，所以是排序好的
			
 
				+            # dates.sort() # 确保日期有序，但这里应该已经是排序好的
			
 
				+
			
 
				+            # 识别连续的持有日期段（基于记录日期）
			
 
				+            segments = []
			
 
				+            current_segment = [dates[0]]
			
 
				 
			
 
				             for i in range(1, len(dates)):
			
 
				                 current_date = dates[i]
			
 
				-                # 检查日期是否连续 (这里的连续性需要基于真实的交易日历判断)
			
 
				-                # 简化判断：如果两个日期之间的天数 > 1 天，并且中间的日期不在我们已知的交易日集合中，则不算中断
			
 
				-                # 更严格的判断需要完整的交易日历
			
 
				-                days_diff = (current_date - last_date).days
			
 
				-                is_interrupted = True
			
 
				-                if days_diff > 1:
			
 
				-                    # 检查中间的日期是否都是非交易日（基于我们现有的交易日列表）
			
 
				-                    potential_missing_trading_days = 0
			
 
				-                    temp_date = last_date + timedelta(days=1)
			
 
				-                    while temp_date < current_date:
			
 
				-                        if temp_date in self.valid_trading_dates_set:
			
 
				-                           potential_missing_trading_days += 1
			
 
				-                           break # 只要找到一个本应存在的交易日，就说明是中断
			
 
				-                        temp_date += timedelta(days=1)
			
 
				-
			
 
				-                    if potential_missing_trading_days == 0:
			
 
				-                         is_interrupted = False # 中间没有已知的交易日，不算中断
			
 
				-                elif days_diff == 1:
			
 
				-                    is_interrupted = False # 日期连续，不算中断
			
 
				-                # 如果 days_diff <= 0 是不可能的，因为日期已排序
			
 
				-
			
 
				-                if is_interrupted:
			
 
				-                    # 发生中断，记录上一段的交易
			
 
				-                    transactions.append({'code': code, 'buy_date': start_date, 'sell_date': last_date})
			
 
				-                    start_date = current_date # 新的持仓段开始
			
 
				-
			
 
				-                last_date = current_date
			
 
				-
			
 
				-            # 记录最后一段持仓
			
 
				-            transactions.append({'code': code, 'buy_date': start_date, 'sell_date': last_date})
			
 
				-
			
 
				-        print(f"确定了 {len(transactions)} 笔买卖交易。")
			
 
				-        return transactions 
			
 
				+                last_date_in_segment = current_segment[-1]
			
 
				+
			
 
				+                # 检查当前记录日期是否是上一记录日期的下一个日期（在 all_available_dates 列表中）
			
 
				+                try:
			
 
				+                    last_date_index = all_available_dates.index(last_date_in_segment)
			
 
				+                    # 确保索引+1在列表范围内，并且下一个日期就是当前日期
			
 
				+                    if last_date_index + 1 < len(all_available_dates) and all_available_dates[last_date_index + 1] == current_date:
			
 
				+                        # 日期连续
			
 
				+                        current_segment.append(current_date)
			
 
				+                    else:
			
 
				+                        # 日期不连续，当前段结束，开始新段
			
 
				+                        segments.append(current_segment)
			
 
				+                        current_segment = [current_date]
			
 
				+                except ValueError:
			
 
				+                     # This should not happen if dates come from self.record_dates
			
 
				+                    logging.error(f"日期 {last_date_in_segment} 未在 record_dates 中找到，代码 {code}")
			
 
				+                    # Treat as interruption if date is unexpected
			
 
				+                    segments.append(current_segment)
			
 
				+                    current_segment = [current_date]
			
 
				+
			
 
				+
			
 
				+            # 添加最后一个段
			
 
				+            if current_segment:
			
 
				+                segments.append(current_segment)
			
 
				+
			
 
				+            # 根据连续段确定买卖日期
			
 
				+            # 买入日期是持有段开始记录日期的前一个记录日期（作为交易日）
			
 
				+            # Sell date is the segment's last record date itself; if the segment ends on the last overall record date, the sell date is unknown.
			
 
				+            # PRD 提到交易日是上一个记录日，持仓是基于当天记录日。
			
 
				+            # 记录日 i 的持仓对应交易日 i-1
			
 
				+            # 持仓段 [记录日 i, 记录日 j] 对应的交易日段为 [记录日 i-1, 记录日 j-1]
			
 
				+            # 买入发生在 记录日 i-1
			
 
				+            # 卖出发生在 记录日 j
			
 
				+
			
 
				+            for segment in segments:
			
 
				+                segment_start_record_date = segment[0] # 持有段的第一个记录日期
			
 
				+                segment_end_record_date = segment[-1] # 持有段的最后一个记录日期
			
 
				+
			
 
				+                buy_date = None
			
 
				+                sell_date = None
			
 
				+
			
 
				+                # 确定买入日期 (上一个记录日对应的交易日)
			
 
				+                if segment_start_record_date == first_overall_record_date:
			
 
				+                    # 如果持有从第一个记录日开始，买入日期未知
			
 
				+                    logging.info(f"{code} 在第一个记录日 {first_overall_record_date} 持有，买入日期未知 (范围外)。")
			
 
				+                    buy_date = None
			
 
				+                else:
			
 
				+                    # 找到 segment_start_record_date 在 all_available_dates 中的前一个记录日期
			
 
				+                    try:
			
 
				+                        start_date_index = all_available_dates.index(segment_start_record_date)
			
 
				+                        if start_date_index > 0:
			
 
				+                            # 买入日期是持有段开始记录日期的前一个记录日期
			
 
				+                            buy_date = all_available_dates[start_date_index - 1]
			
 
				+                        else:
			
 
				+                            # 这应该被上面的 if 覆盖，作为备用
			
 
				+                            logging.warning(f"{code} 的开始记录日期 {segment_start_record_date} 是记录日列表的第一个日期，买入日期未知。")
			
 
				+                            buy_date = None
			
 
				+                    except ValueError:
			
 
				+                        logging.error(f"开始记录日期 {segment_start_record_date} 未在 record_dates 中找到，代码 {code}")
			
 
				+                        buy_date = "错误" # Should not happen
			
 
				+
			
 
				+                # 确定卖出日期
			
 
				+                if segment_end_record_date == last_overall_record_date:
			
 
				+                    # 如果持有到最后一个记录日结束，卖出日期未知
			
 
				+                    logging.info(f"{code} 在最后一个记录日 {last_overall_record_date} 仍持有，卖出日期未知 (范围外)。")
			
 
				+                    sell_date = None
			
 
				+                else:
			
 
				+                    # 卖出日期是持有段的最后一个记录日期本身 (因为PRD说卖出是当天)
			
 
				+                    sell_date = segment_end_record_date
			
 
				+                    logging.info(f"{code} 在 {sell_date} 卖出。")
			
 
				+
			
 
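				+                # Keep the segment when at least one of buy_date / sell_date is a real date (note the `or` below);
			
 
				+                # it is dropped here only when neither is. Partially-known segments are removed by the final filter before returning.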
			
 
				+                if isinstance(buy_date, date) or isinstance(sell_date, date):
			
 
				+                     transactions.append({'code': code, 'buy_date': buy_date, 'sell_date': sell_date})
			
 
				+                     logging.info(f"{code} 的交易 ({buy_date} -> {sell_date}) 已添加。")
			
 
				+                else:
			
 
				+                    logging.info(f"{code} 的交易 ({buy_date} -> {sell_date}) 因日期范围外而忽略。")
			
 
				+
			
 
				+
			
 
				+        print(f"获得了 {len(transactions)} 笔买卖交易 (含范围外未确定日期的)。")
			
 
				+        # 过滤掉未知日期的交易，只返回有确定买卖日期的交易
			
 
				+        final_transactions = [t for t in transactions if isinstance(t['buy_date'], date) and isinstance(t['sell_date'], date)]
			
 
				+        partial_transactions = [t for t in transactions if not isinstance(t['buy_date'], date) or not isinstance(t['sell_date'], date)]
			
 
				+        print(f"确定了 {len(final_transactions)} 笔有明确买卖日期的交易。")
			
 
				+        print(f"忽略了 {len(partial_transactions)} 笔有范围外未确定日期的交易。")
			
 
				+        return final_transactions