
Add keyword scanning

betterMax, 5 months ago
commit ec006ea265
5 changed files with 638 additions and 72 deletions
  1. output/keyword_scan_cache.json (+110 -0)
  2. src/keyword_scanner.py (+260 -0)
  3. src/main.py (+102 -71)
  4. src/pdf_processor.py (+1 -1)
  5. uv.lock (+165 -0)

+ 110 - 0
output/keyword_scan_cache.json

@@ -0,0 +1,110 @@
+{
+    "波段练手实盘": [],
+    "长线投资计划": [],
+    "大资金实盘": [
+        "data/pdfs\\20170327投资早参.pdf",
+        "data/pdfs\\20170328投资早参.pdf",
+        "data/pdfs\\20170329投资早参.pdf",
+        "data/pdfs\\20170330投资早参.pdf",
+        "data/pdfs\\20170331投资早参.pdf",
+        "data/pdfs\\20170405投资早参.pdf",
+        "data/pdfs\\20170406投资早参.pdf",
+        "data/pdfs\\20170407投资早参.pdf",
+        "data/pdfs\\20170410投资早参.pdf",
+        "data/pdfs\\20170418投资早参(淡定是一味药).pdf",
+        "data/pdfs\\20170419投资早参(关于止盈的想法).pdf",
+        "data/pdfs\\20170424投资早参(忠告).pdf",
+        "data/pdfs\\20170426投资早参(股市金刚经一).pdf",
+        "data/pdfs\\20170427投资早参(投资者的三个致命错误).pdf",
+        "data/pdfs\\20170428投资早参(送给迷茫的你).pdf",
+        "data/pdfs\\20170504投资早参(股市心理学).pdf",
+        "data/pdfs\\20170505投资早参(闲言碎语话投资).pdf",
+        "data/pdfs\\20170508投资早参(我们有多像大多数人).pdf",
+        "data/pdfs\\20170509投资早参(我不知道的,以及我知道的).pdf",
+        "data/pdfs\\20170512投资早参(不妙?大妙?).pdf",
+        "data/pdfs\\20170515投资早参(为什么拿不住?).pdf",
+        "data/pdfs\\20170516投资早参(人生的风险排行).pdf",
+        "data/pdfs\\20170516投资早参(既成事实的现象).pdf",
+        "data/pdfs\\20170518投资早参(投资能为你带来什么?).pdf",
+        "data/pdfs\\20170519投资早参(农民与投资).pdf",
+        "data/pdfs\\20170522投资早参(信念与原则).pdf",
+        "data/pdfs\\20170523投资早参(几个常见的心理陷阱).pdf",
+        "data/pdfs\\20170524投资早参(新手必读).pdf",
+        "data/pdfs\\20170525投资早参(面对熊市的层次).pdf",
+        "data/pdfs\\20170526投资早参(五问).pdf",
+        "data/pdfs\\20170601投资早参(持续盈利).pdf",
+        "data/pdfs\\20170602投资早参(一起努力走更远!).pdf",
+        "data/pdfs\\20170605投资早参(消息与未来走势).pdf",
+        "data/pdfs\\20170606投资早参(如果看待市场).pdf",
+        "data/pdfs\\20170607投资早参(被忽略的弊端).pdf",
+        "data/pdfs\\20170608投资早参(取舍之道).pdf",
+        "data/pdfs\\20170612投资早参(读书与写作).pdf",
+        "data/pdfs\\20170613投资早参(耐心跟随).pdf",
+        "data/pdfs\\20170614投资早参(业余与专业的区别).pdf",
+        "data/pdfs\\20170615投资早参(有所敬畏).pdf",
+        "data/pdfs\\20170616投资早参(视野).pdf",
+        "data/pdfs\\20170621投资早参(三种思维习惯).pdf",
+        "data/pdfs\\20170622投资早参(自渡).pdf",
+        "data/pdfs\\20170623投资早参(反人性).pdf",
+        "data/pdfs\\20170626投资早参(大众与少数).pdf",
+        "data/pdfs\\20170627投资早参(夜夜安枕最重要).pdf",
+        "data/pdfs\\20170628投资早参(过程与结果).pdf",
+        "data/pdfs\\20170629投资早参(把握当下).pdf",
+        "data/pdfs\\20170630投资早参(投资的本质永远没变).pdf",
+        "data/pdfs\\20170703投资早参(追求完美).pdf",
+        "data/pdfs\\20170704投资早参(20%定律).pdf",
+        "data/pdfs\\20170705投资早参(道法术).pdf",
+        "data/pdfs\\20170706投资早参(为什么会频繁交易?).pdf",
+        "data/pdfs\\20170707投资早参(简单一点).pdf",
+        "data/pdfs\\20170710投资早参(练心).pdf",
+        "data/pdfs\\20170711投资早参(敢于认错).pdf",
+        "data/pdfs\\20170712投资早参(何时退出?).pdf",
+        "data/pdfs\\20170713投资早参(怎样执行退出?).pdf",
+        "data/pdfs\\20170714投资早参(做到了吗?).pdf",
+        "data/pdfs\\20170717投资早参(交易进化).pdf",
+        "data/pdfs\\20170718投资早参(写给群友的话).pdf",
+        "data/pdfs\\20170719投资早参(交易计划).pdf",
+        "data/pdfs\\20170720投资早参(习惯高于技巧).pdf",
+        "data/pdfs\\20170721投资早参(正确认识亏损).pdf",
+        "data/pdfs\\20170724投资早参(修养重于财富).pdf",
+        "data/pdfs\\20170725投资早参(交易为生).pdf",
+        "data/pdfs\\20170726投资早参(知也不易,行则更难).pdf",
+        "data/pdfs\\20170727投资早参(什么最重要?).pdf",
+        "data/pdfs\\20170728投资早参(敬畏人性).pdf",
+        "data/pdfs\\20170731投资早参(做一个没有观点的投资者).pdf",
+        "data/pdfs\\20170801投资早参(不要羡慕别人).pdf",
+        "data/pdfs\\20170802投资早参(赚谁的钱).pdf",
+        "data/pdfs\\20170803投资早参(交易记录).pdf",
+        "data/pdfs\\20170804投资早参(任何人都有错的时候).pdf",
+        "data/pdfs\\20170807投资早参(独立性).pdf",
+        "data/pdfs\\20170808投资早参(几个小问题).pdf"
+    ],
+    "实盘股票": [
+        "data/pdfs\\20170310实盘计划.pdf",
+        "data/pdfs\\20170314波段投资策略.pdf",
+        "data/pdfs\\20170315投资早参.pdf",
+        "data/pdfs\\20170316投资早参.pdf",
+        "data/pdfs\\20170317投资早参.pdf",
+        "data/pdfs\\20170320投资早参.pdf",
+        "data/pdfs\\20170321投资早参.pdf",
+        "data/pdfs\\20170322投资早参.pdf",
+        "data/pdfs\\20170323投资早参.pdf",
+        "data/pdfs\\20170324投资早参.pdf",
+        "data/pdfs\\20170327投资早参.pdf",
+        "data/pdfs\\20170328投资早参.pdf",
+        "data/pdfs\\20170329投资早参.pdf",
+        "data/pdfs\\20170330投资早参.pdf",
+        "data/pdfs\\20170331投资早参.pdf",
+        "data/pdfs\\20170405投资早参.pdf",
+        "data/pdfs\\20170406投资早参.pdf",
+        "data/pdfs\\20170407投资早参.pdf",
+        "data/pdfs\\20170410投资早参.pdf",
+        "data/pdfs\\20170411投资早参(一些心里话与投资建议).pdf",
+        "data/pdfs\\20170412投资早参(送给群友的口诀).pdf",
+        "data/pdfs\\20170413投资早参.pdf",
+        "data/pdfs\\20170414投资早参.pdf",
+        "data/pdfs\\20170417投资早参(赔钱的心理短板).pdf",
+        "data/pdfs\\20170418投资早参(淡定是一味药).pdf",
+        "data/pdfs\\20170419投资早参(关于止盈的想法).pdf"
+    ]
+}

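Note on the cached paths: the entries mix forward and back slashes (e.g. "data/pdfs\20170327投资早参.pdf") because glob and os.path.join glue the configured 'data/pdfs' prefix to file names with the platform separator, which is '\' on Windows. Since the scanner skips files by comparing these raw strings, a cache written on Windows would not match the same tree scanned on another OS. A minimal normalization sketch using only pathlib (the normalize helper is hypothetical, not part of this commit):

    from pathlib import PureWindowsPath

    def normalize(path_str: str) -> str:
        # PureWindowsPath splits on both '/' and '\' regardless of the
        # host OS; as_posix() re-joins the parts with forward slashes.
        return PureWindowsPath(path_str).as_posix()

    # hypothetical example mirroring the cache entries above
    assert normalize("data/pdfs\\20170327投资早参.pdf") == "data/pdfs/20170327投资早参.pdf"
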
+ 260 - 0
src/keyword_scanner.py

@@ -0,0 +1,260 @@
+import os
+import glob
+import logging
+import json
+from datetime import date
+from pdf2image import convert_from_path, pdfinfo_from_path
+from paddleocr import PaddleOCR
+import numpy as np
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, 
+                    format='%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s', 
+                    force=True)
+
+# Initialize PaddleOCR
+try:
+    ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=False, show_log=False, use_space_char=True)
+    logging.info("PaddleOCR initialized successfully (CPU).")
+except Exception as e:
+    logging.error(f"PaddleOCR initialization failed: {e}")
+    ocr = None
+
+class KeywordScanner:
+    """
+    Scan PDF files and check whether they contain the given keywords.
+    """
+    def __init__(self, pdf_directory, keywords, output_directory='output'):
+        """
+        Initialize the keyword scanner.
+
+        Args:
+            pdf_directory (str): directory containing the PDF files
+            keywords (list): keywords to search for
+            output_directory (str): directory for the result files
+        """
+        self.pdf_directory = pdf_directory
+        self.keywords = keywords
+        self.output_directory = output_directory
+        
+        # Make sure the output directory exists
+        os.makedirs(output_directory, exist_ok=True)
+
+        # Initialize the results dict: one file list per keyword
+        self.results = {keyword: [] for keyword in keywords}
+
+        # Path of the cache file
+        self.cache_file = os.path.join(output_directory, 'keyword_scan_cache.json')
+
+        # Load the cache
+        self.load_cache()
+        
+    def load_cache(self):
+        """Load cached scan results."""
+        if os.path.exists(self.cache_file):
+            try:
+                with open(self.cache_file, 'r', encoding='utf-8') as f:
+                    cache_data = json.load(f)
+                self.results = cache_data
+                logging.info(f"Loaded scan results from cache file {self.cache_file}.")
+
+                # Make sure every keyword has an entry in the results dict
+                for keyword in self.keywords:
+                    if keyword not in self.results:
+                        self.results[keyword] = []
+
+            except (json.JSONDecodeError, IOError) as e:
+                logging.warning(f"Failed to load cache file {self.cache_file}: {e}; results will be reinitialized.")
+                self.results = {keyword: [] for keyword in self.keywords}
+        else:
+            logging.info(f"Cache file {self.cache_file} does not exist; a new results dict will be created.")
+
+    def save_cache(self):
+        """Save the scan results to the cache file."""
+        try:
+            with open(self.cache_file, 'w', encoding='utf-8') as f:
+                json.dump(self.results, f, ensure_ascii=False, indent=4)
+            logging.info(f"Scan results saved to cache file {self.cache_file}.")
+        except IOError as e:
+            logging.error(f"Failed to save cache file {self.cache_file}: {e}")
+            
+    def scan_pdf_for_keywords(self, pdf_path):
+        """
+        Scan a single PDF file and check whether it contains the keywords.
+
+        Args:
+            pdf_path (str): path of the PDF file
+
+        Returns:
+            dict: match status for each keyword
+        """
+        if not ocr:
+            logging.error("PaddleOCR is not initialized; cannot scan for keywords.")
+            return {}
+
+        logging.info(f"Scanning file: {pdf_path}")
+
+        # Per-keyword match flags
+        matches = {keyword: False for keyword in self.keywords}
+
+        try:
+            # Get the total page count of the PDF
+            info = pdfinfo_from_path(pdf_path)
+            total_pages = info["Pages"]
+            logging.info(f"Total PDF pages: {total_pages}")
+
+            # Process the PDF in batches to avoid memory problems
+            batch_size = 5  # pages per batch
+            for start_page in range(1, total_pages + 1, batch_size):
+                end_page = min(start_page + batch_size - 1, total_pages)
+                logging.info(f"Processing pages {start_page} to {end_page}")
+                
+                images = convert_from_path(pdf_path, first_page=start_page, last_page=end_page, dpi=200)
+                
+                # Run OCR and keyword matching on each page image
+                for i, img in enumerate(images):
+                    current_page = start_page + i
+                    logging.info(f"Recognizing page {current_page}")
+
+                    # Convert to a numpy array
+                    img_np = np.array(img)
+
+                    # Run OCR
+                    try:
+                        ocr_result = ocr.ocr(img_np, cls=True)
+
+                        # Extract the text content
+                        page_text = ""
+                        if ocr_result:
+                            for line in ocr_result:
+                                for word_info in line:
+                                    if isinstance(word_info, list) and len(word_info) >= 2:
+                                        if isinstance(word_info[1], tuple) and len(word_info[1]) >= 1:
+                                            page_text += word_info[1][0] + " "
+
+                        # Check for keyword matches
+                        for keyword in self.keywords:
+                            if keyword in page_text and not matches[keyword]:
+                                matches[keyword] = True
+                                logging.info(f"Found keyword on page {current_page}: {keyword}")
+
+                        # If every keyword has been found, stop early
+                        if all(matches.values()):
+                            logging.info("All keywords found; ending the scan early.")
+                            break
+
+                    except Exception as e:
+                        logging.error(f"OCR error on page {current_page}: {e}")
+
+                # If every keyword has been found, end the whole pass early
+                if all(matches.values()):
+                    break
+
+            return matches
+
+        except Exception as e:
+            logging.error(f"Error while processing file {pdf_path}: {e}")
+            return matches
+            
+    def scan_directory(self):
+        """
+        Scan every PDF file in the directory for the keywords.
+        """
+        # Get all PDF files in the directory
+        pdf_files = glob.glob(os.path.join(self.pdf_directory, "*.pdf"))
+        logging.info(f"Found {len(pdf_files)} PDF files in directory {self.pdf_directory}.")
+
+        if not pdf_files:
+            logging.error(f"No PDF files found in directory {self.pdf_directory}.")
+            return
+
+        # Files already recorded in the results (note: only files that matched at least one keyword are recorded)
+        processed_files = set()
+        for files in self.results.values():
+            processed_files.update(files)
+
+        # Work out which files still need processing
+        files_to_process = [f for f in pdf_files if f not in processed_files]
+        logging.info(f"Number of new files to process: {len(files_to_process)}")
+
+        # Process each file
+        for pdf_path in files_to_process:
+            logging.info(f"Processing file: {pdf_path}")
+
+            # Scan the file for the keywords
+            matches = self.scan_pdf_for_keywords(pdf_path)
+
+            # Update the results
+            for keyword, matched in matches.items():
+                if matched and pdf_path not in self.results[keyword]:
+                    self.results[keyword].append(pdf_path)
+
+            # Save the cache after every file
+            self.save_cache()
+
+        logging.info("All files processed.")
+        
+    def generate_report(self):
+        """
+        Generate a report of the keyword-match results.
+
+        Returns:
+            dict: each keyword and its list of matching files
+        """
+        # Write the results to a file
+        report_file = os.path.join(self.output_directory, 'keyword_scan_report.json')
+        try:
+            with open(report_file, 'w', encoding='utf-8') as f:
+                json.dump(self.results, f, ensure_ascii=False, indent=4)
+            logging.info(f"Scan report saved to {report_file}.")
+        except IOError as e:
+            logging.error(f"Failed to save report file {report_file}: {e}")
+
+        # Write a text file listing the matching files for each keyword
+        for keyword, files in self.results.items():
+            if files:  # only for keywords that have matches
+                keyword_file = os.path.join(self.output_directory, f'{keyword}_files.txt')
+                try:
+                    with open(keyword_file, 'w', encoding='utf-8') as f:
+                        for file_path in files:
+                            f.write(f"{file_path}\n")
+                    logging.info(f"List of files matching keyword '{keyword}' saved to {keyword_file}.")
+                except IOError as e:
+                    logging.error(f"Failed to save keyword file {keyword_file}: {e}")
+
+        return self.results
+
+def main():
+    """Main entry point."""
+    # Configuration is defined directly in code
+    pdf_directory = 'data/pdfs'  # directory containing the PDF files
+    output_directory = 'output'  # directory for the results
+
+    # Keywords to search for (kept in Chinese: they are matched against the OCR text)
+    keywords = ['波段练手实盘', '长线投资计划', '大资金实盘', '实盘股票']
+
+    # Initialize the scanner
+    scanner = KeywordScanner(pdf_directory, keywords, output_directory)
+
+    # Scan the directory
+    scanner.scan_directory()
+
+    # Generate the report
+    results = scanner.generate_report()
+
+    # Print a summary
+    print("\nKeyword scan summary:")
+    for keyword, files in results.items():
+        print(f"Keyword '{keyword}' found in {len(files)} files:")
+        for i, file_path in enumerate(files, 1):
+            if i <= 5:  # show only the first 5 files
+                print(f"  {i}. {os.path.basename(file_path)}")
+            else:
+                print(f"  ... and {len(files) - 5} more files")
+                break
+        print()
+
+    print(f"Detailed results saved to the {output_directory} directory.")
+
+if __name__ == "__main__":
+    main() 

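The nested isinstance checks in scan_pdf_for_keywords walk the result shape that PaddleOCR 2.x returns from ocr.ocr(img, cls=True): one list per input image, where each detected line is a pair [bounding_box, (text, confidence)]. A minimal sketch of the same extraction, assuming that 2.x shape (page_text_from_result is a hypothetical helper, not part of the commit):

    def page_text_from_result(ocr_result):
        # ocr_result: one entry per image; each entry is a list of lines,
        # or None when nothing was detected on the page.
        parts = []
        for page in ocr_result or []:
            for line in page or []:
                box, (text, confidence) = line  # box: four corner points
                parts.append(text)
        return " ".join(parts)

The `or []` guards cover both an empty result and PaddleOCR's habit of returning None for blank pages, which is also what the isinstance checks in the committed code defend against.
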
+ 102 - 71
src/main.py

@@ -31,12 +31,20 @@ def main():
         logging.error("Error: pdf_directory or output_directory missing from the config file.")
         return
 
+    # Make sure the output directory exists
+    os.makedirs(output_directory, exist_ok=True)
+
     # --- Cache logic: start ---
     cache_filename = 'pdf_cache.json'
     cache_path = os.path.join(output_directory, cache_filename)
-    cached_pdf_files_info = None # initialized to None; holds the data loaded from the cache
-
-    # Try to load the cache
+    failed_cache_filename = 'failed_pdf_cache.json'
+    failed_cache_path = os.path.join(output_directory, failed_cache_filename)
+
+    # Initialize the cache data
+    cached_pdf_files_info = []
+    failed_pdf_files = []
+
+    # Try to load the cache of successfully processed PDFs
     if os.path.exists(cache_path):
         try:
             with open(cache_path, 'r', encoding='utf-8') as f:
@@ -44,7 +52,6 @@ def main():
             # Validate and convert the dates
             if isinstance(loaded_cache_data, list):
                 validated_cache_data = []
-                valid_cache = True
                 for item in loaded_cache_data:
                     try:
                         # convert the date string back to a date object
@@ -52,8 +59,7 @@
                         validated_cache_data.append(item)
                     except (TypeError, ValueError, KeyError) as e:
                         logging.warning(f"Cache item malformed or date conversion failed: {item}, error: {e}. Ignoring this item and continuing.")
-                        # for strict mode, where any single error forces reprocessing, set valid_cache = False and break here
-                if validated_cache_data: # make sure the list is non-empty and contains valid records
+                if validated_cache_data:
                     cached_pdf_files_info = validated_cache_data
                     logging.info(f"Loaded and converted {len(cached_pdf_files_info)} records from cache file {cache_path}.")
                 else:
@@ -65,6 +71,22 @@ def main():
     else:
         logging.info(f"Cache file {cache_path} does not exist.")
 
+    # Try to load the cache of PDFs that failed processing
+    if os.path.exists(failed_cache_path):
+        try:
+            with open(failed_cache_path, 'r', encoding='utf-8') as f:
+                failed_pdf_files = json.load(f)
+                if isinstance(failed_pdf_files, list):
+                    logging.info(f"Loaded {len(failed_pdf_files)} records from failure cache file {failed_cache_path}.")
+                else:
+                    logging.warning(f"Failure cache file {failed_cache_path} is malformed (not a list); initializing an empty list.")
+                    failed_pdf_files = []
+        except (json.JSONDecodeError, IOError) as e:
+            logging.warning(f"Failed to load failure cache file {failed_cache_path}: {e}; initializing an empty list.")
+            failed_pdf_files = []
+    else:
+        logging.info(f"Failure cache file {failed_cache_path} does not exist; initializing an empty list.")
+
     # 2. Get the list of PDF files in the directory
     pdf_files_in_directory = glob.glob(os.path.join(pdf_directory, '*.pdf'))
     if not pdf_files_in_directory:
@@ -73,82 +95,91 @@ def main():
 
     logging.info(f"Found {len(pdf_files_in_directory)} PDF files in the directory.")
 
-    # Check whether the cache can be used as-is or needs updating
-    should_process_all_pdfs = True # by default process all PDFs (if the cache is missing or invalid)
-    pdf_files_to_process = [] # PDF files that need processing
-    final_pdf_files_info = [] # final data list used for the calculations
-
-    if cached_pdf_files_info is not None:
-        cached_paths = {item['path'] for item in cached_pdf_files_info}
-        # Find PDF files that are in the directory but not in the cache
-        new_pdf_files = [p for p in pdf_files_in_directory if p not in cached_paths]
-
-        if len(cached_pdf_files_info) == len(pdf_files_in_directory) and not new_pdf_files:
-            # Case 1: cache record count matches the PDF file count and there are no new PDFs
-            logging.info("Cache record count matches the directory's PDF file count with no new files; using the cached data directly.")
-            should_process_all_pdfs = False
-            final_pdf_files_info = cached_pdf_files_info # use the cached data directly
-        else:
-            # Case 2: cache record count differs from the PDF file count, or there are new PDFs
-            logging.warning("Cache and directory PDFs are inconsistent; new or differing PDFs will be processed.")
-            should_process_all_pdfs = False # process only the new files, not everything
-            pdf_files_to_process = new_pdf_files # the new files are what needs processing
-            final_pdf_files_info.extend(cached_pdf_files_info) # add the existing cached data to the final list
-
-    if should_process_all_pdfs:
-         # With no valid cache, process every PDF in the directory
-        logging.info("No usable cache; all PDF files in the directory will be processed.")
-        pdf_files_to_process = pdf_files_in_directory # all PDFs need processing
-
-    if pdf_files_to_process:
-        logging.info(f"Starting to process {len(pdf_files_to_process)} PDF files...")
-        processed_new_files_info = []
-        for pdf_path in pdf_files_to_process:
+    # Check the caches to determine which files need processing
+    cached_paths = {item['path'] for item in cached_pdf_files_info}
+    failed_paths = set(failed_pdf_files)
+
+    # Find the new PDF files to process (in neither the success cache nor the failure cache)
+    pdf_files_to_process = [p for p in pdf_files_in_directory 
+                           if p not in cached_paths and p not in failed_paths]
+
+    logging.info(f"Number of new PDF files to process: {len(pdf_files_to_process)}")
+
+    # Process each PDF file and update the cache immediately
+    for pdf_path in pdf_files_to_process:
+        logging.info(f"Processing PDF file: {pdf_path}")
+        try:
             processor = PdfProcessor(pdf_path)
             record_date = processor.record_date
+            
             if record_date:
-                 holdings = processor.extract_holdings()
-                 processed_new_files_info.append({'record_date': record_date, 'codes': holdings, 'path': pdf_path})
+                holdings = processor.extract_holdings()
+                if holdings:  # holdings information was extracted successfully
+                    # Add to the successfully processed list
+                    new_item = {'record_date': record_date, 'codes': holdings, 'path': pdf_path}
+                    cached_pdf_files_info.append(new_item)
+                    
+                    # Update the success cache file immediately
+                    try:
+                        # Sort by record_date
+                        cached_pdf_files_info.sort(key=lambda x: x['record_date'])
+
+                        # Save to the cache
+                        serializable_data = [
+                            {**item, 'record_date': item['record_date'].isoformat()}
+                            for item in cached_pdf_files_info
+                        ]
+                        with open(cache_path, 'w', encoding='utf-8') as f:
+                            json.dump(serializable_data, f, ensure_ascii=False, indent=4)
+                        logging.info(f"Added the results for {pdf_path} to cache file {cache_path}.")
+                    except IOError as e:
+                        logging.error(f"Failed to update cache file {cache_path}: {e}")
+                else:
+                    # No table data extracted; add to the failure list
+                    logging.warning(f"No table data could be extracted from file {pdf_path}.")
+                    if pdf_path not in failed_pdf_files:
+                        failed_pdf_files.append(pdf_path)
+                        # Update the failure cache file
+                        try:
+                            with open(failed_cache_path, 'w', encoding='utf-8') as f:
+                                json.dump(failed_pdf_files, f, ensure_ascii=False, indent=4)
+                            logging.info(f"Added {pdf_path} to failure cache file {failed_cache_path}.")
+                        except IOError as e:
+                            logging.error(f"Failed to update failure cache file {failed_cache_path}: {e}")
             else:
+                # No record date extracted; add to the failure list
                 logging.error(f"Skipping file {pdf_path}: the record date could not be extracted.")
-
-        # Add the newly processed data to the final list
-        final_pdf_files_info.extend(processed_new_files_info)
-
-        # --- Save/update the cache ---
-        if final_pdf_files_info: # save only when the final data is non-empty
-             # Sort by record_date
-            final_pdf_files_info.sort(key=lambda x: x['record_date'])
-            try:
-                # Make sure the output directory exists
-                os.makedirs(output_directory, exist_ok=True)
-                with open(cache_path, 'w', encoding='utf-8') as f:
-                    # convert date objects to ISO strings before saving
-                    serializable_data = [
-                        {**item, 'record_date': item['record_date'].isoformat()}
-                        for item in final_pdf_files_info
-                    ]
-                    json.dump(serializable_data, f, ensure_ascii=False, indent=4)
-                logging.info(f"PDF processing results saved/updated to cache file {cache_path}.")
-            except IOError as e:
-                logging.error(f"Failed to save cache file {cache_path}: {e}")
-    elif not final_pdf_files_info and cached_pdf_files_info is not None:
-        # This case: the cache exists but there are no new or differing files to process, and the cache itself may be empty (despite the check above)
-        # or the cache loaded but validation left no valid records, and there are no new files either...
-        # if final_pdf_files_info is empty, there is no data for the subsequent calculations
-        logging.warning("No new PDF files to process, and no valid data in the cache.")
-        # do not save the cache here, since it holds no valid data
-
-    # Use final_pdf_files_info for the subsequent calculations
-    if not final_pdf_files_info:
+                if pdf_path not in failed_pdf_files:
+                    failed_pdf_files.append(pdf_path)
+                    # Update the failure cache file
+                    try:
+                        with open(failed_cache_path, 'w', encoding='utf-8') as f:
+                            json.dump(failed_pdf_files, f, ensure_ascii=False, indent=4)
+                        logging.info(f"Added {pdf_path} to failure cache file {failed_cache_path}.")
+                    except IOError as e:
+                        logging.error(f"Failed to update failure cache file {failed_cache_path}: {e}")
+        except Exception as e:
+            logging.error(f"Exception while processing file {pdf_path}: {e}")
+            # Add to the failure list
+            if pdf_path not in failed_pdf_files:
+                failed_pdf_files.append(pdf_path)
+                # Update the failure cache file
+                try:
+                    with open(failed_cache_path, 'w', encoding='utf-8') as f:
+                        json.dump(failed_pdf_files, f, ensure_ascii=False, indent=4)
+                    logging.info(f"Added {pdf_path} to failure cache file {failed_cache_path}.")
+                except IOError as e:
+                    logging.error(f"Failed to update failure cache file {failed_cache_path}: {e}")
+
+    # Check whether there is successfully processed data for the subsequent calculations
+    if not cached_pdf_files_info:
         logging.error("No PDF file information was obtained for the calculations.")
         return
 
     # --- Cache logic: end ---
 
     # 4. Calculate trading dates and buy/sell points
-    # calculator should now use final_pdf_files_info
-    calculator = TradingCalculator(final_pdf_files_info)
+    calculator = TradingCalculator(cached_pdf_files_info)
 
     buy_sell_data = calculator.determine_buy_sell_dates()
     if not buy_sell_data:

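For reference, the cache round-trip that main() now performs per file: record_date is a datetime.date in memory, is serialized with isoformat() on every write (as in the hunk above), and is parsed back when the cache is loaded (the parsing call itself sits between the hunks shown; date.fromisoformat is its natural counterpart, though that is an assumption here). A self-contained sketch under those assumptions, with placeholder values:

    import json
    from datetime import date

    # placeholder item mirroring the {'record_date', 'codes', 'path'} layout
    item = {"record_date": date(2017, 3, 27), "codes": [], "path": "data/pdfs/20170327投资早参.pdf"}

    # write side: date -> ISO string, then JSON
    serialized = json.dumps({**item, "record_date": item["record_date"].isoformat()}, ensure_ascii=False)

    # load side: JSON, then ISO string -> date (assumed to use date.fromisoformat)
    loaded = json.loads(serialized)
    loaded["record_date"] = date.fromisoformat(loaded["record_date"])
    assert loaded["record_date"] == item["record_date"]

Sorting cached_pdf_files_info by record_date before every write keeps the cache file chronologically ordered, at the cost of re-serializing the full list once per processed PDF.
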
+ 1 - 1
src/pdf_processor.py

@@ -157,7 +157,7 @@ class PdfProcessor:
                 logging.info(f"Total PDF pages: {total_pages}")
                 
                 # Process the last 3 pages, or all pages (if the total is fewer than 3)
-                start_page = max(1, total_pages - 2)  # make sure the start page is at least 1
+                start_page = max(1, total_pages - 3)  # make sure the start page is at least 1
                 logging.info(f"Processing the PDF from page {start_page} to page {total_pages}")
                 
                 images = convert_from_path(self.pdf_path, first_page=start_page, last_page=total_pages, dpi=300)

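A quick check of the page-window arithmetic behind this one-line change, assuming pdf2image's inclusive first_page/last_page semantics (pages_covered is a hypothetical helper for illustration):

    def pages_covered(total_pages: int, back: int) -> int:
        # mirrors start_page = max(1, total_pages - back) in the code above
        start_page = max(1, total_pages - back)
        return total_pages - start_page + 1

    assert pages_covered(10, 2) == 3  # old code: the last 3 pages
    assert pages_covered(10, 3) == 4  # new code: the last 4 pages
    assert pages_covered(2, 3) == 2   # short documents clamp to page 1

So the change widens the OCR window from the last three pages to the last four; the surrounding "last 3 pages" comment predates the change.
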
+ 165 - 0
uv.lock

@@ -54,6 +54,30 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643 },
 ]
 
+[[package]]
+name = "anyio"
+version = "4.9.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "exceptiongroup", marker = "python_full_version < '3.11'" },
+    { name = "idna" },
+    { name = "sniffio" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/95/7d/4c1bd541d4dffa1b52bd83fb8527089e097a106fc90b467a7313b105f840/anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028", size = 190949 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a1/ee/48ca1a7c89ffec8b6a0c5d02b89c305671d5ffd8d3c94acf8b8c408575bb/anyio-4.9.0-py3-none-any.whl", hash = "sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c", size = 100916 },
+]
+
+[[package]]
+name = "astor"
+version = "0.8.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/5a/21/75b771132fee241dfe601d39ade629548a9626d1d39f333fde31bc46febe/astor-0.8.1.tar.gz", hash = "sha256:6a6effda93f4e1ce9f618779b2dd1d9d84f1e32812c23a29b3fff6fd7f63fa5e", size = 35090 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c3/88/97eef84f48fa04fbd6750e62dcceafba6c63c81b7ac1420856c8dcc0a3f9/astor-0.8.1-py2.py3-none-any.whl", hash = "sha256:070a54e890cefb5b3739d19f30f5a5ec840ffc9c50ffa7d23cc9fc1a38ebbfc5", size = 27488 },
+]
+
 [[package]]
 name = "beautifulsoup4"
 version = "4.13.4"
@@ -143,15 +167,19 @@ version = "0.1.0"
 source = { virtual = "." }
 dependencies = [
     { name = "paddleocr" },
+    { name = "paddlepaddle" },
     { name = "pdf2image" },
     { name = "pyyaml" },
+    { name = "setuptools" },
 ]
 
 [package.metadata]
 requires-dist = [
     { name = "paddleocr", specifier = ">=2.10.0" },
+    { name = "paddlepaddle", specifier = ">=3.0.0" },
     { name = "pdf2image", specifier = ">=1.17.0" },
     { name = "pyyaml", specifier = ">=6.0.2" },
+    { name = "setuptools", specifier = ">=79.0.1" },
 ]
 
 [[package]]
@@ -203,6 +231,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/27/6b/7c87867d255cbce8167ed99fc65635e9395d2af0f0c915428f5b17ec412d/Cython-3.0.12-py2.py3-none-any.whl", hash = "sha256:0038c9bae46c459669390e53a1ec115f8096b2e4647ae007ff1bf4e6dee92806", size = 1171640 },
 ]
 
+[[package]]
+name = "decorator"
+version = "5.2.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190 },
+]
+
+[[package]]
+name = "exceptiongroup"
+version = "1.2.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/09/35/2495c4ac46b980e4ca1f6ad6db102322ef3ad2410b79fdde159a4b0f3b92/exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc", size = 28883 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453 },
+]
+
 [[package]]
 name = "fire"
 version = "0.7.0"
@@ -253,6 +299,43 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/90/27/45f8957c3132917f91aaa56b700bcfc2396be1253f685bd5c68529b6f610/fonttools-4.57.0-py3-none-any.whl", hash = "sha256:3122c604a675513c68bd24c6a8f9091f1c2376d18e8f5fe5a101746c81b3e98f", size = 1093605 },
 ]
 
+[[package]]
+name = "h11"
+version = "0.16.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515 },
+]
+
+[[package]]
+name = "httpcore"
+version = "1.0.9"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "certifi" },
+    { name = "h11" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784 },
+]
+
+[[package]]
+name = "httpx"
+version = "0.28.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "anyio" },
+    { name = "certifi" },
+    { name = "httpcore" },
+    { name = "idna" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517 },
+]
+
 [[package]]
 name = "idna"
 version = "3.10"
@@ -517,6 +600,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/86/8a/69176a64335aed183529207ba8bc3d329c2999d852b4f3818027203f50e6/opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca", size = 39402386 },
 ]
 
+[[package]]
+name = "opt-einsum"
+version = "3.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7d/bf/9257e53a0e7715bc1127e15063e831f076723c6cd60985333a1c18878fb8/opt_einsum-3.3.0.tar.gz", hash = "sha256:59f6475f77bbc37dcf7cd748519c0ec60722e91e63ca114e68821c0c54a46549", size = 73951 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/bc/19/404708a7e54ad2798907210462fd950c3442ea51acc8790f3da48d2bee8b/opt_einsum-3.3.0-py3-none-any.whl", hash = "sha256:2455e59e3947d3c275477df7f5205b30635e266fe6dc300e3d9f9646bfcea147", size = 65486 },
+]
+
 [[package]]
 name = "packaging"
 version = "25.0"
@@ -556,6 +651,44 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f5/b8/b5ee34d6da98b69ae5483f5fb9170a4d83b998ad38462dd31dada007400b/paddleocr-2.10.0-py3-none-any.whl", hash = "sha256:38df818d87a00af854cbfd14e33615edc3c4fa2caeb662149d51e6f2e212013f", size = 2364999 },
 ]
 
+[[package]]
+name = "paddlepaddle"
+version = "3.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "astor" },
+    { name = "decorator" },
+    { name = "httpx" },
+    { name = "networkx" },
+    { name = "numpy" },
+    { name = "opt-einsum" },
+    { name = "pillow" },
+    { name = "protobuf" },
+    { name = "typing-extensions" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cf/60/1a3dc2f48dc5eb82d1258e0a0d18b9e9db0ace73f077c2984c626191af2c/paddlepaddle-3.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:69321d2fed61f9c4b502843d213af0919e058f8399c3fb7da4b90ff9f9e8544d", size = 94754509 },
+    { url = "https://files.pythonhosted.org/packages/eb/c2/21c77d75f47398f8eb847aa85e456b19a1cca9cebb03e9c3cedce1888944/paddlepaddle-3.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7a8a3de6238d7588224353d7412d1869772fa88d0cc0119cb7c921bee9b1869c", size = 96916011 },
+    { url = "https://files.pythonhosted.org/packages/ab/07/24565a312b58fb8e78c8c4214db0cfaee44f6dec161c00756026991633d8/paddlepaddle-3.0.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:73b1349d91a85dae81e23aee0faf1d04f244e26c688c23e387a504437066829a", size = 192817880 },
+    { url = "https://files.pythonhosted.org/packages/10/e6/0e56a48490f0fc7e6f9b671adccb0ae972f73093e2257a3a0bb278174b3c/paddlepaddle-3.0.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:ea8841cdbea7f26dbe548d3b129dd9764f36a4338656c9b5257cac43b382a674", size = 92043392 },
+    { url = "https://files.pythonhosted.org/packages/df/1f/2bd1e792fc3ce1aae11625afe238868b291370048d32939985b09274a9d1/paddlepaddle-3.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:9fe8544bce8d034da5cdab1417782c7d53e1bef97533bcef5710dfdb59e87ca5", size = 97035297 },
+    { url = "https://files.pythonhosted.org/packages/d6/ff/5fe8b6852d02e52c20e5b8f1dce672d38a60c22ecdde4f726f3e1625c7c6/paddlepaddle-3.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f69b4ec5ec881e632bf80d8d2a0ca74f87925335dc57b14bb8971e72e2d87951", size = 94763205 },
+    { url = "https://files.pythonhosted.org/packages/46/24/4e07f557384d3aee30ad924a7e06273b1226876576fd4db5b10193a71427/paddlepaddle-3.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cb4a98192079cde5b5dac1109d334c12b376b97edc8682490b5f5065c8f708d9", size = 96920695 },
+    { url = "https://files.pythonhosted.org/packages/8b/ff/a8685638b8ddd1af3a43f25a2b8f05263a1bec808db990177c38c6228738/paddlepaddle-3.0.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:cfd73da79e2eb9325bac4a3a1a7995063c7662dde724bf452659dda88e55b76f", size = 192824195 },
+    { url = "https://files.pythonhosted.org/packages/7f/1d/7229c5eb164b390ebef4c55c42980781d88c38d59cb8922474e24204a5fe/paddlepaddle-3.0.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:3ef99688f4d5113dcf0e026a0b225efa77975da04e8ca1d0666ebdb729d4a409", size = 92048781 },
+    { url = "https://files.pythonhosted.org/packages/0a/3d/5e0e1440e3ccf1becfd6bd07ec3432e9e9fe797585038c95f00a9da0962a/paddlepaddle-3.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:215daf5c154855e9f4139844076853cabc86425fd127c390405895ae2b820d85", size = 97036460 },
+    { url = "https://files.pythonhosted.org/packages/e2/5e/07c10c1a98fde7c7dcb299dc62c6c06b2951073734a9daa93851ba29711f/paddlepaddle-3.0.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dcfebfb10deb15aa33d2a2d66d5a12d9de65461d8a40d2687e2fb3221150d0ba", size = 94826038 },
+    { url = "https://files.pythonhosted.org/packages/ea/83/f3435988cc4cce3764942188ec5c5f92e858012a335642e6740fcb0c3061/paddlepaddle-3.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8677e8d42ff66f74fbca32ff32d8b64bbe3d56ed29a92d43dc418f0f168fcb69", size = 96946375 },
+    { url = "https://files.pythonhosted.org/packages/01/1d/5dfdda57de184d8406d6757f764a1ecdf26923d78ec7c427cb55b085c4d4/paddlepaddle-3.0.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:81a7c5b10150b07521d1cd1491dca1fb76ff621b14db10e899d959f3a77935ea", size = 192856178 },
+    { url = "https://files.pythonhosted.org/packages/e9/df/4254095dff1e3f87cd8a2be000e39bd1e65118ba3841466b2aeecb480c63/paddlepaddle-3.0.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:ddaa996a8b0a00dbe4269da21d65052877ec3693a22ecc07b9cada3ecd116886", size = 92026407 },
+    { url = "https://files.pythonhosted.org/packages/bd/ca/850a357ea6803eeeef3ec0703f15fbbe30a6dfb0c01671bfe0527cab8662/paddlepaddle-3.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:68efccd9b64233243e48d89a20d34904e2827abc3458c7dddb9e97ef8e2209f6", size = 97056269 },
+    { url = "https://files.pythonhosted.org/packages/c0/21/706cf5aa1ed7e3da092ee64fe1475a0dd7cc6ed0a907de0e87f5b55e39ce/paddlepaddle-3.0.0-cp313-cp313-macosx_10_9_x86_64.whl", hash = "sha256:52c401ffe4aae56316660a33015a6fd755cda23d3d70a28b13b73d70cbdbb8cb", size = 94826114 },
+    { url = "https://files.pythonhosted.org/packages/ce/02/f6a04543770dbc2edf6368789bf025e61bf934e3b20986072882801b2e85/paddlepaddle-3.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9d1051f2e208da1d439daee7c861bcb50240f09712496eb61e9ce8c618faf2f9", size = 96946877 },
+    { url = "https://files.pythonhosted.org/packages/2c/7a/59d9b82f500ab72e16382503cd0d42393c8d35d2a86622926d1c2b508804/paddlepaddle-3.0.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:8fc4865efd3656ee99424124792cb88a642151b4d591a7d6d4a5c6afdae95959", size = 192848478 },
+    { url = "https://files.pythonhosted.org/packages/5b/4a/a80b735e62c1dac0310f0dcdca51a29523cf863834f291866bd8991f270d/paddlepaddle-3.0.0-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:bbc4d573b35d24a262e02d72696e80f5b4aee41f698671af9574deeb1a83e6f7", size = 92027573 },
+    { url = "https://files.pythonhosted.org/packages/87/56/45594a67d2603fc3b3b334017de7ef94eca6cbd21b8f53700e51e61a490e/paddlepaddle-3.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:2e1292d50f80a8577407008f7321f85c8309938c0c575927950aec3c45502c2a", size = 97053969 },
+]
+
 [[package]]
 name = "pdf2image"
 version = "1.17.0"
@@ -645,6 +778,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/21/2c/5e05f58658cf49b6667762cca03d6e7d85cededde2caf2ab37b81f80e574/pillow-11.2.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:208653868d5c9ecc2b327f9b9ef34e0e42a4cdd172c2988fd81d62d2bc9bc044", size = 2674751 },
 ]
 
+[[package]]
+name = "protobuf"
+version = "6.30.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c8/8c/cf2ac658216eebe49eaedf1e06bc06cbf6a143469236294a1171a51357c3/protobuf-6.30.2.tar.gz", hash = "sha256:35c859ae076d8c56054c25b59e5e59638d86545ed6e2b6efac6be0b6ea3ba048", size = 429315 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/be/85/cd53abe6a6cbf2e0029243d6ae5fb4335da2996f6c177bb2ce685068e43d/protobuf-6.30.2-cp310-abi3-win32.whl", hash = "sha256:b12ef7df7b9329886e66404bef5e9ce6a26b54069d7f7436a0853ccdeb91c103", size = 419148 },
+    { url = "https://files.pythonhosted.org/packages/97/e9/7b9f1b259d509aef2b833c29a1f3c39185e2bf21c9c1be1cd11c22cb2149/protobuf-6.30.2-cp310-abi3-win_amd64.whl", hash = "sha256:7653c99774f73fe6b9301b87da52af0e69783a2e371e8b599b3e9cb4da4b12b9", size = 431003 },
+    { url = "https://files.pythonhosted.org/packages/8e/66/7f3b121f59097c93267e7f497f10e52ced7161b38295137a12a266b6c149/protobuf-6.30.2-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:0eb523c550a66a09a0c20f86dd554afbf4d32b02af34ae53d93268c1f73bc65b", size = 417579 },
+    { url = "https://files.pythonhosted.org/packages/d0/89/bbb1bff09600e662ad5b384420ad92de61cab2ed0f12ace1fd081fd4c295/protobuf-6.30.2-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:50f32cc9fd9cb09c783ebc275611b4f19dfdfb68d1ee55d2f0c7fa040df96815", size = 317319 },
+    { url = "https://files.pythonhosted.org/packages/28/50/1925de813499546bc8ab3ae857e3ec84efe7d2f19b34529d0c7c3d02d11d/protobuf-6.30.2-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:4f6c687ae8efae6cf6093389a596548214467778146b7245e886f35e1485315d", size = 316212 },
+    { url = "https://files.pythonhosted.org/packages/e5/a1/93c2acf4ade3c5b557d02d500b06798f4ed2c176fa03e3c34973ca92df7f/protobuf-6.30.2-py3-none-any.whl", hash = "sha256:ae86b030e69a98e08c77beab574cbcb9fff6d031d57209f574a5aea1445f4b51", size = 167062 },
+]
+
 [[package]]
 name = "pyclipper"
 version = "1.3.0.post6"
@@ -1026,6 +1173,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/0a/c8/b3f566db71461cabd4b2d5b39bcc24a7e1c119535c8361f81426be39bb47/scipy-1.15.2-cp313-cp313t-win_amd64.whl", hash = "sha256:fe8a9eb875d430d81755472c5ba75e84acc980e4a8f6204d402849234d3017db", size = 40477705 },
 ]
 
+[[package]]
+name = "setuptools"
+version = "79.0.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/bb/71/b6365e6325b3290e14957b2c3a804a529968c77a049b2ed40c095f749707/setuptools-79.0.1.tar.gz", hash = "sha256:128ce7b8f33c3079fd1b067ecbb4051a66e8526e7b65f6cec075dfc650ddfa88", size = 1367909 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0d/6d/b4752b044bf94cb802d88a888dc7d288baaf77d7910b7dedda74b5ceea0c/setuptools-79.0.1-py3-none-any.whl", hash = "sha256:e147c0549f27767ba362f9da434eab9c5dc0045d5304feb602a0af001089fc51", size = 1256281 },
+]
+
 [[package]]
 name = "shapely"
 version = "2.1.0"
@@ -1153,6 +1309,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/38/b9/941876e98dd1f98c158cd5e6633dc1573d1be6daf8f2e3ad5d15e6a8024d/simsimd-6.2.1-cp313-cp313-win_arm64.whl", hash = "sha256:e690b41377c8dd157d585713b0bc35c845aee7742334bf12d1f087fc8a65b6c3", size = 60408 },
 ]
 
+[[package]]
+name = "sniffio"
+version = "1.3.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 },
+]
+
 [[package]]
 name = "soupsieve"
 version = "2.7"