完善自动获取url与url合法性校验脚本

hd9568 · hd9568 · commit ea45038515eb · 2025-10-10T07:20:19.000Z
diff --git a/docs/guides/model_convert/convert_from_pytorch/tools/generate_pytorch_api_mapping.py b/docs/guides/model_convert/convert_from_pytorch/tools/generate_pytorch_api_mapping.py
@@ -17,57 +17,165 @@ def get_pytorch_url(torch_api: str) -> str:
         对应API的官方文档URL字符串
 
     Rules:
-    1. Tensor相关API指向tensors.html
-    2. 顶层函数（torch.xxx）指向torch.html
-    3. 模块级函数/常量指向模块名.html（如nn.init.html）
-    4. 类/独立函数指向generated/[name].html
-    5. 类方法指向父类页面#锚点
-    6. 特殊处理torchvision等子库的URL结构
+    1. 优先检查特殊映射
+    2. 优先检查是否有专门的generated页面
+    3. 类方法指向父类页面#锚点
+    4. 模块级函数/常量指向模块名.html
+    5. Tensor相关API指向tensors.html
+    6. 顶层函数（torch.xxx）指向torch.html
+    7. 特殊处理torchvision等子库的URL结构
     """
     base_url = "https://pytorch.org/docs/stable/"
-    api_name = torch_api.replace(r"\_", "_")
+    torch_api = torch_api.replace(r"\_", "_")
+
+    # 特殊映射：手动指定已知问题API的正确URL
+    special_mappings = {
+        "torch.cuda.check_error": "generated/torch.cuda.cudart.html",
+        "torch.cuda.mem_get_info": "generated/torch.cuda.memory.mem_get_info.html",
+        "torch.nn.attention.sdpa_kernel": "generated/torch.nn.attention.sdpa_kernel.html",
+        "torch.torch.int32": "tensors.html#torch.int32",
+        "torch.nn.attention._cur_sdpa_kernel_backends": "nn.attention.html#torch.nn.attention.sdpa_kernel",
+        "torch.cuda.memory_reserved": "generated/torch.cuda.memory.memory_reserved.html",
+        "torch.cuda.memory_allocated": "generated/torch.cuda.memory.memory_allocated.html",
+        "torch.cuda.empty_cache": "generated/torch.cuda.memory.empty_cache.html",
+    }
+
+    # 检查特殊映射
+    if torch_api in special_mappings:
+        return f"{base_url}{special_mappings[torch_api]}"
+
+    # 优先检查是否有专门的generated页面
+    generated_apis = {
+        "torch.pow": "generated/torch.pow.html",
+        "torch.nn.utils.parameters_to_vector": "generated/torch.nn.utils.parameters_to_vector.html",
+        "torch.nn.utils.vector_to_parameters": "generated/torch.nn.utils.vector_to_parameters.html",
+        "torch.nn.Module": "generated/torch.nn.Module.html",
+    }
+
+    if torch_api in generated_apis:
+        return f"{base_url}{generated_apis[torch_api]}"
+
+    # 特殊处理：类方法（如torch.nn.Module.to）
+    if torch_api.startswith("torch.nn.Module."):
+        return f"{base_url}generated/torch.nn.Module.html#{torch_api}"
+
+    if torch_api.startswith("torch.linalg.") or torch_api.startswith(
+        "torch.cuda."
+    ):
+        return f"{base_url}generated/{torch_api}.html#{torch_api}"
 
     # 特殊子库处理（torchvision）
-    if api_name.startswith("torchvision."):
+    if torch_api.startswith("torchvision."):
         vision_base = "https://pytorch.org/vision/stable/"
-        if api_name == "torchvision.models":
+        if torch_api == "torchvision.models":
             return f"{vision_base}models.html"
-        return f"{vision_base}generated/{api_name}.html#{api_name}"
+        return f"{vision_base}generated/{torch_api}.html#{torch_api}"
+
+    # 特殊处理：torch.__version__相关
+    if torch_api.startswith("torch.__version__"):
+        return base_url  # 版本信息通常在首页
+
+    # 特殊处理：torch.distributed.ReduceOp枚举值
+    if torch_api.startswith("torch.distributed.ReduceOp."):
+        return f"{base_url}distributed.html#{torch_api}"
+
+    # 特殊处理：torch.autograd.Function
+    if torch_api == "torch.autograd.Function":
+        return f"{base_url}autograd.html#{torch_api}"
+
+    # 特殊处理：torch.utils.cpp_extension
+    if torch_api.startswith("torch.utils.cpp_extension"):
+        return f"{base_url}cpp_extension.html#{torch_api}"
 
     # 1. 处理Tensor相关API
-    if api_name.startswith("torch.Tensor") or api_name == "torch.Tensor":
-        return f"{base_url}tensors.html#{api_name}"
+    if torch_api.startswith("torch.Tensor") or torch_api == "torch.Tensor":
+        return f"{base_url}tensors.html#{torch_api}"
 
     # 2. 处理顶层函数（无子模块）
-    if len(api_name.split(".")) == 2 and api_name.startswith("torch."):
-        return f"{base_url}torch.html#{api_name}"
+    if len(torch_api.split(".")) == 2 and torch_api.startswith("torch."):
+        # 检查是否有专门的generated页面
+        generated_check = [
+            "torch.pow",
+            "torch.abs",
+            "torch.add",
+            "torch.sub",
+            "torch.mul",
+            "torch.div",
+            "torch.exp",
+            "torch.log",
+            "torch.sin",
+            "torch.cos",
+            "torch.tan",
+            "torch.sigmoid",
+        ]
+
+        if any(torch_api.startswith(prefix) for prefix in generated_check):
+            return f"{base_url}generated/{torch_api}.html"
+        return f"{base_url}torch.html#{torch_api}"
 
     # 分割API路径
-    parts = api_name.split(".")
+    parts = torch_api.split(".")
     module_path = ".".join(parts[:-1])  # 模块路径
     item_name = parts[-1]  # 最后一项名称
 
+    # 特殊处理：torch.functional函数
+    if parts[0] == "torch" and parts[1] == "functional":
+        return f"{base_url}torch.html#{torch_api}"
+
     # 3. 处理模块级函数/常量
     if parts[0] == "torch" and not parts[-1][0].isupper():
         # 特殊模块映射（基于官方文档结构）
         module_map = {
-            "torch.nn.init": "nn.init",
-            "torch.nn.functional": "nn.functional",
-            "torch.cuda.amp": "amp",
-            "torch.distributions": "distributions",
+            "torch.nn.init": "nn.init.html",
+            "torch.nn.functional": "nn.functional.html",
+            "torch.cuda.amp": "amp.html",
+            "torch.distributions": "distributions.html",
+            "torch.nn.utils": "nn.utils.html",
+            "torch.optim": "optim.html",
+            "torch.random": "random.html",
+            "torch.special": "special.html",
+            "torch.distributed": "distributed.html",
+            "torch.utils.data": "data.html",
         }
         module_key = ".".join(parts[:-1])
-        module_slug = module_map.get(
-            module_key, module_key.replace("torch.", "")
-        )
-        return f"{base_url}{module_slug}.html#{api_name}"
+        module_slug = module_map.get(module_key, f"generated/{module_key}.html")
+
+        # 检查是否是应该指向generated目录的API
+        generated_modules = [
+            "torch.nn.utils.parameters_to_vector",
+            "torch.nn.utils.vector_to_parameters",
+        ]
+
+        if torch_api in generated_modules:
+            return f"{base_url}generated/{torch_api}.html"
+
+        return f"{base_url}{module_slug}#{torch_api}"
 
     # 4. 处理类/独立函数
     if parts[-1][0].isupper() or len(parts) == 1:
-        return f"{base_url}generated/{api_name}.html#{api_name}"
+        # 特殊类映射
+        class_map = {
+            "torch.autograd.Function": "autograd.html",
+            "torch.utils.cpp_extension.BuildExtension": "cpp_extension.html",
+            "torch.nn.Module": "generated/torch.nn.Module.html",
+        }
+        if torch_api in class_map:
+            return f"{base_url}{class_map[torch_api]}#{torch_api}"
+        return f"{base_url}generated/{torch_api}.html#{torch_api}"
 
     # 5. 默认处理（类方法）
-    return f"{base_url}generated/{module_path}.html#{api_name}"
+    # 特殊处理类方法
+    class_method_map = {
+        "torch.nn.Module": "generated/torch.nn.Module.html",
+        "torch.utils.cpp_extension.BuildExtension": "cpp_extension.html",
+    }
+
+    for class_name, page_name in class_method_map.items():
+        if module_path == class_name:
+            return f"{base_url}{page_name}#{torch_api}"
+
+    # 默认情况下，尝试生成到generated目录
+    return f"{base_url}generated/{module_path}.html#{torch_api}"
 
 
 def escape_underscores_in_api(api_name):
diff --git a/docs/guides/model_convert/convert_from_pytorch/tools/validate_api_difference.py b/docs/guides/model_convert/convert_from_pytorch/tools/validate_api_difference.py
@@ -1,19 +1,41 @@
 import argparse
 import concurrent.futures
 import os
+import random
 import re
+import time
 from collections import defaultdict
 from urllib.parse import urlparse
 
 import requests
+from requests.adapters import HTTPAdapter
 from tqdm import tqdm  # 用于显示进度条
+from urllib3.util.retry import Retry
 
 # 默认文件路径
 DEFAULT_FILE_PATH = "/workspace/paddleDocs/docs/guides/model_convert/convert_from_pytorch/pytorch_api_mapping_cn.md"
 
 # 用户代理头，模拟浏览器访问
 USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
 
+# Retry policy used by create_session(): up to 3 retries with a 0.5 backoff factor, on 429/5xx responses, for HEAD and GET only
+RETRY_STRATEGY = Retry(
+    total=3,
+    backoff_factor=0.5,
+    status_forcelist=[429, 500, 502, 503, 504],
+    allowed_methods=["HEAD", "GET"],
+)
+
+
+def create_session():
+    """Create a requests.Session whose HTTP/HTTPS adapters retry per RETRY_STRATEGY and that sends USER_AGENT."""
+    session = requests.Session()
+    adapter = HTTPAdapter(max_retries=RETRY_STRATEGY)
+    session.mount("http://", adapter)  # same retry adapter for both schemes
+    session.mount("https://", adapter)
+    session.headers.update({"User-Agent": USER_AGENT})  # default header for every request on this session
+    return session
+
 
 def parse_toc(lines):
     """
@@ -324,7 +346,7 @@ def is_valid_url(url):
         return False
 
 
-def check_url_exists(url_info):
+def check_url_exists(url_info, session=None):
     """
     检查URL是否存在（是否返回404）
     返回状态码和错误信息
@@ -340,21 +362,21 @@ def check_url_exists(url_info):
             "url_info": url_info,
         }
 
-    # 设置请求头
-    headers = {"User-Agent": USER_AGENT}
+    # 添加随机延迟，避免请求过于频繁
+    time.sleep(random.uniform(0.5, 1.5))
+
+    # 创建会话（如果未提供）
+    if session is None:
+        session = create_session()
 
     try:
         # 发送HEAD请求（更快，节省带宽）
-        response = requests.head(
-            url, headers=headers, timeout=10, allow_redirects=True
-        )
+        response = session.head(url, timeout=10, allow_redirects=True)
         status_code = response.status_code
 
         # 如果HEAD请求不被支持（405错误），则尝试GET请求
         if status_code == 405:
-            response = requests.get(
-                url, headers=headers, timeout=10, allow_redirects=True
-            )
+            response = session.get(url, timeout=10, allow_redirects=True)
             status_code = response.status_code
 
         # 根据状态码判断URL是否存在
@@ -409,6 +431,9 @@ def check_urls_exist(urls_with_context, max_workers=10):
     返回警告列表
     """
     warnings = []
+
+    urls_with_context = urls_with_context[-700:]
+
     total_urls = len(urls_with_context)
 
     print(
@@ -421,11 +446,16 @@ def check_urls_exist(urls_with_context, max_workers=10):
             max_workers=max_workers
         ) as executor,
     ):
+        # 为每个线程创建一个会话
+        sessions = [create_session() for _ in range(max_workers)]
+
         # 提交所有任务
-        future_to_url = {
-            executor.submit(check_url_exists, url_info): url_info
-            for url_info in urls_with_context
-        }
+        future_to_url = {}
+        for i, url_info in enumerate(urls_with_context):
+            # 分配会话给任务（轮询方式）
+            session = sessions[i % max_workers]
+            future = executor.submit(check_url_exists, url_info, session)
+            future_to_url[future] = url_info
 
         # 处理完成的任务
         for future in concurrent.futures.as_completed(future_to_url):
@@ -445,6 +475,10 @@ def check_urls_exist(urls_with_context, max_workers=10):
                     warning_msg += f"状态码: {result['status_code']}\n"
                 warnings.append(warning_msg)
 
+    # 关闭所有会话
+    for session in sessions:
+        session.close()
+
     print(f"URL检查完成，发现 {len(warnings)} 个问题")
     return warnings
 
@@ -479,7 +513,7 @@ def main():
     # 检查文件是否存在
     if not os.path.exists(md_file_path):
         print(f"错误: 文件 '{md_file_path}' 不存在")
-        print("请使用 --file 参数指定正确的文件路径")
+        print("请使用 --file 参数指定文件路径")
         return
 
     # 读取文件所有行