1 change: 1 addition & 0 deletions pyproject.toml
@@ -12,6 +12,7 @@ dependencies = [
"webalgebra",
"pydantic>=2.0.0",
"pydantic-settings>=2.0.0",
"lxml>=5.0.0",
]

[build-system]
234 changes: 224 additions & 10 deletions src/scripts/push_arxiv_to_linkeddatahub.py
@@ -5,17 +5,148 @@
warnings.filterwarnings("ignore", category=UserWarning, module="pydantic._internal._generate_schema")

import argparse
from urllib.parse import quote
import urllib.request
import urllib.error
from datetime import datetime
from pydantic_settings import BaseSettings, SettingsConfigDict
from pydantic import Field
from rdflib import Literal, Namespace, Dataset, URIRef, BNode
from rdflib.namespace import RDF, FOAF
import rdflib
from rdflib import Literal, Namespace, Dataset, URIRef, BNode, Graph
from rdflib.namespace import RDF, FOAF, XSD
from tqdm import tqdm
from lxml import etree
from ldh_bots.operations.feeds.arxiv import ArxivFeed
from web_algebra.operations.linkeddatahub.create_item import CreateItem
from web_algebra.operations.linkeddatahub.content.add_xhtml_block import AddXHTMLBlock
from web_algebra.operations.linked_data.post import POST

SCHEMA = Namespace("http://schema.org/")
XHTML_NS = "http://www.w3.org/1999/xhtml"
MATHML_NS = "http://www.w3.org/1998/Math/MathML"


def parse_date_to_arxiv_format(date_str: str) -> str:
"""Parse various date formats and convert to arXiv format (YYYYMMDDHHMM).

Supported input formats:
- YYYY-MM-DD (time defaults to 00:00)
- YYYYMMDD (time defaults to 00:00)
- YYYY-MM-DD HH:MM
- YYYYMMDDHHMM (native arXiv format, returned as-is)

Args:
date_str: Date string in one of the supported formats

Returns:
Date in arXiv format (YYYYMMDDHHMM)

Raises:
ValueError: If date format is not recognized
"""
date_str = date_str.strip()

# Try native arXiv format first (YYYYMMDDHHMM)
if len(date_str) == 12 and date_str.isdigit():
return date_str

# Try YYYYMMDD format
if len(date_str) == 8 and date_str.isdigit():
return date_str + "0000"

# Try YYYY-MM-DD format
try:
dt = datetime.strptime(date_str, "%Y-%m-%d")
return dt.strftime("%Y%m%d0000")
except ValueError:
pass

# Try YYYY-MM-DD HH:MM format
try:
dt = datetime.strptime(date_str, "%Y-%m-%d %H:%M")
return dt.strftime("%Y%m%d%H%M")
except ValueError:
pass

raise ValueError(
f"Invalid date format: '{date_str}'. "
f"Supported formats: YYYY-MM-DD, YYYYMMDD, YYYY-MM-DD HH:MM, or YYYYMMDDHHMM"
)
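

# Example conversions (every accepted format normalizes to YYYYMMDDHHMM):
#   parse_date_to_arxiv_format("2024-01-15")        -> "202401150000"
#   parse_date_to_arxiv_format("20240115")          -> "202401150000"
#   parse_date_to_arxiv_format("2024-01-15 13:45")  -> "202401151345"
#   parse_date_to_arxiv_format("202401151345")      -> "202401151345"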


def fetch_arxiv_html(arxiv_id: str) -> str | None:
"""Fetch HTML version of ArXiv paper.

Args:
arxiv_id: ArXiv ID (e.g., "2510.12134v1")

Returns:
HTML content as string, or None if not available
"""
url = f"https://arxiv.org/html/{arxiv_id}"
try:
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as response:
return response.read().decode('utf-8')
except urllib.error.HTTPError as e:
if e.code == 404:
return None
raise
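

# Example: fetch_arxiv_html("2510.12134v1") requests https://arxiv.org/html/2510.12134v1 and
# returns None if arXiv has no HTML rendering for that version (HTTP 404); other HTTP errors re-raise.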


def extract_sections(html_content: str) -> list[tuple[str, etree._Element, str]]:
"""Extract sections from ArXiv HTML.

Args:
html_content: Full HTML document as string

Returns:
List of tuples (section_id, section_element, section_title)
"""
root = etree.fromstring(html_content.encode('utf-8'))
sections = []

# Find all section elements
for section in root.xpath('//section[@id]'):
section_id = section.get('id')

# Get section title from the first heading; LaTeXML emits multi-token class values
# (e.g. "ltx_title ltx_title_section"), so match the class token rather than the exact value
title_elems = section.xpath('.//*[contains(concat(" ", normalize-space(@class), " "), " ltx_title ")]')
section_title = ""
if title_elems:
# Extract text content, removing tags
section_title = ''.join(title_elems[0].itertext()).strip()

sections.append((section_id, section, section_title))

return sections
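

# Example of the markup this expects: arXiv's LaTeXML-generated HTML typically wraps each
# section as <section id="S1" class="ltx_section"> with a heading such as
# <h2 class="ltx_title ltx_title_section">1 Introduction</h2>, so extract_sections() would
# yield ("S1", <section element>, "1 Introduction").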


def wrap_and_serialize_c14n(section: etree._Element) -> str:
"""Wrap section in XHTML div and serialize as canonical XML.

Args:
section: Section element to wrap

Returns:
C14N 2.0 serialized XHTML string (required for RDF XMLLiterals)
"""
from copy import deepcopy
import unicodedata

# Create wrapper div with XHTML as default namespace (not prefixed)
wrapper = etree.Element(f"{{{XHTML_NS}}}div", nsmap={None: XHTML_NS})

# Deep copy the section
section_copy = deepcopy(section)
wrapper.append(section_copy)

# Use C14N 2.0 (required for RDF XMLLiterals)
c14n_xml = etree.canonicalize(wrapper, with_comments=False)

# Normalize Unicode to NFC form (canonical composition)
c14n_xml = unicodedata.normalize('NFC', c14n_xml)

return c14n_xml
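

# Note: C14N 2.0 serializes empty elements as explicit start/end tag pairs (e.g. <br></br>,
# never <br/>), which is why literal normalization is disabled further below before the
# XMLLiteral is POSTed to LinkedDataHub.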


class Settings(BaseSettings):
Expand All @@ -41,8 +172,8 @@ class Settings(BaseSettings):
parser.add_argument(
"--container",
type=str,
default="https://papers.localhost:4443/papers/",
help="LinkedDataHub container URL (default: https://papers.localhost:4443/papers/)"
default="https://papers.localhost:4443/arxiv/",
help="LinkedDataHub container URL (default: https://papers.localhost:4443/arxiv/)"
)
parser.add_argument(
"--cert",
Expand Down Expand Up @@ -76,6 +207,18 @@ class Settings(BaseSettings):
choices=["ascending", "descending"],
help="Sort order (default: descending)"
)
parser.add_argument(
"--from-date",
type=str,
default=None,
help="Filter papers from this date (formats: YYYY-MM-DD, YYYYMMDD, YYYY-MM-DD HH:MM, or YYYYMMDDHHMM)"
)
parser.add_argument(
"--to-date",
type=str,
default=None,
help="Filter papers up to this date (formats: YYYY-MM-DD, YYYYMMDD, YYYY-MM-DD HH:MM, or YYYYMMDDHHMM)"
)

args = parser.parse_args()

Expand All @@ -88,11 +231,27 @@ class Settings(BaseSettings):
# LinkedDataHub container URL
container_url = URIRef(args.container)

# Construct date filter if provided
search_query = args.search_query
if args.from_date or args.to_date:
try:
from_date_str = parse_date_to_arxiv_format(args.from_date) if args.from_date else "190001010000"
to_date_str = parse_date_to_arxiv_format(args.to_date) if args.to_date else "209912312359"

date_filter = f"submittedDate:[{from_date_str} TO {to_date_str}]"
# Combine with existing search query using AND
search_query = f"({args.search_query}) AND {date_filter}"

print(f"Date filter applied: {date_filter}")
except ValueError as e:
print(f"Error parsing date: {e}")
exit(1)
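
# Illustrative example: --search-query 'cat:cs.AI' --from-date 2024-01-01 --to-date 2024-01-31
# yields the effective query:
#   (cat:cs.AI) AND submittedDate:[202401010000 TO 202401310000]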

# Step 1: Fetch ArXiv papers
print(f"Searching ArXiv for: '{args.search_query}'...")
print(f"Searching ArXiv for: '{search_query}'...")
arxiv_feed = ArxivFeed(settings=settings, context={})
papers_dataset = arxiv_feed.execute(
search_query=Literal(args.search_query),
search_query=Literal(search_query),
max_results=Literal(args.max_results),
sort_by=Literal(args.sort_by),
sort_order=Literal(args.sort_order)
Expand All @@ -107,6 +266,7 @@ class Settings(BaseSettings):
# Initialize operations
create_item = CreateItem(settings=settings, context={})
post = POST(settings=settings, context={})
add_xhtml_block = AddXHTMLBlock(settings=settings, context={})

paper_count = 0

Expand All @@ -119,6 +279,12 @@ class Settings(BaseSettings):
print(f"Skipping graph without article")
continue

# Get ArXiv ID for slug
arxiv_id = paper_graph.value(paper_blank_node, URIRef("http://arxiv.org/property/id"))
if not arxiv_id:
print(f"Skipping paper without ArXiv ID")
continue

title = paper_graph.value(paper_blank_node, SCHEMA.headline)
if not title:
title = paper_graph.value(paper_blank_node, SCHEMA.name)
Expand All @@ -128,14 +294,14 @@ class Settings(BaseSettings):

paper_count += 1
print(f"\n[{paper_count}] Processing: {title}")
print(f" ArXiv ID: {arxiv_id}")

# Step 2a: Create item in LinkedDataHub to get the item URL
slug = quote(str(title).lower().replace(" ", "-"), safe="")
# Step 2a: Create item in LinkedDataHub using ArXiv ID as slug
try:
result = create_item.execute(
container_uri=container_url,
title=Literal(str(title)),
slug=Literal(slug)
slug=Literal(str(arxiv_id))
)
item_url = result.bindings[0]["url"]
print(f" Created item at: {item_url}")
Expand Down Expand Up @@ -170,6 +336,54 @@ class Settings(BaseSettings):
status = post_result.bindings[0]["status"]
print(f" POST status: {status}")

# Step 2e: Fetch and add HTML sections if available
print(f" Fetching HTML version for {arxiv_id}...")
html_content = fetch_arxiv_html(str(arxiv_id))

if html_content:
print(f" Extracting sections from HTML...")
sections = extract_sections(html_content)
print(f" Found {len(sections)} sections")

# Add each section as XHTML block
for section_id, section_elem, section_title in sections:
try:
print(f" Adding section {section_id}: {section_title[:50]}...")

# Wrap and serialize as C14N
xhtml_content = wrap_and_serialize_c14n(section_elem)

# Add XHTML block to LinkedDataHub
try:
# Workaround for RDFLib bug: Disable literal normalization to prevent
# minidom.toxml() from converting empty elements like <div></div> to <div/>
# which violates C14N and breaks Apache Jena 4.7.0 XMLLiteral validation
# See: https://github.com/RDFLib/rdflib/blob/main/rdflib/term.py#L1930
rdflib.NORMALIZE_LITERALS = False
xml_literal = Literal(xhtml_content, datatype=RDF.XMLLiteral)

add_xhtml_block.execute(
url=URIRef(item_url),
value=xml_literal,
title=Literal(section_title, datatype=XSD.string) if section_title else None,
fragment=Literal(section_id, datatype=XSD.string)
)

except urllib.error.HTTPError as e:
error_body = e.read().decode('utf-8') if hasattr(e, 'read') else str(e)
print(f" Error adding section {section_id}: HTTP {e.code}")
print(f" Error body: {error_body[:200]}...")
continue
except Exception as e:
print(f" Error adding section {section_id}: {e}")
continue
finally:
# Restore literal normalization even if adding the block fails
rdflib.NORMALIZE_LITERALS = True
except Exception as e:
print(f" Error processing section {section_id}: {e}")
continue
else:
print(f" HTML version not available for {arxiv_id}")

except Exception as e:
print(f" Error posting data: {e}")
continue