1 change: 1 addition & 0 deletions pyproject.toml
@@ -12,6 +12,7 @@ dependencies = [
"webalgebra",
"pydantic>=2.0.0",
"pydantic-settings>=2.0.0",
"lxml>=5.0.0",
]

[build-system]
234 changes: 224 additions & 10 deletions src/scripts/push_arxiv_to_linkeddatahub.py
@@ -5,17 +5,148 @@
warnings.filterwarnings("ignore", category=UserWarning, module="pydantic._internal._generate_schema")

import argparse
from urllib.parse import quote
import urllib.request
import urllib.error
from datetime import datetime
from pydantic_settings import BaseSettings, SettingsConfigDict
from pydantic import Field
from rdflib import Literal, Namespace, Dataset, URIRef, BNode
from rdflib.namespace import RDF, FOAF
import rdflib
from rdflib import Literal, Namespace, Dataset, URIRef, BNode, Graph
from rdflib.namespace import RDF, FOAF, XSD
from tqdm import tqdm
from lxml import etree
from ldh_bots.operations.feeds.arxiv import ArxivFeed
from web_algebra.operations.linkeddatahub.create_item import CreateItem
from web_algebra.operations.linkeddatahub.content.add_xhtml_block import AddXHTMLBlock
from web_algebra.operations.linked_data.post import POST

SCHEMA = Namespace("http://schema.org/")
XHTML_NS = "http://www.w3.org/1999/xhtml"
MATHML_NS = "http://www.w3.org/1998/Math/MathML"


def parse_date_to_arxiv_format(date_str: str) -> str:
"""Parse various date formats and convert to arXiv format (YYYYMMDDHHMM).

Supported input formats:
- YYYY-MM-DD (time defaults to 00:00)
- YYYYMMDD (time defaults to 00:00)
- YYYY-MM-DD HH:MM
- YYYYMMDDHHMM (native arXiv format, returned as-is)

Args:
date_str: Date string in one of the supported formats

Returns:
Date in arXiv format (YYYYMMDDHHMM)

Raises:
ValueError: If date format is not recognized
"""
date_str = date_str.strip()

# Try native arXiv format first (YYYYMMDDHHMM)
if len(date_str) == 12 and date_str.isdigit():
return date_str

# Try YYYYMMDD format
if len(date_str) == 8 and date_str.isdigit():
return date_str + "0000"

# Try YYYY-MM-DD format
try:
dt = datetime.strptime(date_str, "%Y-%m-%d")
return dt.strftime("%Y%m%d0000")
except ValueError:
pass

# Try YYYY-MM-DD HH:MM format
try:
dt = datetime.strptime(date_str, "%Y-%m-%d %H:%M")
return dt.strftime("%Y%m%d%H%M")
except ValueError:
pass

raise ValueError(
f"Invalid date format: '{date_str}'. "
f"Supported formats: YYYY-MM-DD, YYYYMMDD, YYYY-MM-DD HH:MM, or YYYYMMDDHHMM"
)
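

# Example conversions (every accepted format normalizes to YYYYMMDDHHMM):
#   parse_date_to_arxiv_format("2024-01-15")        -> "202401150000"
#   parse_date_to_arxiv_format("20240115")          -> "202401150000"
#   parse_date_to_arxiv_format("2024-01-15 13:45")  -> "202401151345"
#   parse_date_to_arxiv_format("202401151345")      -> "202401151345"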


def fetch_arxiv_html(arxiv_id: str) -> str | None:
"""Fetch HTML version of ArXiv paper.

Args:
arxiv_id: ArXiv ID (e.g., "2510.12134v1")

Returns:
HTML content as string, or None if not available
"""
url = f"https://arxiv.org/html/{arxiv_id}"
try:
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as response:
return response.read().decode('utf-8')
except urllib.error.HTTPError as e:
if e.code == 404:
return None
raise
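

# Example: fetch_arxiv_html("2510.12134v1") requests https://arxiv.org/html/2510.12134v1 and
# returns None if arXiv has no HTML rendering for that version (HTTP 404); other HTTP errors re-raise.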


def extract_sections(html_content: str) -> list[tuple[str, etree._Element, str]]:
"""Extract sections from ArXiv HTML.

Args:
html_content: Full HTML document as string

Returns:
List of tuples (section_id, section_element, section_title)
"""
root = etree.fromstring(html_content.encode('utf-8'))
sections = []

# Find all section elements
for section in root.xpath('//section[@id]'):
section_id = section.get('id')

# Get section title from the first heading; LaTeXML emits multi-token class values
# (e.g. "ltx_title ltx_title_section"), so match the class token rather than the exact value
title_elems = section.xpath('.//*[contains(concat(" ", normalize-space(@class), " "), " ltx_title ")]')
section_title = ""
if title_elems:
# Extract text content, removing tags
section_title = ''.join(title_elems[0].itertext()).strip()

sections.append((section_id, section, section_title))

return sections
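

# Example of the markup this expects: arXiv's LaTeXML-generated HTML typically wraps each
# section as <section id="S1" class="ltx_section"> with a heading such as
# <h2 class="ltx_title ltx_title_section">1 Introduction</h2>, so extract_sections() would
# yield ("S1", <section element>, "1 Introduction").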


def wrap_and_serialize_c14n(section: etree._Element) -> str:
"""Wrap section in XHTML div and serialize as canonical XML.

Args:
section: Section element to wrap

Returns:
C14N 2.0 serialized XHTML string (required for RDF XMLLiterals)
"""
from copy import deepcopy
import unicodedata

# Create wrapper div with XHTML as default namespace (not prefixed)
wrapper = etree.Element(f"{{{XHTML_NS}}}div", nsmap={None: XHTML_NS})

# Deep copy the section
section_copy = deepcopy(section)
wrapper.append(section_copy)

# Use C14N 2.0 (required for RDF XMLLiterals)
c14n_xml = etree.canonicalize(wrapper, with_comments=False)

# Normalize Unicode to NFC form (canonical composition)
c14n_xml = unicodedata.normalize('NFC', c14n_xml)

return c14n_xml
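

# Note: C14N 2.0 serializes empty elements as explicit start/end tag pairs (e.g. <br></br>,
# never <br/>), which is why literal normalization is disabled further below before the
# XMLLiteral is POSTed to LinkedDataHub.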


class Settings(BaseSettings):
Expand All @@ -41,8 +172,8 @@ class Settings(BaseSettings):
parser.add_argument(
"--container",
type=str,
default="https://papers.localhost:4443/papers/",
help="LinkedDataHub container URL (default: https://papers.localhost:4443/papers/)"
default="https://papers.localhost:4443/arxiv/",
help="LinkedDataHub container URL (default: https://papers.localhost:4443/arxiv/)"
)
parser.add_argument(
"--cert",
Expand Down Expand Up @@ -76,6 +207,18 @@ class Settings(BaseSettings):
choices=["ascending", "descending"],
help="Sort order (default: descending)"
)
parser.add_argument(
"--from-date",
type=str,
default=None,
help="Filter papers from this date (formats: YYYY-MM-DD, YYYYMMDD, YYYY-MM-DD HH:MM, or YYYYMMDDHHMM)"
)
parser.add_argument(
"--to-date",
type=str,
default=None,
help="Filter papers up to this date (formats: YYYY-MM-DD, YYYYMMDD, YYYY-MM-DD HH:MM, or YYYYMMDDHHMM)"
)

args = parser.parse_args()

Expand All @@ -88,11 +231,27 @@ class Settings(BaseSettings):
# LinkedDataHub container URL
container_url = URIRef(args.container)

# Construct date filter if provided
search_query = args.search_query
if args.from_date or args.to_date:
try:
from_date_str = parse_date_to_arxiv_format(args.from_date) if args.from_date else "190001010000"
to_date_str = parse_date_to_arxiv_format(args.to_date) if args.to_date else "209912312359"

date_filter = f"submittedDate:[{from_date_str} TO {to_date_str}]"
# Combine with existing search query using AND
search_query = f"({args.search_query}) AND {date_filter}"

print(f"Date filter applied: {date_filter}")
except ValueError as e:
print(f"Error parsing date: {e}")
exit(1)
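
# Illustrative example: --search-query 'cat:cs.AI' --from-date 2024-01-01 --to-date 2024-01-31
# yields the effective query:
#   (cat:cs.AI) AND submittedDate:[202401010000 TO 202401310000]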

# Step 1: Fetch ArXiv papers
print(f"Searching ArXiv for: '{args.search_query}'...")
print(f"Searching ArXiv for: '{search_query}'...")
arxiv_feed = ArxivFeed(settings=settings, context={})
papers_dataset = arxiv_feed.execute(
search_query=Literal(args.search_query),
search_query=Literal(search_query),
max_results=Literal(args.max_results),
sort_by=Literal(args.sort_by),
sort_order=Literal(args.sort_order)
Expand All @@ -107,6 +266,7 @@ class Settings(BaseSettings):
# Initialize operations
create_item = CreateItem(settings=settings, context={})
post = POST(settings=settings, context={})
add_xhtml_block = AddXHTMLBlock(settings=settings, context={})

paper_count = 0

Expand All @@ -119,6 +279,12 @@ class Settings(BaseSettings):
print(f"Skipping graph without article")
continue

# Get ArXiv ID for slug
arxiv_id = paper_graph.value(paper_blank_node, URIRef("http://arxiv.org/property/id"))
if not arxiv_id:
print(f"Skipping paper without ArXiv ID")
continue

title = paper_graph.value(paper_blank_node, SCHEMA.headline)
if not title:
title = paper_graph.value(paper_blank_node, SCHEMA.name)
Expand All @@ -128,14 +294,14 @@ class Settings(BaseSettings):

paper_count += 1
print(f"\n[{paper_count}] Processing: {title}")
print(f" ArXiv ID: {arxiv_id}")

# Step 2a: Create item in LinkedDataHub to get the item URL
slug = quote(str(title).lower().replace(" ", "-"), safe="")
# Step 2a: Create item in LinkedDataHub using ArXiv ID as slug
try:
result = create_item.execute(
container_uri=container_url,
title=Literal(str(title)),
slug=Literal(slug)
slug=Literal(str(arxiv_id))
)
item_url = result.bindings[0]["url"]
print(f" Created item at: {item_url}")
Expand Down Expand Up @@ -170,6 +336,54 @@ class Settings(BaseSettings):
status = post_result.bindings[0]["status"]
print(f" POST status: {status}")

# Step 2e: Fetch and add HTML sections if available
print(f" Fetching HTML version for {arxiv_id}...")
html_content = fetch_arxiv_html(str(arxiv_id))

if html_content:
print(f" Extracting sections from HTML...")
sections = extract_sections(html_content)
print(f" Found {len(sections)} sections")

# Add each section as XHTML block
for section_id, section_elem, section_title in sections:
try:
print(f" Adding section {section_id}: {section_title[:50]}...")

# Wrap and serialize as C14N
xhtml_content = wrap_and_serialize_c14n(section_elem)

# Add XHTML block to LinkedDataHub
try:
# Workaround for RDFLib bug: Disable literal normalization to prevent
# minidom.toxml() from converting empty elements like <div></div> to <div/>
# which violates C14N and breaks Apache Jena 4.7.0 XMLLiteral validation
# See: https://github.com/RDFLib/rdflib/blob/main/rdflib/term.py#L1930
rdflib.NORMALIZE_LITERALS = False
xml_literal = Literal(xhtml_content, datatype=RDF.XMLLiteral)

add_xhtml_block.execute(
url=URIRef(item_url),
value=xml_literal,
title=Literal(section_title, datatype=XSD.string) if section_title else None,
fragment=Literal(section_id, datatype=XSD.string)
)

except urllib.error.HTTPError as e:
error_body = e.read().decode('utf-8') if hasattr(e, 'read') else str(e)
print(f" Error adding section {section_id}: HTTP {e.code}")
print(f" Error body: {error_body[:200]}...")
continue
except Exception as e:
print(f" Error adding section {section_id}: {e}")
continue
finally:
# Restore literal normalization even if adding the block fails
rdflib.NORMALIZE_LITERALS = True
except Exception as e:
print(f" Error processing section {section_id}: {e}")
continue
else:
print(f" HTML version not available for {arxiv_id}")

except Exception as e:
print(f" Error posting data: {e}")
continue