Add document concatenation script and Makefile targets

realmarcin · claude · realmarcin · commit 901f666688c3 · 2025-10-28T21:23:22.000-07:00
Script Features (src/download/concatenate_documents.py): - Concatenates documents from a directory in reproducible alphabetical order - Includes table of contents with all file paths - Adds headers with filename, path, and size for each document - Handles multiple encodings (UTF-8, UTF-8-sig, latin-1) - Supports filtering by file extension - Supports recursive directory search - Customizable separators between documents - Optional headers and summary sections Makefile Targets: - make concat-docs INPUT_DIR=dir OUTPUT_FILE=file Generic concatenation with required parameters Optional: EXTENSIONS=".txt .md" RECURSIVE=true - make concat-extracted Concatenates D4D YAML files from data/extracted_by_column/ Creates one file per project column (AI_READI, CHORUS, CM4AI, VOICE) Output: data/concatenated/{column}_d4d.txt - make concat-downloads Concatenates raw downloads from downloads_by_column/ Creates one file per project column Output: data/concatenated/{column}_raw.txt Documentation: - Added comprehensive "Document Concatenation" section to CLAUDE.md - Documented all command options and use cases - Updated help menu in Makefile Use Cases: - Combine all downloaded dataset documentation for a project - Create single input documents for LLM processing - Merge documentation fragments into complete documents - Aggregate logs or reports from multiple files Tested with data/extracted_by_column/: - AI_READI: 6 files → 18K - CHORUS: 2 files → 4.0K - CM4AI: 4 files → 6.4K - VOICE: 4 files → 19K 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -225,6 +225,51 @@ The D4D agents use the `aurelian` framework:
 - Processes HTML, PDF, JSON, and text documents
 - Can be run via CLI: `aurelian datasheets <URL>` or `aurelian datasheets --ui`
 
+## Document Concatenation
+
+This project includes tools to concatenate multiple documents from a directory into a single document in reproducible order.
+
+### Concatenation Commands
+
+```bash
+# Concatenate documents from a specific directory
+make concat-docs INPUT_DIR=path/to/dir OUTPUT_FILE=path/to/output.txt
+
+# Optional parameters:
+make concat-docs INPUT_DIR=path/to/dir OUTPUT_FILE=output.txt EXTENSIONS=".txt .md" RECURSIVE=true
+
+# Concatenate extracted D4D documents by column (from data/extracted_by_column)
+make concat-extracted
+
+# Concatenate raw downloads by column (from downloads_by_column)
+make concat-downloads
+
+# Direct script usage with more options:
+python src/download/concatenate_documents.py -i input_dir -o output.txt [OPTIONS]
+
+# Script options:
+#   -e, --extensions .txt .md    # Filter by file extensions
+#   -r, --recursive             # Search subdirectories
+#   --no-headers                # Exclude file headers
+#   --no-summary                # Exclude table of contents
+#   -s "separator"              # Custom separator between files
+```
+
+### Features
+
+- **Reproducible ordering**: Files are sorted alphabetically for consistent results
+- **Multiple formats**: Handles text, HTML, YAML, and other text-based formats
+- **File metadata**: Includes headers with filename, path, and size
+- **Table of contents**: Summary section lists all concatenated files
+- **Error handling**: Gracefully handles encoding issues and read errors
+
+### Use Cases
+
+- Combine all downloaded dataset documentation for a project
+- Create single input documents for LLM processing
+- Merge documentation fragments into complete documents
+- Aggregate logs or reports from multiple files
+
 ## Custom Makefile Targets
 
 Beyond standard LinkML targets, this project adds:
@@ -235,6 +280,9 @@ make gen-html             # Generate HTML from D4D YAML files using human_readab
 make full-schema          # Generate data_sheets_schema_all.yaml (merged schema)
 make test-modules         # Validate all individual D4D module schemas
 make lint-modules         # Lint all individual D4D module schemas
+make concat-docs          # Concatenate documents from a directory
+make concat-extracted     # Concatenate extracted D4D documents by column
+make concat-downloads     # Concatenate raw downloads by column
 ```
 
 ## Null/Empty Value Handling
diff --git a/Makefile b/Makefile
@@ -50,6 +50,9 @@ help: status
 	@echo "make test-modules -- validate all D4D module schemas"
 	@echo "make lint -- perfom linting"
 	@echo "make lint-modules -- lint all D4D module schemas"
+	@echo "make concat-docs INPUT_DIR=dir OUTPUT_FILE=file -- concatenate documents from directory"
+	@echo "make concat-extracted -- concatenate extracted D4D documents by column"
+	@echo "make concat-downloads -- concatenate raw downloads by column"
 	@echo "make testdoc -- builds docs and runs local test server"
 	@echo "make deploy -- deploys site"
 	@echo "make update -- updates linkml version"
diff --git a/project.Makefile b/project.Makefile
@@ -15,3 +15,48 @@ gen-minimal-examples:
 # Generate HTML from current D4D YAML files
 gen-html:
 	$(RUN) python src/html/human_readable_renderer.py
+
+# Concatenate documents from a directory into a single output file.
+# Usage: make concat-docs INPUT_DIR=path/to/dir OUTPUT_FILE=path/to/output.txt
+# Optional: EXTENSIONS=".txt .md" (passed to -e as separate words) RECURSIVE=true
+# RECURSIVE left empty or set to false, no or 0 does NOT add -r; any other value does.
+# INPUT_DIR and OUTPUT_FILE are double-quoted for the shell so paths containing spaces work.
+# The first two recipe lines abort with an error when a required variable is empty or blank.
+.PHONY: concat-docs
+concat-docs:
+	$(if $(strip $(INPUT_DIR)),,$(error INPUT_DIR is not defined. Usage: make concat-docs INPUT_DIR=path/to/dir OUTPUT_FILE=path/to/output.txt))
+	$(if $(strip $(OUTPUT_FILE)),,$(error OUTPUT_FILE is not defined. Usage: make concat-docs INPUT_DIR=path/to/dir OUTPUT_FILE=path/to/output.txt))
+	@echo "Concatenating documents from $(INPUT_DIR) to $(OUTPUT_FILE)"
+	$(RUN) python src/download/concatenate_documents.py -i "$(INPUT_DIR)" -o "$(OUTPUT_FILE)" \
+		$(if $(EXTENSIONS),-e $(EXTENSIONS),) \
+		$(if $(filter-out false no 0,$(RECURSIVE)),-r,)
+
+# Concatenate extracted D4D documents by column: data/extracted_by_column/<column>/ -> data/concatenated/<column>_d4d.txt
+# Stops with exit 1 on the first failed column; warns on stderr (still exit 0) when no column directory exists.
+.PHONY: concat-extracted
+concat-extracted:
+	@echo "Concatenating extracted D4D documents by column..."
+	@mkdir -p data/concatenated
+	@found=0; for column_dir in data/extracted_by_column/*/; do \
+		[ -d "$$column_dir" ] || continue; found=1; \
+		column_name=$$(basename "$$column_dir"); \
+		output_file="data/concatenated/$${column_name}_d4d.txt"; \
+		echo "Processing $$column_name..."; \
+		$(RUN) python src/download/concatenate_documents.py -i "$$column_dir" -o "$$output_file" || exit 1; \
+	done; \
+	if [ "$$found" -eq 1 ]; then echo "✅ All columns concatenated to data/concatenated/"; else echo "WARNING: no column directories found in data/extracted_by_column/; nothing was concatenated" >&2; fi
+
+# Concatenate raw downloads by column: downloads_by_column/<column>/ -> data/concatenated/<column>_raw.txt
+# Stops with exit 1 on the first failed column; warns on stderr (still exit 0) when no column directory exists.
+.PHONY: concat-downloads
+concat-downloads:
+	@echo "Concatenating downloaded documents by column..."
+	@mkdir -p data/concatenated
+	@found=0; for column_dir in downloads_by_column/*/; do \
+		[ -d "$$column_dir" ] || continue; found=1; \
+		column_name=$$(basename "$$column_dir"); \
+		output_file="data/concatenated/$${column_name}_raw.txt"; \
+		echo "Processing $$column_name..."; \
+		$(RUN) python src/download/concatenate_documents.py -i "$$column_dir" -o "$$output_file" || exit 1; \
+	done; \
+	if [ "$$found" -eq 1 ]; then echo "✅ All downloads concatenated to data/concatenated/"; else echo "WARNING: no column directories found in downloads_by_column/; nothing was concatenated" >&2; fi
diff --git a/src/download/concatenate_documents.py b/src/download/concatenate_documents.py