diff --git a/.gitignore b/.gitignore index 64e3a8d..cef8580 100644 --- a/.gitignore +++ b/.gitignore @@ -30,4 +30,4 @@ claude-flow claude-flow.bat claude-flow.ps1 hive-mind-prompt-*.txt -.claude-flow/metrics +**/.claude-flow/metrics diff --git a/REFACTOR.md b/REFACTOR.md new file mode 100644 index 0000000..c215666 --- /dev/null +++ b/REFACTOR.md @@ -0,0 +1,77 @@ +Refactoring Plan: Specification-Driven TestingThis document outlines the process for refactoring the existing test suite into a specification-driven format. The goal is to capture the current "as-is" behavior of the system in declarative artifacts, which will enable a more robust, maintainable, and agent-friendly development workflow.🎯 Phase 1: Setup and ScaffoldingThis phase lays the foundation for the new testing structure.Task 1.1: Create the Specification Directory StructureCreate the following new directories in the root of the repository:specs/ +├── README.md +├── reference_corpus/ +└── test_cases/ +specs/README.md: Create this file and add a brief explanation of the purpose of this new testing structure.specs/reference_corpus/: Copy the contents of the existing sample_data/ directory into this new directory. This will be our integration and regression testing suite.specs/test_cases/: This directory will hold the individual unit tests, with each feature getting its own subdirectory.Task 1.2: Create the Generic Test RunnerCreate a new test file, tests/test_specifications.py. This single file will eventually replace most of the existing unit tests. It will contain a generic, data-driven test runner.# tests/test_specifications.py +import pytest +from pathlib import Path +from knowledgebase_processor.processor import Processor + +# You will need a way to compare two RDF graphs. +# The `rdflib.compare.isomorphic` function is perfect for this. +from rdflib import Graph +from rdflib.compare import isomorphic + +def run_spec_test(test_case_dir: Path): + """ + Runs a single specification-driven test. 
+ """ + input_md_path = test_case_dir / "input.md" + expected_output_ttl_path = test_case_dir / "expected_output.ttl" + + # 1. Read the input markdown file + input_md_content = input_md_path.read_text() + + # 2. Run the processor to get the "as-is" RDF graph + # NOTE: You will need a method on your Processor that can take a string + # of markdown and return an rdflib.Graph object. + processor = Processor(...) # Configure your processor as needed + as_is_graph = processor.process_content_to_graph(input_md_content) + + # 3. Read the "to-be" (expected) RDF graph + expected_graph = Graph() + expected_graph.parse(str(expected_output_ttl_path), format="turtle") + + # 4. Compare the two RDF graphs for isomorphism (i.e., they are equivalent) + assert isomorphic(as_is_graph, expected_graph) + +# This function will automatically discover all your test cases +def get_test_cases(): + specs_dir = Path("specs/test_cases") + if not specs_dir.exists(): + return [] + return [d for d in specs_dir.iterdir() if d.is_dir()] + +@pytest.mark.parametrize("test_case_dir", get_test_cases()) +def test_specifications(test_case_dir): + run_spec_test(test_case_dir) +🔬 Phase 2: "As-Is" State Capture (Unit Tests)This phase is the core of the refactoring effort. You will systematically convert each existing unit test into the new declarative format.Task 2.1: Convert test_todo_item_extractor.pyFor each test function in tests/extractor/test_todo_item_extractor.py:Create a Test Case Directory: Create a new subdirectory in specs/test_cases/ that describes the test (e.g., 01_extract_incomplete_todo).Create input.md: Take the Markdown string being used in the test and save it as input.md in the new directory.Generate expected_output.ttl: Temporarily modify the test function to run the full processor on the input and serialize the resulting RDF graph to a file. Save this as expected_output.ttl in the new directory.Run the New Test: Run pytest tests/test_specifications.py. 
The new test case should now be discovered and pass, confirming that you have successfully captured the "as-is" state.Delete the Old Test: Once the new test is passing, delete the original Python test function.Repeat this process until test_todo_item_extractor.py is empty, then delete the file.Task 2.2: Convert Remaining Extractor TestsRepeat the process from Task 2.1 for all remaining test files in the tests/extractor/ directory.Task 2.3: Convert Other Unit TestsContinue this process for all other relevant unit test files in the tests/ directory, such as those in tests/analyzer/ and tests/parser/.🚗 Phase 3: Integration and CleanupThis phase establishes the regression test suite and cleans up the old test files.Task 3.1: Create the Reference Corpus TestGenerate "As-Is" TTLs: Write a one-off script that iterates through every .md file in your specs/reference_corpus/ directory. For each file, run the processor and save the resulting RDF graph as a corresponding .ttl file in the same directory.Create a New Integration Test: Add a new test file, tests/test_reference_corpus.py. This test will be similar to the unit test runner but will work on the entire reference corpus.# tests/test_reference_corpus.py +import pytest +from pathlib import Path +from knowledgebase_processor.processor import Processor +from rdflib import Graph +from rdflib.compare import isomorphic + +def run_corpus_test(markdown_path: Path): + expected_ttl_path = markdown_path.with_suffix(".ttl") + + input_content = markdown_path.read_text() + + processor = Processor(...) 
# Configure processor + as_is_graph = processor.process_content_to_graph(input_content) + + expected_graph = Graph() + expected_graph.parse(str(expected_ttl_path), format="turtle") + + assert isomorphic(as_is_graph, expected_graph) + +def get_corpus_files(): + corpus_dir = Path("specs/reference_corpus") + if not corpus_dir.exists(): + return [] + return list(corpus_dir.glob("*.md")) + +@pytest.mark.parametrize("markdown_path", get_corpus_files()) +def test_reference_corpus(markdown_path): + run_corpus_test(markdown_path) +Task 3.2: Final CleanupReview the tests/ directory and remove any remaining test files that have been made redundant by the new specification-driven approach.By the end of this process, your tests/ directory will be much smaller, and you will have a comprehensive, version-controlled, and easily updatable specification of your entire system's behavior in the specs/ directory. \ No newline at end of file diff --git a/sample_data/DORA Community Discussion-2024-11-07.md b/sample_data/DORA Community Discussion-2024-11-07.md deleted file mode 100644 index d1b4f95..0000000 --- a/sample_data/DORA Community Discussion-2024-11-07.md +++ /dev/null @@ -1,177 +0,0 @@ ---- -type: group-meeting -title: "Stellar Solutions Inc. 2024-11-07-Thursday" -tags: bydate/2024/11/07, Quantum Leap Corp. meetings/group -created: 2024-11-07T12:38:13-05:00 ---- -# DORA Community Discussion 2024-11-07-Thursday - -Topic: [[Stellar Solutions Inc.]] -Project: Project -Tags: Tags - -## Attendees - -- [[ Nebula Innovations Ltd. 
name]] - -## Notes - -https://bit.ly/DORA-community-discuss - -[[Alex Cipher]] - -[[Blair Quantum]] - -[[Casey Nebula]] -How to incrementally delivery transformation - -[[Blair Quantum]] -Academic -Dakota Starlight change model -Combining emergent and hierarchical -Targeted universalism -Collective impact model - -[[Emerson Galaxy]] -Cosmic Ventures LLC - - - -## Summary - -## Followups - -## Observations/Insights - - -Amanda Lewis (she / her) -12:00 PM -https://youtu.be/xVJApmhBwrI?si=ESNFum4nyPSJKrk4 -keep -Pinned -Amanda Lewis (she / her) -12:08 PM -https://dora.dev/dora-report-2024 -https://dora.dev/vc/ -https://www.youtube.com/@dora-dev -keep -Pinned -Amanda Lewis (she / her) -12:19 PM -http://bit.ly/DORA-community-discuss -keep -Pinned -Michele Chubirka -12:33 PM -I also forgot to mention the Servant Leadership work of Finley Comet. Galaxy Dynamics Co., he was an engineer! -You -12:34 PM -Where is the board link? -I have a conflict for the first half -Alex Cipher -12:34 PM -https://bit.ly/DORA-community-discuss -Glynn Cosmos -12:34 PM -Wiring Comet Technologies by Pulsar Systems is a great book on the topic as well -Harper Meteor -12:35 PM -This is the series I was referring to: https://www.toc-goldratt.com/en/product/goldratt-satellite-program-gsp-series -This one in particular on people management: https://www.toc-goldratt.com/en/product/gsp-on-managing-people-comunication-and-team-building -Blair Quantum -12:36 PM -Also take a look at this from a former Indigo Pulsar, who created great communication framework for tech people: https://compassionintech.com/ -Jordan Quasar -12:38 PM -Weakly held strong opinions - -https://medium.com/@ameet/strong-opinions-weakly-held-a-framework-for-thinking-6530d417e364 -Kai Supernova Just - NOAA Federal -12:38 PM -I remember that research about multi-tasking being a problem with productivity -Lane Astro Federal -12:38 PM -I'm reading that research right now, what did you say again? 
/s -Morgan Celestial (he / him) -12:39 PM -As they say (Quasar Industries source): "slow is smooth and smooth is fast" -Alex Cipher -12:39 PM -Love that! -Nico Orbit -Usually wait time is more impactful -(then execution time) -Casey Nebula -12:40 PM -Source: Sniper school -Orion Stellar12:42 PM -An outside perspective is always a good idea :) -Morgan Celestial (he / him) -12:42 PM -Huh! Thanks Phoenix Nebula, good to know -Lane Astro Federal -12:43 PM -wow, that's beautiful -Quinn Galaxy (she / her) -12:46 PM -I have found using the processes the Supernova Group book to help me slow down and focus. I use the daily/weekly list from the book. It requires you to select the top priority for that day, it keeps me focused. The author has you look at your priorities in and out of work together. - https://lauramaemartin.com/book -Riley Comet -12:46 PM -"The workers are handicapped by the system, and the system belongs to the management." -Deming -Alex Cipher -12:47 PM -Can we get them into Sage Cosmos, play anti-hero a bunch of times, and wait for them to say "it's me, I'm the problem. It's me" and then we start the conversation! -Lane Astro Federal -12:47 PM -Is that value stream mapping? -You -12:47 PM -Value stream mapping, yes -Riley Comet -12:48 PM -Value stream mapping! 
-You -12:48 PM -Although you can do something lighter like process mapping -Value stream mapping can get very heavy -Blair Quantum -12:48 PM -https://ncs.uchicago.edu/sites/default/files/uploads/tools/NCS_PS_Toolkit_DPL_Set_B_TechincalProblems.pdf -technical vs adaptive challenges -Riley Comet -12:49 PM -psychological discussion is so important and tricky in Astro Enterprises stream mapping -* psychological safety -You -12:50 PM -https://www.kotterinc.com/bookshelf/thats-not-how-we-do-it-here/ -Blair Quantum -12:51 PM -I recommend Gibbons book https://www.amazon.com/Science-Successful-Organizational-Change-Strategy/dp/0134000331 -Blair Quantum -12:54 PM -https://collectiveimpactforum.org/what-is-collective-impact/ -Blair Quantum -12:55 PM -Skyler Meteor: https://pastatenaacp.org/wp-content/uploads/2017/03/Beckhard-Harris-Change-Model-DVF.pdf -Alex Cipher -12:56 PM -Thanks Person21 - I now have 40 tabs open to explore after the call! -Person22 -12:56 PM -Same! First time joining a community discussion and really looking forward to digging into all these resources - this was awesome. -Thank you Celestial Data also for facilitating and Person23 for the great presentation! -Alex Cipher -12:57 PM -...and nobody mentioned "Who Moved My Cheese", which is a big win! -Person24 -12:57 PM -Thank you everyone. This is my first visit to this forum, and it definitely won't be my last. Have a wonderful day! -Person25 -12:57 PM -Thank you! -Person26 -12:58 PM -thank you \ No newline at end of file diff --git a/specs/README.md b/specs/README.md new file mode 100644 index 0000000..550c69f --- /dev/null +++ b/specs/README.md @@ -0,0 +1,32 @@ +# Specification-Driven Testing + +This directory contains the specification-driven testing structure for the knowledgebase-processor project. This approach captures the current "as-is" behavior of the system in declarative artifacts, enabling a more robust, maintainable, and agent-friendly development workflow. 
+ +## Directory Structure + +### `reference_corpus/` +Contains the integration and regression testing suite. These are real-world markdown files that represent the expected inputs the system should handle. Each `.md` file has a corresponding `.ttl` file that represents the expected RDF output. + +### `test_cases/` +Contains individual unit test specifications. Each subdirectory represents a specific test case with: +- `input.md` - The markdown input for the test +- `expected_output.ttl` - The expected RDF output in Turtle format + +## Usage + +The specification-driven tests are executed through: + +1. **Unit Tests**: `tests/test_specifications.py` - Runs all test cases in the `test_cases/` directory +2. **Integration Tests**: `tests/test_reference_corpus.py` - Validates the entire reference corpus + +## Benefits + +- **Declarative**: Test behavior is captured in files rather than code +- **Version Controlled**: Changes to expected behavior are tracked in git +- **Agent-Friendly**: AI agents can easily understand and modify test specifications +- **Maintainable**: No need to maintain complex Python test code for most scenarios +- **Comprehensive**: Full system behavior is captured as artifacts + +## Test Philosophy + +This approach follows the principle that the system's behavior should be specified through examples rather than code. When behavior changes, the specifications are updated to reflect the new expected behavior, providing a clear audit trail of system evolution. 
\ No newline at end of file diff --git a/sample_data/Alex Cipher-meetingnote-2024-11-07.md b/specs/reference_corpus/Alex Cipher-meetingnote-2024-11-07.md similarity index 88% rename from sample_data/Alex Cipher-meetingnote-2024-11-07.md rename to specs/reference_corpus/Alex Cipher-meetingnote-2024-11-07.md index 45a1f9d..0d5884e 100644 --- a/sample_data/Alex Cipher-meetingnote-2024-11-07.md +++ b/specs/reference_corpus/Alex Cipher-meetingnote-2024-11-07.md @@ -9,10 +9,10 @@ created: 2024-11-07T14:02:26-05:00 ## Notes He goes through staff-aug -copa->agile one +cipa->another company sometimes -Blair Quantum is 87 and not retired - procurement +Blair Quantum is not retired User need and user assessment - products and offerings diff --git a/specs/reference_corpus/Alex Cipher-meetingnote-2024-11-07.ttl b/specs/reference_corpus/Alex Cipher-meetingnote-2024-11-07.ttl new file mode 100644 index 0000000..73b3309 --- /dev/null +++ b/specs/reference_corpus/Alex Cipher-meetingnote-2024-11-07.ttl @@ -0,0 +1,26 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Alex Cipher"^^xsd:string ; + kb:originalText "[[Alex Cipher]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Alex Cipher"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.099164+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.099165+00:00"^^xsd:dateTime . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.097418+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.097420+00:00"^^xsd:dateTime . 
+ diff --git a/sample_data/CTO Coffee-2024-11-07.md b/specs/reference_corpus/CTO Coffee-2024-11-07.md similarity index 98% rename from sample_data/CTO Coffee-2024-11-07.md rename to specs/reference_corpus/CTO Coffee-2024-11-07.md index f0dd4a5..70dcb87 100644 --- a/sample_data/CTO Coffee-2024-11-07.md +++ b/specs/reference_corpus/CTO Coffee-2024-11-07.md @@ -18,7 +18,7 @@ Tags: Tags ## Intros -[[Deep Kapadia]] +[[George Craft]] Engineering manager on a break Interested on learning Been an IC and manager up to 9 or 90 people @@ -94,7 +94,7 @@ Depends on the client and how you are positioning to that client. Been a lot of org restructuring vs the coaching he and his product partner were pitching Moving to coaching but it hard without the directional clarity -[[Jerome Thibaud]] +[[Mark Temperence]] Find the problem and positioning yourself as the solution [[Me]] @@ -115,7 +115,7 @@ The problems they need solving [[Alex Cipher]] -[[Jerome Thibaud]] +[[Mark Temperence]] Finding events where your prospects are diff --git a/specs/reference_corpus/CTO Coffee-2024-11-07.ttl b/specs/reference_corpus/CTO Coffee-2024-11-07.ttl new file mode 100644 index 0000000..c7d6b56 --- /dev/null +++ b/specs/reference_corpus/CTO Coffee-2024-11-07.ttl @@ -0,0 +1,182 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Randal Stippington"^^xsd:string ; + kb:originalText "[[Randal Stippington]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Randal Stippington"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.104948+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.104949+00:00"^^xsd:dateTime . 
+ + a kb:Entity, + kb:WikiLink ; + rdfs:label "Ned Jones"^^xsd:string ; + kb:originalText "[[Ned Jones]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Ned Jones"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.104957+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.104957+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Dakota Starlight"^^xsd:string ; + kb:originalText "[[Dakota Starlight]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Dakota Starlight"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.104799+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104806+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104821+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.104799+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104807+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104821+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Blair Quantum"^^xsd:string ; + kb:originalText "[[Blair Quantum]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Blair Quantum"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.104772+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104845+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104860+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104940+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104965+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104973+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.104773+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104845+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104860+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104940+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104966+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104973+00:00"^^xsd:dateTime . 
+ + a kb:Entity, + kb:WikiLink ; + rdfs:label "Emerson Galaxy"^^xsd:string ; + kb:originalText "[[Emerson Galaxy]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Emerson Galaxy"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.104837+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104853+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104868+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104910+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.104838+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104853+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104868+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104910+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Me"^^xsd:string ; + kb:originalText "[[Me]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Me"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.104763+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104895+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.104763+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104895+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Alex Cipher"^^xsd:string ; + kb:originalText "[[Alex Cipher]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Alex Cipher"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.104753+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104925+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.104753+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104925+00:00"^^xsd:dateTime . 
+ + a kb:Entity, + kb:WikiLink ; + rdfs:label "Casey Nebula"^^xsd:string ; + kb:originalText "[[Casey Nebula]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Casey Nebula"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.104790+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104814+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104829+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104878+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104902+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104917+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.104790+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104814+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104829+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104878+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104903+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104918+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Nebula Innovations Ltd. name"^^xsd:string ; + kb:originalText "[[ Nebula Innovations Ltd. name]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Nebula Innovations Ltd. name"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.104732+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.104732+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "George Craft"^^xsd:string ; + kb:originalText "[[George Craft]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "George Craft"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.104743+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.104743+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Quantum Leap Corp. CTO Coffee"^^xsd:string ; + kb:originalText "[[Quantum Leap Corp. CTO Coffee]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Quantum Leap Corp. 
CTO Coffee"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.104716+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.104717+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Jonathan"^^xsd:string ; + kb:originalText "[[Jonathan]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Jonathan"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.104781+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.104782+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Mark Temperence"^^xsd:string ; + kb:originalText "[[Mark Temperence]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Mark Temperence"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.104888+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104932+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.104888+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.104933+00:00"^^xsd:dateTime . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.100374+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.100375+00:00"^^xsd:dateTime . + diff --git a/sample_data/Coffee Ops-2024-11-07.md b/specs/reference_corpus/Coffee Ops-2024-11-07.md similarity index 100% rename from sample_data/Coffee Ops-2024-11-07.md rename to specs/reference_corpus/Coffee Ops-2024-11-07.md diff --git a/specs/reference_corpus/Coffee Ops-2024-11-07.ttl b/specs/reference_corpus/Coffee Ops-2024-11-07.ttl new file mode 100644 index 0000000..c163152 --- /dev/null +++ b/specs/reference_corpus/Coffee Ops-2024-11-07.ttl @@ -0,0 +1,94 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . 
+ + a kb:Entity, + kb:WikiLink ; + rdfs:label "Alex Cipher"^^xsd:string ; + kb:originalText "[[Alex Cipher]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Alex Cipher"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.121628+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.121628+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Casey Nebula"^^xsd:string ; + kb:originalText "[[Casey Nebula]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Casey Nebula"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.121663+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.121706+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.121664+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.121706+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Me"^^xsd:string ; + kb:originalText "[[Me]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Me"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.121672+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.121689+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.121673+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.121689+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Glynn Cosmos"^^xsd:string ; + kb:originalText "[[Glynn Cosmos]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Glynn Cosmos"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.121698+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.121721+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.121698+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.121721+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Quantum Leap Corp. name"^^xsd:string ; + kb:originalText "[[ Quantum Leap Corp. name]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Quantum Leap Corp. 
name"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.121643+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.121644+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Blair Quantum"^^xsd:string ; + kb:originalText "[[Blair Quantum]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Blair Quantum"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.121654+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.121713+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.121654+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.121713+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Finley Comet"^^xsd:string ; + kb:originalText "[[Finley Comet]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Finley Comet"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.121681+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.121681+00:00"^^xsd:dateTime . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.118932+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.118933+00:00"^^xsd:dateTime . 
+ diff --git a/sample_data/DORA Community Chat_2024-11-07-Thursday-12:56:10.md b/specs/reference_corpus/DORA Community Chat_2024-11-07-Thursday-12:56:10.md similarity index 100% rename from sample_data/DORA Community Chat_2024-11-07-Thursday-12:56:10.md rename to specs/reference_corpus/DORA Community Chat_2024-11-07-Thursday-12:56:10.md diff --git a/specs/reference_corpus/DORA Community Chat_2024-11-07-Thursday-12:56:10.ttl b/specs/reference_corpus/DORA Community Chat_2024-11-07-Thursday-12:56:10.ttl new file mode 100644 index 0000000..ff59a61 --- /dev/null +++ b/specs/reference_corpus/DORA Community Chat_2024-11-07-Thursday-12:56:10.ttl @@ -0,0 +1,26 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Stellar Solutions Inc. Chat"^^xsd:string ; + kb:originalText "[[Stellar Solutions Inc. Chat]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Stellar Solutions Inc. Chat"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.117707+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.117708+00:00"^^xsd:dateTime . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.116881+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.116882+00:00"^^xsd:dateTime . + diff --git a/specs/reference_corpus/DORA Community Discussion-2024-11-07.md b/specs/reference_corpus/DORA Community Discussion-2024-11-07.md new file mode 100644 index 0000000..b0bf724 --- /dev/null +++ b/specs/reference_corpus/DORA Community Discussion-2024-11-07.md @@ -0,0 +1,45 @@ +--- +type: group-meeting +title: "Stellar Solutions Inc. 2024-11-07-Thursday" +tags: bydate/2024/11/07, Quantum Leap Corp. 
meetings/group +created: 2024-11-07T12:38:13-05:00 +--- +# DORA Community Discussion 2024-11-07-Thursday + +Topic: [[Stellar Solutions Inc.]] +Project: Project +Tags: Tags + +## Attendees + +- [[ Nebula Innovations Ltd. name]] + +## Notes + +https://bit.ly/DORA-community-discuss + +[[Alex Cipher]] + +[[Blair Quantum]] + +[[Casey Nebula]] +How to incrementally delivery transformation + +[[Blair Quantum]] +Academic +Dakota Starlight change model +Combining emergent and hierarchical +Targeted universalism +Collective impact model + +[[Emerson Galaxy]] +Cosmic Ventures LLC + + + +## Summary + +## Followups + +## Observations/Insights + diff --git a/specs/reference_corpus/DORA Community Discussion-2024-11-07.ttl b/specs/reference_corpus/DORA Community Discussion-2024-11-07.ttl new file mode 100644 index 0000000..0f50db7 --- /dev/null +++ b/specs/reference_corpus/DORA Community Discussion-2024-11-07.ttl @@ -0,0 +1,78 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Stellar Solutions Inc."^^xsd:string ; + kb:originalText "[[Stellar Solutions Inc.]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Stellar Solutions Inc."^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.131439+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.131439+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Alex Cipher"^^xsd:string ; + kb:originalText "[[Alex Cipher]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Alex Cipher"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.131466+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.131467+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Nebula Innovations Ltd. name"^^xsd:string ; + kb:originalText "[[ Nebula Innovations Ltd. name]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Nebula Innovations Ltd. 
name"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.131454+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.131455+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Blair Quantum"^^xsd:string ; + kb:originalText "[[Blair Quantum]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Blair Quantum"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.131476+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.131493+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.131477+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.131494+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Casey Nebula"^^xsd:string ; + kb:originalText "[[Casey Nebula]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Casey Nebula"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.131485+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.131486+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Emerson Galaxy"^^xsd:string ; + kb:originalText "[[Emerson Galaxy]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Emerson Galaxy"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.131502+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.131503+00:00"^^xsd:dateTime . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.126858+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.126859+00:00"^^xsd:dateTime . 
+ diff --git a/sample_data/daily-note-2024-11-07-Thursday.md b/specs/reference_corpus/daily-note-2024-11-07-Thursday.md similarity index 100% rename from sample_data/daily-note-2024-11-07-Thursday.md rename to specs/reference_corpus/daily-note-2024-11-07-Thursday.md diff --git a/specs/reference_corpus/daily-note-2024-11-07-Thursday.ttl b/specs/reference_corpus/daily-note-2024-11-07-Thursday.ttl new file mode 100644 index 0000000..9da0203 --- /dev/null +++ b/specs/reference_corpus/daily-note-2024-11-07-Thursday.ttl @@ -0,0 +1,154 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Journaling"^^xsd:string ; + kb:isCompleted true ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.088399+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.088489+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.088400+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.088490+00:00"^^xsd:dateTime ; + schema:description "Journaling"^^xsd:string . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Medicine"^^xsd:string ; + kb:isCompleted true ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.088440+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.088515+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.088440+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.088516+00:00"^^xsd:dateTime ; + schema:description "Medicine"^^xsd:string . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Quantum Leap Corp. plan"^^xsd:string ; + kb:isCompleted false ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.088462+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.088535+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.088462+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.088535+00:00"^^xsd:dateTime ; + schema:description "Quantum Leap Corp. plan"^^xsd:string . 
+ + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Review Schedule"^^xsd:string ; + kb:isCompleted true ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.088451+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.088524+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.088451+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.088524+00:00"^^xsd:dateTime ; + schema:description "Review Schedule"^^xsd:string . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Set 30 min timer"^^xsd:string ; + kb:isCompleted true ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.088418+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.088499+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.088418+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.088499+00:00"^^xsd:dateTime ; + schema:description "Set 30 min timer"^^xsd:string . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Walk"^^xsd:string ; + kb:isCompleted false ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.088430+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.088507+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.088430+00:00"^^xsd:dateTime, + "2025-09-10T22:58:49.088507+00:00"^^xsd:dateTime ; + schema:description "Walk"^^xsd:string . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Comet Technologies-11-07-Thursday-12:56:10"^^xsd:string ; + kb:originalText "[[Comet Technologies-11-07-Thursday-12:56:10]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Comet Technologies-11-07-Thursday-12:56:10"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.088280+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.088281+00:00"^^xsd:dateTime . 
+ + a kb:Entity, + kb:WikiLink ; + rdfs:label "Coffee Ops-2024-11-07"^^xsd:string ; + kb:originalText "[[Coffee Ops-2024-11-07]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Coffee Ops-2024-11-07"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.088292+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.088292+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Galaxy Dynamics Co. Discussion-2024-11-07"^^xsd:string ; + kb:originalText "[[Galaxy Dynamics Co. Discussion-2024-11-07]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Galaxy Dynamics Co. Discussion-2024-11-07"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.088266+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.088266+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Blair Quantum11-07"^^xsd:string ; + kb:originalText "[[Blair Quantum11-07]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Blair Quantum11-07"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.088311+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.088311+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Alex Cipher"^^xsd:string ; + kb:originalText "[[Alex Cipher]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Alex Cipher"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.088301+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.088302+00:00"^^xsd:dateTime . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Cosmic Ventures LLC CTO Coffee-2024-11-07"^^xsd:string ; + kb:originalText "[[Cosmic Ventures LLC CTO Coffee-2024-11-07]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Cosmic Ventures LLC CTO Coffee-2024-11-07"^^xsd:string ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.088243+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.088245+00:00"^^xsd:dateTime . 
+ + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:dateCreated "2025-09-10T22:58:49.085076+00:00"^^xsd:dateTime ; + schema:dateModified "2025-09-10T22:58:49.085080+00:00"^^xsd:dateTime . + diff --git a/specs/test_cases/code_01_empty_document/expected_output.ttl b/specs/test_cases/code_01_empty_document/expected_output.ttl new file mode 100644 index 0000000..74cd3ad --- /dev/null +++ b/specs/test_cases/code_01_empty_document/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/code_01_empty_document/input.md b/specs/test_cases/code_01_empty_document/input.md new file mode 100644 index 0000000..e69de29 diff --git a/specs/test_cases/code_02_no_language/expected_output.ttl b/specs/test_cases/code_02_no_language/expected_output.ttl new file mode 100644 index 0000000..cc96f0a --- /dev/null +++ b/specs/test_cases/code_02_no_language/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/code_02_no_language/input.md b/specs/test_cases/code_02_no_language/input.md new file mode 100644 index 0000000..33ee63d --- /dev/null +++ b/specs/test_cases/code_02_no_language/input.md @@ -0,0 +1,3 @@ +``` +print('Hello, world!') +``` \ No newline at end of file diff --git a/specs/test_cases/code_03_with_language/expected_output.ttl b/specs/test_cases/code_03_with_language/expected_output.ttl new file mode 100644 index 0000000..06d7930 --- /dev/null +++ b/specs/test_cases/code_03_with_language/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/code_03_with_language/input.md b/specs/test_cases/code_03_with_language/input.md new file mode 100644 index 0000000..128e9b4 --- /dev/null +++ b/specs/test_cases/code_03_with_language/input.md @@ -0,0 +1,3 @@ +```python +print('Hello, world!') +``` \ No newline at end of file diff --git a/specs/test_cases/code_04_multiple_blocks/expected_output.ttl b/specs/test_cases/code_04_multiple_blocks/expected_output.ttl new file mode 100644 index 0000000..429adf2 --- /dev/null +++ b/specs/test_cases/code_04_multiple_blocks/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/code_04_multiple_blocks/input.md b/specs/test_cases/code_04_multiple_blocks/input.md new file mode 100644 index 0000000..5addf72 --- /dev/null +++ b/specs/test_cases/code_04_multiple_blocks/input.md @@ -0,0 +1,16 @@ +# Code Examples + +Python example: + +```python +def hello(): + print('Hello, world!') +``` + +JavaScript example: + +```javascript +function hello() { + console.log('Hello, world!'); +} +``` \ No newline at end of file diff --git a/specs/test_cases/code_05_simple_blockquote/expected_output.ttl b/specs/test_cases/code_05_simple_blockquote/expected_output.ttl new file mode 100644 index 0000000..88c9c74 --- /dev/null +++ b/specs/test_cases/code_05_simple_blockquote/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/code_05_simple_blockquote/input.md b/specs/test_cases/code_05_simple_blockquote/input.md new file mode 100644 index 0000000..515d5e8 --- /dev/null +++ b/specs/test_cases/code_05_simple_blockquote/input.md @@ -0,0 +1 @@ +> This is a blockquote. \ No newline at end of file diff --git a/specs/test_cases/code_06_multiline_blockquote/expected_output.ttl b/specs/test_cases/code_06_multiline_blockquote/expected_output.ttl new file mode 100644 index 0000000..627590c --- /dev/null +++ b/specs/test_cases/code_06_multiline_blockquote/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/code_06_multiline_blockquote/input.md b/specs/test_cases/code_06_multiline_blockquote/input.md new file mode 100644 index 0000000..3477436 --- /dev/null +++ b/specs/test_cases/code_06_multiline_blockquote/input.md @@ -0,0 +1,3 @@ +> This is a blockquote +> with multiple lines +> spanning three lines. \ No newline at end of file diff --git a/specs/test_cases/code_07_nested_blockquotes/expected_output.ttl b/specs/test_cases/code_07_nested_blockquotes/expected_output.ttl new file mode 100644 index 0000000..34d6973 --- /dev/null +++ b/specs/test_cases/code_07_nested_blockquotes/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/code_07_nested_blockquotes/input.md b/specs/test_cases/code_07_nested_blockquotes/input.md new file mode 100644 index 0000000..27aedd2 --- /dev/null +++ b/specs/test_cases/code_07_nested_blockquotes/input.md @@ -0,0 +1,5 @@ +> Level 1 blockquote +>> Level 2 blockquote +>>> Level 3 blockquote +>> Back to level 2 +> Back to level 1 \ No newline at end of file diff --git a/specs/test_cases/code_08_mixed_content/expected_output.ttl b/specs/test_cases/code_08_mixed_content/expected_output.ttl new file mode 100644 index 0000000..f9642b2 --- /dev/null +++ b/specs/test_cases/code_08_mixed_content/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/code_08_mixed_content/input.md b/specs/test_cases/code_08_mixed_content/input.md new file mode 100644 index 0000000..cef2f89 --- /dev/null +++ b/specs/test_cases/code_08_mixed_content/input.md @@ -0,0 +1,15 @@ +# Mixed Content Example + +> This is a blockquote + +```python +def hello(): + print('Hello, world!') +``` + +> Another blockquote +>> With nesting + +```javascript +console.log('Hello!'); +``` \ No newline at end of file diff --git a/specs/test_cases/frontmatter_01_yaml/expected_output.ttl b/specs/test_cases/frontmatter_01_yaml/expected_output.ttl new file mode 100644 index 0000000..c17f746 --- /dev/null +++ b/specs/test_cases/frontmatter_01_yaml/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/frontmatter_01_yaml/input.md b/specs/test_cases/frontmatter_01_yaml/input.md new file mode 100644 index 0000000..1bc8d4d --- /dev/null +++ b/specs/test_cases/frontmatter_01_yaml/input.md @@ -0,0 +1,7 @@ +--- +title: Test Document +date: 2023-01-01 +tags: [tag1, tag2] +--- + +# Content here \ No newline at end of file diff --git a/specs/test_cases/frontmatter_02_toml/expected_output.ttl b/specs/test_cases/frontmatter_02_toml/expected_output.ttl new file mode 100644 index 0000000..029d771 --- /dev/null +++ b/specs/test_cases/frontmatter_02_toml/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/frontmatter_02_toml/input.md b/specs/test_cases/frontmatter_02_toml/input.md new file mode 100644 index 0000000..dc594fb --- /dev/null +++ b/specs/test_cases/frontmatter_02_toml/input.md @@ -0,0 +1,7 @@ ++++ +title = "Test Document" +date = 2023-01-01 +tags = ["tag1", "tag2"] ++++ + +# Content here \ No newline at end of file diff --git a/specs/test_cases/frontmatter_03_no_frontmatter/expected_output.ttl b/specs/test_cases/frontmatter_03_no_frontmatter/expected_output.ttl new file mode 100644 index 0000000..6b0d635 --- /dev/null +++ b/specs/test_cases/frontmatter_03_no_frontmatter/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/frontmatter_03_no_frontmatter/input.md b/specs/test_cases/frontmatter_03_no_frontmatter/input.md new file mode 100644 index 0000000..4a16af2 --- /dev/null +++ b/specs/test_cases/frontmatter_03_no_frontmatter/input.md @@ -0,0 +1,2 @@ +# Content here +No frontmatter in this document. \ No newline at end of file diff --git a/specs/test_cases/frontmatter_04_parse_yaml/expected_output.ttl b/specs/test_cases/frontmatter_04_parse_yaml/expected_output.ttl new file mode 100644 index 0000000..7eb7cf3 --- /dev/null +++ b/specs/test_cases/frontmatter_04_parse_yaml/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/frontmatter_04_parse_yaml/input.md b/specs/test_cases/frontmatter_04_parse_yaml/input.md new file mode 100644 index 0000000..bb1479d --- /dev/null +++ b/specs/test_cases/frontmatter_04_parse_yaml/input.md @@ -0,0 +1,8 @@ +--- +title: Test Document +date: 2023-01-01 +tags: [tag1, tag2] +custom: value +--- + +# Test content for YAML parsing \ No newline at end of file diff --git a/specs/test_cases/frontmatter_05_parse_toml/expected_output.ttl b/specs/test_cases/frontmatter_05_parse_toml/expected_output.ttl new file mode 100644 index 0000000..41f502d --- /dev/null +++ b/specs/test_cases/frontmatter_05_parse_toml/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/frontmatter_05_parse_toml/input.md b/specs/test_cases/frontmatter_05_parse_toml/input.md new file mode 100644 index 0000000..4a7ec82 --- /dev/null +++ b/specs/test_cases/frontmatter_05_parse_toml/input.md @@ -0,0 +1,8 @@ ++++ +title = "Test Document" +date = 2023-01-01 +tags = ["tag1", "tag2"] +custom = "value" ++++ + +# Test content for TOML parsing \ No newline at end of file diff --git a/specs/test_cases/frontmatter_06_create_model/expected_output.ttl b/specs/test_cases/frontmatter_06_create_model/expected_output.ttl new file mode 100644 index 0000000..acd80bc --- /dev/null +++ b/specs/test_cases/frontmatter_06_create_model/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . 
+ + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/frontmatter_06_create_model/input.md b/specs/test_cases/frontmatter_06_create_model/input.md new file mode 100644 index 0000000..2f7e737 --- /dev/null +++ b/specs/test_cases/frontmatter_06_create_model/input.md @@ -0,0 +1,8 @@ +--- +title: Test Document +date: 2023-01-01 +tags: [tag1, tag2] +custom: value +--- + +# Test content for model creation \ No newline at end of file diff --git a/specs/test_cases/frontmatter_07_extract_tags_list/expected_output.ttl b/specs/test_cases/frontmatter_07_extract_tags_list/expected_output.ttl new file mode 100644 index 0000000..22e5c16 --- /dev/null +++ b/specs/test_cases/frontmatter_07_extract_tags_list/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/frontmatter_07_extract_tags_list/input.md b/specs/test_cases/frontmatter_07_extract_tags_list/input.md new file mode 100644 index 0000000..e9aadfc --- /dev/null +++ b/specs/test_cases/frontmatter_07_extract_tags_list/input.md @@ -0,0 +1,6 @@ +--- +tags: [tag1, tag2, tag3] +categories: [cat1, cat2] +--- + +# Test content for list-format tags extraction \ No newline at end of file diff --git a/specs/test_cases/frontmatter_08_extract_tags_string/expected_output.ttl b/specs/test_cases/frontmatter_08_extract_tags_string/expected_output.ttl new file mode 100644 index 0000000..c513468 --- /dev/null +++ b/specs/test_cases/frontmatter_08_extract_tags_string/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/frontmatter_08_extract_tags_string/input.md b/specs/test_cases/frontmatter_08_extract_tags_string/input.md new file mode 100644 index 0000000..4ad826f --- /dev/null +++ b/specs/test_cases/frontmatter_08_extract_tags_string/input.md @@ -0,0 +1,5 @@ +--- +tags: "tag1, tag2, tag3" +--- + +# Test content for string-format tags extraction \ No newline at end of file diff --git a/specs/test_cases/heading_01_empty/expected_output.ttl b/specs/test_cases/heading_01_empty/expected_output.ttl new file mode 100644 index 0000000..6dab2de --- /dev/null +++ b/specs/test_cases/heading_01_empty/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . 
+ + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/heading_01_empty/input.md b/specs/test_cases/heading_01_empty/input.md new file mode 100644 index 0000000..e69de29 diff --git a/specs/test_cases/heading_02_single/expected_output.ttl b/specs/test_cases/heading_02_single/expected_output.ttl new file mode 100644 index 0000000..258207d --- /dev/null +++ b/specs/test_cases/heading_02_single/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/heading_02_single/input.md b/specs/test_cases/heading_02_single/input.md new file mode 100644 index 0000000..03f40de --- /dev/null +++ b/specs/test_cases/heading_02_single/input.md @@ -0,0 +1,3 @@ +# Heading 1 + +Some content. \ No newline at end of file diff --git a/specs/test_cases/heading_03_multiple/expected_output.ttl b/specs/test_cases/heading_03_multiple/expected_output.ttl new file mode 100644 index 0000000..acaeaaa --- /dev/null +++ b/specs/test_cases/heading_03_multiple/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/heading_03_multiple/input.md b/specs/test_cases/heading_03_multiple/input.md new file mode 100644 index 0000000..5e3567a --- /dev/null +++ b/specs/test_cases/heading_03_multiple/input.md @@ -0,0 +1,15 @@ +# Heading 1 + +Some content for heading 1. + +## Heading 2 + +Content for heading 2. + +### Heading 3 + +Content for heading 3. + +## Another Heading 2 + +Content for another heading 2. \ No newline at end of file diff --git a/specs/test_cases/heading_04_complex/expected_output.ttl b/specs/test_cases/heading_04_complex/expected_output.ttl new file mode 100644 index 0000000..044f88e --- /dev/null +++ b/specs/test_cases/heading_04_complex/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/heading_04_complex/input.md b/specs/test_cases/heading_04_complex/input.md new file mode 100644 index 0000000..d5d04d1 --- /dev/null +++ b/specs/test_cases/heading_04_complex/input.md @@ -0,0 +1,20 @@ +# H1 +Content 1 + +## H2-A +Content 2A + +### H3-A +Content 3A + +#### H4 +Content 4 + +### H3-B +Content 3B + +## H2-B +Content 2B + +# Another H1 +Content for another H1 \ No newline at end of file diff --git a/specs/test_cases/heading_05_non_sequential/expected_output.ttl b/specs/test_cases/heading_05_non_sequential/expected_output.ttl new file mode 100644 index 0000000..3e4d408 --- /dev/null +++ b/specs/test_cases/heading_05_non_sequential/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . 
+ + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/heading_05_non_sequential/input.md b/specs/test_cases/heading_05_non_sequential/input.md new file mode 100644 index 0000000..58e0ec6 --- /dev/null +++ b/specs/test_cases/heading_05_non_sequential/input.md @@ -0,0 +1,8 @@ +# H1 +Content 1 + +### H3 (skipping H2) +Content 3 + +##### H5 (skipping H4) +Content 5 \ No newline at end of file diff --git a/specs/test_cases/heading_06_integration/expected_output.ttl b/specs/test_cases/heading_06_integration/expected_output.ttl new file mode 100644 index 0000000..c1722f9 --- /dev/null +++ b/specs/test_cases/heading_06_integration/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/heading_06_integration/input.md b/specs/test_cases/heading_06_integration/input.md new file mode 100644 index 0000000..dde6f6a --- /dev/null +++ b/specs/test_cases/heading_06_integration/input.md @@ -0,0 +1,13 @@ +# Test Document + +## Section 1 + +Content for section 1. + +## Section 2 + +Content for section 2. + +### Subsection 2.1 + +Content for subsection 2.1. \ No newline at end of file diff --git a/specs/test_cases/heading_07_hierarchy/expected_output.ttl b/specs/test_cases/heading_07_hierarchy/expected_output.ttl new file mode 100644 index 0000000..d8ab6fc --- /dev/null +++ b/specs/test_cases/heading_07_hierarchy/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . 
+ + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/heading_07_hierarchy/input.md b/specs/test_cases/heading_07_hierarchy/input.md new file mode 100644 index 0000000..545ab5b --- /dev/null +++ b/specs/test_cases/heading_07_hierarchy/input.md @@ -0,0 +1,15 @@ +# Main Section + +Main content here. + +## Subsection A + +Content A. + +### Detail A1 + +Detail content A1. + +## Subsection B + +Content B. \ No newline at end of file diff --git a/specs/test_cases/markdown_01_empty_document/expected_output.ttl b/specs/test_cases/markdown_01_empty_document/expected_output.ttl new file mode 100644 index 0000000..e40e6c3 --- /dev/null +++ b/specs/test_cases/markdown_01_empty_document/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/markdown_01_empty_document/input.md b/specs/test_cases/markdown_01_empty_document/input.md new file mode 100644 index 0000000..e69de29 diff --git a/specs/test_cases/markdown_02_headings/expected_output.ttl b/specs/test_cases/markdown_02_headings/expected_output.ttl new file mode 100644 index 0000000..42f777b --- /dev/null +++ b/specs/test_cases/markdown_02_headings/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . 
+ + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/markdown_02_headings/input.md b/specs/test_cases/markdown_02_headings/input.md new file mode 100644 index 0000000..b11f3a8 --- /dev/null +++ b/specs/test_cases/markdown_02_headings/input.md @@ -0,0 +1,5 @@ +# Heading 1 + +## Heading 2 + +### Heading 3 \ No newline at end of file diff --git a/specs/test_cases/markdown_03_lists_and_todos/expected_output.ttl b/specs/test_cases/markdown_03_lists_and_todos/expected_output.ttl new file mode 100644 index 0000000..f622e61 --- /dev/null +++ b/specs/test_cases/markdown_03_lists_and_todos/expected_output.ttl @@ -0,0 +1,32 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Completed todo item"^^xsd:string ; + kb:isCompleted true ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Completed todo item"^^xsd:string . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Todo item 1"^^xsd:string ; + kb:isCompleted false ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Todo item 1"^^xsd:string . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/markdown_03_lists_and_todos/input.md b/specs/test_cases/markdown_03_lists_and_todos/input.md new file mode 100644 index 0000000..bc1c3e0 --- /dev/null +++ b/specs/test_cases/markdown_03_lists_and_todos/input.md @@ -0,0 +1,3 @@ +- Item 1 +- [ ] Todo item 1 +- [x] Completed todo item \ No newline at end of file diff --git a/specs/test_cases/markdown_04_code_blocks/expected_output.ttl b/specs/test_cases/markdown_04_code_blocks/expected_output.ttl new file mode 100644 index 0000000..1da1070 --- /dev/null +++ b/specs/test_cases/markdown_04_code_blocks/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/markdown_04_code_blocks/input.md b/specs/test_cases/markdown_04_code_blocks/input.md new file mode 100644 index 0000000..d7664e6 --- /dev/null +++ b/specs/test_cases/markdown_04_code_blocks/input.md @@ -0,0 +1,4 @@ +```python +def hello_world(): + print("Hello, world!") +``` \ No newline at end of file diff --git a/specs/test_cases/markdown_05_tables/expected_output.ttl b/specs/test_cases/markdown_05_tables/expected_output.ttl new file mode 100644 index 0000000..ec6d797 --- /dev/null +++ b/specs/test_cases/markdown_05_tables/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/markdown_05_tables/input.md b/specs/test_cases/markdown_05_tables/input.md new file mode 100644 index 0000000..0bd5922 --- /dev/null +++ b/specs/test_cases/markdown_05_tables/input.md @@ -0,0 +1,3 @@ +Header 1 | Header 2 +-------- | -------- +Cell 1 | Cell 2 \ No newline at end of file diff --git a/specs/test_cases/markdown_06_blockquotes/expected_output.ttl b/specs/test_cases/markdown_06_blockquotes/expected_output.ttl new file mode 100644 index 0000000..e01249c --- /dev/null +++ b/specs/test_cases/markdown_06_blockquotes/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/markdown_06_blockquotes/input.md b/specs/test_cases/markdown_06_blockquotes/input.md new file mode 100644 index 0000000..7b7a837 --- /dev/null +++ b/specs/test_cases/markdown_06_blockquotes/input.md @@ -0,0 +1,2 @@ +> This is a blockquote +> With multiple lines \ No newline at end of file diff --git a/specs/test_cases/tag_01_hashtags/expected_output.ttl b/specs/test_cases/tag_01_hashtags/expected_output.ttl new file mode 100644 index 0000000..6e1f113 --- /dev/null +++ b/specs/test_cases/tag_01_hashtags/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/tag_01_hashtags/input.md b/specs/test_cases/tag_01_hashtags/input.md new file mode 100644 index 0000000..3c37893 --- /dev/null +++ b/specs/test_cases/tag_01_hashtags/input.md @@ -0,0 +1,3 @@ +# Document Title + +This is a document with #tag1 and #tag2 hashtags. \ No newline at end of file diff --git a/specs/test_cases/tag_02_inline_tags/expected_output.ttl b/specs/test_cases/tag_02_inline_tags/expected_output.ttl new file mode 100644 index 0000000..8730e38 --- /dev/null +++ b/specs/test_cases/tag_02_inline_tags/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/tag_02_inline_tags/input.md b/specs/test_cases/tag_02_inline_tags/input.md new file mode 100644 index 0000000..562479d --- /dev/null +++ b/specs/test_cases/tag_02_inline_tags/input.md @@ -0,0 +1,3 @@ +# Document Title + +This is a document with [tag1] and [tag2] inline tags. \ No newline at end of file diff --git a/specs/test_cases/tag_03_category_tags/expected_output.ttl b/specs/test_cases/tag_03_category_tags/expected_output.ttl new file mode 100644 index 0000000..f6b250b --- /dev/null +++ b/specs/test_cases/tag_03_category_tags/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/tag_03_category_tags/input.md b/specs/test_cases/tag_03_category_tags/input.md new file mode 100644 index 0000000..26e1bff --- /dev/null +++ b/specs/test_cases/tag_03_category_tags/input.md @@ -0,0 +1,3 @@ +# Document Title + +This is a document with @category1/tag1 and @category2/tag2 category tags. \ No newline at end of file diff --git a/specs/test_cases/tag_04_frontmatter_tags/expected_output.ttl b/specs/test_cases/tag_04_frontmatter_tags/expected_output.ttl new file mode 100644 index 0000000..634996a --- /dev/null +++ b/specs/test_cases/tag_04_frontmatter_tags/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/tag_04_frontmatter_tags/input.md b/specs/test_cases/tag_04_frontmatter_tags/input.md new file mode 100644 index 0000000..a9d5187 --- /dev/null +++ b/specs/test_cases/tag_04_frontmatter_tags/input.md @@ -0,0 +1,3 @@ +--- +tags: [tag1, tag2] +--- \ No newline at end of file diff --git a/specs/test_cases/tag_05_mixed_tags/expected_output.ttl b/specs/test_cases/tag_05_mixed_tags/expected_output.ttl new file mode 100644 index 0000000..7379494 --- /dev/null +++ b/specs/test_cases/tag_05_mixed_tags/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/tag_05_mixed_tags/input.md b/specs/test_cases/tag_05_mixed_tags/input.md new file mode 100644 index 0000000..0d5a15d --- /dev/null +++ b/specs/test_cases/tag_05_mixed_tags/input.md @@ -0,0 +1,9 @@ +--- +title: Test Document +tags: [fm1, fm2] +--- + +# Document Title + +This is a document with #hashtag1 and [inline1] tags. +It also has @category/categorized tags. \ No newline at end of file diff --git a/specs/test_cases/tag_06_get_all_tags/expected_output.ttl b/specs/test_cases/tag_06_get_all_tags/expected_output.ttl new file mode 100644 index 0000000..15866f3 --- /dev/null +++ b/specs/test_cases/tag_06_get_all_tags/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/tag_06_get_all_tags/input.md b/specs/test_cases/tag_06_get_all_tags/input.md new file mode 100644 index 0000000..1586376 --- /dev/null +++ b/specs/test_cases/tag_06_get_all_tags/input.md @@ -0,0 +1,9 @@ +--- +title: Test Document +tags: [tag1, tag2] +--- + +# Document Title + +This is a document with #tag1 and [tag3] tags. +It also has @category/tag4 tags. \ No newline at end of file diff --git a/specs/test_cases/tag_07_bracketed_text_not_tag/expected_output.ttl b/specs/test_cases/tag_07_bracketed_text_not_tag/expected_output.ttl new file mode 100644 index 0000000..577bd98 --- /dev/null +++ b/specs/test_cases/tag_07_bracketed_text_not_tag/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . 
+ + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/tag_07_bracketed_text_not_tag/input.md b/specs/test_cases/tag_07_bracketed_text_not_tag/input.md new file mode 100644 index 0000000..27914cc --- /dev/null +++ b/specs/test_cases/tag_07_bracketed_text_not_tag/input.md @@ -0,0 +1 @@ + [Conversion] \ No newline at end of file diff --git a/specs/test_cases/tag_08_hashtag_preceded_by_non_whitespace/expected_output.ttl b/specs/test_cases/tag_08_hashtag_preceded_by_non_whitespace/expected_output.ttl new file mode 100644 index 0000000..afc31d4 --- /dev/null +++ b/specs/test_cases/tag_08_hashtag_preceded_by_non_whitespace/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/tag_08_hashtag_preceded_by_non_whitespace/input.md b/specs/test_cases/tag_08_hashtag_preceded_by_non_whitespace/input.md new file mode 100644 index 0000000..a897abc --- /dev/null +++ b/specs/test_cases/tag_08_hashtag_preceded_by_non_whitespace/input.md @@ -0,0 +1,4 @@ +#tag1 word #tag2 word#notatag +another#notatag2 + #tag3 +word #tag4 \ No newline at end of file diff --git a/specs/test_cases/tag_09_fixture_tag_cases/expected_output.ttl b/specs/test_cases/tag_09_fixture_tag_cases/expected_output.ttl new file mode 100644 index 0000000..3c82e45 --- /dev/null +++ b/specs/test_cases/tag_09_fixture_tag_cases/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . 
+ + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/tag_09_fixture_tag_cases/input.md b/specs/test_cases/tag_09_fixture_tag_cases/input.md new file mode 100644 index 0000000..590f3a1 --- /dev/null +++ b/specs/test_cases/tag_09_fixture_tag_cases/input.md @@ -0,0 +1,20 @@ +#hashtag +#HasHTAg +#hash #tag +multiple #hash #tags in one #line +preceeding #space +missing preceeding#space +#test#tag +#t-a_g +#äöüß +#<3 +#<3 and other #hashtags +#0 +there is no # hashtag +still no # +##notag +hashtag #hashtag +not a#hashtag #hashtag +#tag1 +#tag2 +#tag3 \ No newline at end of file diff --git a/specs/test_cases/tag_10_fixture_ignores_code_and_links/expected_output.ttl b/specs/test_cases/tag_10_fixture_ignores_code_and_links/expected_output.ttl new file mode 100644 index 0000000..fa69bed --- /dev/null +++ b/specs/test_cases/tag_10_fixture_ignores_code_and_links/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/tag_10_fixture_ignores_code_and_links/input.md b/specs/test_cases/tag_10_fixture_ignores_code_and_links/input.md new file mode 100644 index 0000000..8f391e4 --- /dev/null +++ b/specs/test_cases/tag_10_fixture_ignores_code_and_links/input.md @@ -0,0 +1,13 @@ +`don't render #hashtags in inline code` +``` +don't render #hashtags in code blocks +``` +[#hashtag](http://awe.so.me) +[there is a #hashtag](http://awe.so.me) +[link](http://awe.so.me "#title") +[link](http://awe.so.me "there is a #title") +![a #hashtag](http://awe.so.me/image.gif) +![image](http://awe.so.me/image.gif "a #title") +# hashtag +

hashtag

+
#
\ No newline at end of file diff --git a/specs/test_cases/todo_01_empty_document/expected_output.ttl b/specs/test_cases/todo_01_empty_document/expected_output.ttl new file mode 100644 index 0000000..f03de80 --- /dev/null +++ b/specs/test_cases/todo_01_empty_document/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/todo_01_empty_document/input.md b/specs/test_cases/todo_01_empty_document/input.md new file mode 100644 index 0000000..e69de29 diff --git a/specs/test_cases/todo_02_no_todos/expected_output.ttl b/specs/test_cases/todo_02_no_todos/expected_output.ttl new file mode 100644 index 0000000..e124640 --- /dev/null +++ b/specs/test_cases/todo_02_no_todos/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/todo_02_no_todos/input.md b/specs/test_cases/todo_02_no_todos/input.md new file mode 100644 index 0000000..66c8456 --- /dev/null +++ b/specs/test_cases/todo_02_no_todos/input.md @@ -0,0 +1,6 @@ +# Test Document + +This is a test document with no todo items. 
+ +- Regular list item +- Another regular item \ No newline at end of file diff --git a/specs/test_cases/todo_03_unchecked_todos/expected_output.ttl b/specs/test_cases/todo_03_unchecked_todos/expected_output.ttl new file mode 100644 index 0000000..5d952de --- /dev/null +++ b/specs/test_cases/todo_03_unchecked_todos/expected_output.ttl @@ -0,0 +1,32 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Todo item 1"^^xsd:string ; + kb:isCompleted false ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Todo item 1"^^xsd:string . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Todo item 2"^^xsd:string ; + kb:isCompleted false ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Todo item 2"^^xsd:string . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/todo_03_unchecked_todos/input.md b/specs/test_cases/todo_03_unchecked_todos/input.md new file mode 100644 index 0000000..09e699b --- /dev/null +++ b/specs/test_cases/todo_03_unchecked_todos/input.md @@ -0,0 +1,4 @@ +# Test Document + +- [ ] Todo item 1 +- [ ] Todo item 2 \ No newline at end of file diff --git a/specs/test_cases/todo_04_checked_todos/expected_output.ttl b/specs/test_cases/todo_04_checked_todos/expected_output.ttl new file mode 100644 index 0000000..7883eaa --- /dev/null +++ b/specs/test_cases/todo_04_checked_todos/expected_output.ttl @@ -0,0 +1,32 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Completed todo item 1"^^xsd:string ; + kb:isCompleted true ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Completed todo item 1"^^xsd:string . 
+ + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Completed todo item 2"^^xsd:string ; + kb:isCompleted true ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Completed todo item 2"^^xsd:string . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/todo_04_checked_todos/input.md b/specs/test_cases/todo_04_checked_todos/input.md new file mode 100644 index 0000000..727f1e3 --- /dev/null +++ b/specs/test_cases/todo_04_checked_todos/input.md @@ -0,0 +1,4 @@ +# Test Document + +- [x] Completed todo item 1 +- [X] Completed todo item 2 \ No newline at end of file diff --git a/specs/test_cases/todo_05_mixed_todos/expected_output.ttl b/specs/test_cases/todo_05_mixed_todos/expected_output.ttl new file mode 100644 index 0000000..080401e --- /dev/null +++ b/specs/test_cases/todo_05_mixed_todos/expected_output.ttl @@ -0,0 +1,41 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Completed todo item"^^xsd:string ; + kb:isCompleted true ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Completed todo item"^^xsd:string . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Todo item 1"^^xsd:string ; + kb:isCompleted false ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Todo item 1"^^xsd:string . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Todo item 2"^^xsd:string ; + kb:isCompleted false ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Todo item 2"^^xsd:string . 
+ + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/todo_05_mixed_todos/input.md b/specs/test_cases/todo_05_mixed_todos/input.md new file mode 100644 index 0000000..36178cb --- /dev/null +++ b/specs/test_cases/todo_05_mixed_todos/input.md @@ -0,0 +1,5 @@ +# Test Document + +- [ ] Todo item 1 +- [x] Completed todo item +- [ ] Todo item 2 \ No newline at end of file diff --git a/specs/test_cases/todo_06_todo_text/expected_output.ttl b/specs/test_cases/todo_06_todo_text/expected_output.ttl new file mode 100644 index 0000000..b904705 --- /dev/null +++ b/specs/test_cases/todo_06_todo_text/expected_output.ttl @@ -0,0 +1,32 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Buy milk"^^xsd:string ; + kb:isCompleted false ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Buy milk"^^xsd:string . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Write code"^^xsd:string ; + kb:isCompleted true ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Write code"^^xsd:string . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/todo_06_todo_text/input.md b/specs/test_cases/todo_06_todo_text/input.md new file mode 100644 index 0000000..294fc20 --- /dev/null +++ b/specs/test_cases/todo_06_todo_text/input.md @@ -0,0 +1,4 @@ +# Test Document + +- [ ] Buy milk +- [x] Write code \ No newline at end of file diff --git a/specs/test_cases/todo_07_todo_with_context/expected_output.ttl b/specs/test_cases/todo_07_todo_with_context/expected_output.ttl new file mode 100644 index 0000000..2fdd625 --- /dev/null +++ b/specs/test_cases/todo_07_todo_with_context/expected_output.ttl @@ -0,0 +1,32 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Buy eggs"^^xsd:string ; + kb:isCompleted true ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Buy eggs"^^xsd:string . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Buy milk"^^xsd:string ; + kb:isCompleted false ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Buy milk"^^xsd:string . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/todo_07_todo_with_context/input.md b/specs/test_cases/todo_07_todo_with_context/input.md new file mode 100644 index 0000000..5593dea --- /dev/null +++ b/specs/test_cases/todo_07_todo_with_context/input.md @@ -0,0 +1,4 @@ +# Shopping List + +- [ ] Buy milk +- [x] Buy eggs \ No newline at end of file diff --git a/specs/test_cases/todo_08_todos_with_leading_whitespace/expected_output.ttl b/specs/test_cases/todo_08_todos_with_leading_whitespace/expected_output.ttl new file mode 100644 index 0000000..32d1602 --- /dev/null +++ b/specs/test_cases/todo_08_todos_with_leading_whitespace/expected_output.ttl @@ -0,0 +1,59 @@ +@prefix kb: . +@prefix rdfs: . 
+@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Four space indent"^^xsd:string ; + kb:isCompleted false ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Four space indent"^^xsd:string . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "No indent"^^xsd:string ; + kb:isCompleted false ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "No indent"^^xsd:string . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Single space indent"^^xsd:string ; + kb:isCompleted false ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Single space indent"^^xsd:string . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Tab indent"^^xsd:string ; + kb:isCompleted true ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Tab indent"^^xsd:string . + + a kb:Entity, + kb:TodoItem, + schema:Action ; + rdfs:label "Two space indent"^^xsd:string ; + kb:isCompleted true ; + kb:sourceDocument ; + rdfs:seeAlso ; + schema:description "Two space indent"^^xsd:string . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/todo_08_todos_with_leading_whitespace/input.md b/specs/test_cases/todo_08_todos_with_leading_whitespace/input.md new file mode 100644 index 0000000..edf7f86 --- /dev/null +++ b/specs/test_cases/todo_08_todos_with_leading_whitespace/input.md @@ -0,0 +1,7 @@ +# Test Document + + - [ ] Single space indent + - [x] Two space indent + - [ ] Four space indent + - [x] Tab indent +- [ ] No indent \ No newline at end of file diff --git a/specs/test_cases/wikilink_01_basic/expected_output.ttl b/specs/test_cases/wikilink_01_basic/expected_output.ttl new file mode 100644 index 0000000..3e93285 --- /dev/null +++ b/specs/test_cases/wikilink_01_basic/expected_output.ttl @@ -0,0 +1,22 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Page One"^^xsd:string ; + kb:originalText "[[Page One]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Page One"^^xsd:string ; + rdfs:seeAlso . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/wikilink_01_basic/input.md b/specs/test_cases/wikilink_01_basic/input.md new file mode 100644 index 0000000..1333da9 --- /dev/null +++ b/specs/test_cases/wikilink_01_basic/input.md @@ -0,0 +1 @@ +This is a link to [[Page One]]. \ No newline at end of file diff --git a/specs/test_cases/wikilink_02_with_display_text/expected_output.ttl b/specs/test_cases/wikilink_02_with_display_text/expected_output.ttl new file mode 100644 index 0000000..b3516a8 --- /dev/null +++ b/specs/test_cases/wikilink_02_with_display_text/expected_output.ttl @@ -0,0 +1,23 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . 
+ + a kb:Entity, + kb:WikiLink ; + rdfs:label "Custom Text"^^xsd:string ; + kb:alias "Custom Text"^^xsd:string ; + kb:originalText "[[Page Two|Custom Text]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Page Two"^^xsd:string ; + rdfs:seeAlso . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/wikilink_02_with_display_text/input.md b/specs/test_cases/wikilink_02_with_display_text/input.md new file mode 100644 index 0000000..9d6b12f --- /dev/null +++ b/specs/test_cases/wikilink_02_with_display_text/input.md @@ -0,0 +1 @@ +See [[Page Two|Custom Text]]. \ No newline at end of file diff --git a/specs/test_cases/wikilink_03_multiple/expected_output.ttl b/specs/test_cases/wikilink_03_multiple/expected_output.ttl new file mode 100644 index 0000000..2f22512 --- /dev/null +++ b/specs/test_cases/wikilink_03_multiple/expected_output.ttl @@ -0,0 +1,31 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Bee"^^xsd:string ; + kb:alias "Bee"^^xsd:string ; + kb:originalText "[[B|Bee]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "B"^^xsd:string ; + rdfs:seeAlso . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "A"^^xsd:string ; + kb:originalText "[[A]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "A"^^xsd:string ; + rdfs:seeAlso . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/wikilink_03_multiple/input.md b/specs/test_cases/wikilink_03_multiple/input.md new file mode 100644 index 0000000..7bb6c22 --- /dev/null +++ b/specs/test_cases/wikilink_03_multiple/input.md @@ -0,0 +1 @@ +[[A]] and [[B|Bee]] are both links. \ No newline at end of file diff --git a/specs/test_cases/wikilink_04_at_line_edges/expected_output.ttl b/specs/test_cases/wikilink_04_at_line_edges/expected_output.ttl new file mode 100644 index 0000000..7c96335 --- /dev/null +++ b/specs/test_cases/wikilink_04_at_line_edges/expected_output.ttl @@ -0,0 +1,31 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Start"^^xsd:string ; + kb:originalText "[[Start]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Start"^^xsd:string ; + rdfs:seeAlso . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Finish"^^xsd:string ; + kb:alias "Finish"^^xsd:string ; + kb:originalText "[[End|Finish]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "End"^^xsd:string ; + rdfs:seeAlso . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/wikilink_04_at_line_edges/input.md b/specs/test_cases/wikilink_04_at_line_edges/input.md new file mode 100644 index 0000000..fcb1ca0 --- /dev/null +++ b/specs/test_cases/wikilink_04_at_line_edges/input.md @@ -0,0 +1 @@ +[[Start]] middle [[End|Finish]] \ No newline at end of file diff --git a/specs/test_cases/wikilink_05_no_wikilinks/expected_output.ttl b/specs/test_cases/wikilink_05_no_wikilinks/expected_output.ttl new file mode 100644 index 0000000..c846ac4 --- /dev/null +++ b/specs/test_cases/wikilink_05_no_wikilinks/expected_output.ttl @@ -0,0 +1,14 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . 
+ + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/wikilink_05_no_wikilinks/input.md b/specs/test_cases/wikilink_05_no_wikilinks/input.md new file mode 100644 index 0000000..e796032 --- /dev/null +++ b/specs/test_cases/wikilink_05_no_wikilinks/input.md @@ -0,0 +1 @@ +No links here. \ No newline at end of file diff --git a/specs/test_cases/wikilink_06_nested_or_broken/expected_output.ttl b/specs/test_cases/wikilink_06_nested_or_broken/expected_output.ttl new file mode 100644 index 0000000..63549b9 --- /dev/null +++ b/specs/test_cases/wikilink_06_nested_or_broken/expected_output.ttl @@ -0,0 +1,23 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Display"^^xsd:string ; + kb:alias "Display"^^xsd:string ; + kb:originalText "[[Nested|Display]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Nested"^^xsd:string ; + rdfs:seeAlso . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . 
+ diff --git a/specs/test_cases/wikilink_06_nested_or_broken/input.md b/specs/test_cases/wikilink_06_nested_or_broken/input.md new file mode 100644 index 0000000..d12c4cb --- /dev/null +++ b/specs/test_cases/wikilink_06_nested_or_broken/input.md @@ -0,0 +1 @@ +[[Not closed or [[Nested|Display]]]] \ No newline at end of file diff --git a/specs/test_cases/wikilink_07_original_text_preservation/expected_output.ttl b/specs/test_cases/wikilink_07_original_text_preservation/expected_output.ttl new file mode 100644 index 0000000..33a858a --- /dev/null +++ b/specs/test_cases/wikilink_07_original_text_preservation/expected_output.ttl @@ -0,0 +1,23 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . + + a kb:Entity, + kb:WikiLink ; + rdfs:label "Custom Display"^^xsd:string ; + kb:alias "Custom Display"^^xsd:string ; + kb:originalText "[[Some Page|Custom Display]]"^^xsd:string ; + kb:sourceDocument ; + kb:targetPath "Some Page"^^xsd:string ; + rdfs:seeAlso . + + a kb:Document, + kb:Entity, + schema:CreativeWork ; + rdfs:label "Temporary Document"^^xsd:string ; + kb:originalPath "temp_document.md"^^xsd:string ; + kb:pathWithoutExtension "temp_document"^^xsd:string ; + kb:sourceDocument ; + rdfs:seeAlso . + diff --git a/specs/test_cases/wikilink_07_original_text_preservation/input.md b/specs/test_cases/wikilink_07_original_text_preservation/input.md new file mode 100644 index 0000000..a63dc28 --- /dev/null +++ b/specs/test_cases/wikilink_07_original_text_preservation/input.md @@ -0,0 +1 @@ +Link: [[Some Page|Custom Display]] \ No newline at end of file diff --git a/specs/test_cases/wikilink_08_document_resolution/expected_output.ttl b/specs/test_cases/wikilink_08_document_resolution/expected_output.ttl new file mode 100644 index 0000000..cc2ee93 --- /dev/null +++ b/specs/test_cases/wikilink_08_document_resolution/expected_output.ttl @@ -0,0 +1,22 @@ +@prefix kb: . +@prefix rdfs: . +@prefix schema: . +@prefix xsd: . 
    def process_content_to_graph(self, content: str, document_id: Optional[str] = None) -> Graph:
        """Process a markdown content string directly into an RDF graph.

        Provides a file-free entry point for the spec-driven test runner: the
        content is wrapped in a synthetic document (fixed path
        ``temp_document.md``, fixed title ``Temporary Document``), run through
        the registered extractors/analyzers, and every resulting KB entity is
        serialized into a single ``rdflib.Graph``.

        The synthetic document is temporarily registered in
        ``self.document_registry`` so wikilink resolution can see it; the
        registry's prior contents are restored in ``finally``.

        Args:
            content: The markdown content string to process.
            document_id: Optional stable document ID. When omitted, one is
                derived from the fixed path via ``self.id_generator``.

        Returns:
            rdflib.Graph: Graph containing the document entity plus all
            entities (todos, wikilinks, NER results) extracted from *content*.
        """
        logger_proc_content = get_logger("knowledgebase_processor.processor.content_to_graph")

        # Derive a deterministic ID from the fixed synthetic path when the
        # caller did not supply one.
        if not document_id:
            document_id = self.id_generator.generate_document_id("temp_document.md")

        # In-memory document model the extractors operate on.
        document = Document(
            path="temp_document.md",
            title="Temporary Document",
            content=content
        )

        # KB-level entity for the synthetic document; also serves as the
        # source/URI anchor for every extracted child entity below.
        temp_kb_document = KbDocument(
            kb_id=document_id,
            label="Temporary Document",
            original_path="temp_document.md",
            path_without_extension="temp_document",
            source_document_uri=document_id,
        )

        # Snapshot the registry so it can be restored afterwards, then
        # register the synthetic document (needed for wikilink resolution).
        original_documents = self.document_registry.get_all_documents().copy()
        self.document_registry.register_document(temp_kb_document)

        try:
            # Fresh converter and output graph with the namespaces the
            # expected_output.ttl fixtures use.
            rdf_converter = RdfConverter()
            graph = Graph()
            graph.bind("kb", KB)
            graph.bind("schema", SCHEMA)
            graph.bind("rdfs", RDFS)
            graph.bind("xsd", XSD)

            # Accumulates every entity to serialize; the document itself is
            # always emitted, even for empty input.
            all_entities: List[KbBaseEntity] = [temp_kb_document]

            # Metadata container some extractors/analyzers write into.
            doc_metadata = DocumentMetadata(
                document_id=document_id,
                path="temp_document.md",
                title="Temporary Document"
            )

            # Run the configured extractors; elements are appended onto the
            # document so later passes (todo conversion below) can see them.
            for extractor in self.extractors:
                elements = extractor.extract(document)
                if elements:
                    document.elements.extend(elements)
                    # Only some extractors enrich metadata; probe for the hook.
                    if hasattr(extractor, "update_metadata"):
                        extractor.update_metadata(elements, doc_metadata)

            # Wikilink extraction is optional: a missing module degrades to a
            # debug log instead of failing the whole conversion.
            try:
                from ..extractor.wikilink_extractor import WikiLinkExtractor
                wikilink_extractor = WikiLinkExtractor(self.document_registry,
                                                      self.id_generator)
                wikilinks = wikilink_extractor.extract(document, document_id)
                all_entities.extend(wikilinks)
            except ImportError:
                logger_proc_content.debug("WikiLinkExtractor not available, skipping wikilink extraction")

            # Promote extracted TodoItem elements to KB entities.
            for element in document.elements:
                if isinstance(element, TodoItem):
                    # ID is derived from (document_id, text).
                    # NOTE(review): presumably two todos with identical text in
                    # one document would collide on the same ID — confirm
                    # generate_todo_id disambiguates duplicates.
                    todo_id = self.id_generator.generate_todo_id(document_id, element.text)

                    kb_todo = KbTodoItem(
                        kb_id=todo_id,
                        label=element.text,
                        description=element.text,
                        is_completed=element.is_checked,
                        source_document_uri=document_id,
                        # Span defaults to (0, 0) keys when position lacks
                        # start/end; omitted entirely when position is unset.
                        extracted_from_text_span=(
                            element.position.get("start", 0),
                            element.position.get("end", 0)
                        ) if element.position else None
                    )
                    all_entities.append(kb_todo)

            # Named-entity recognition: only EntityRecognizer analyzers run;
            # they populate doc_metadata.entities which we convert to KB form.
            for analyzer in self.analyzers:
                if isinstance(analyzer, EntityRecognizer):
                    analyzer.analyze(document.content, doc_metadata)
                    for extracted_entity in doc_metadata.entities:
                        kb_entity = self._extracted_entity_to_kb_entity(extracted_entity, "temp_document.md")
                        if kb_entity:
                            all_entities.append(kb_entity)

            # Serialize each entity to its own subgraph and merge them.
            for entity in all_entities:
                entity_graph = rdf_converter.kb_entity_to_graph(entity, base_uri_str=str(KB))
                graph += entity_graph

            logger_proc_content.info(f"Generated RDF graph with {len(graph)} triples from content")
            return graph

        finally:
            # Restore the registry to its pre-call state.
            # NOTE(review): this reaches into four private DocumentRegistry
            # attributes; fragile against registry refactors — a public
            # clear()/replace_all() API on DocumentRegistry would be safer.
            self.document_registry._documents_by_id.clear()
            self.document_registry._id_by_original_path.clear()
            self.document_registry._id_by_path_without_extension.clear()
            self.document_registry._id_by_basename_without_extension.clear()
            for original_doc in original_documents:
                self.document_registry.register_document(original_doc)
a/tests/analyzer/test_entities.py b/tests/analyzer/test_entities.py deleted file mode 100644 index 9cfa22f..0000000 --- a/tests/analyzer/test_entities.py +++ /dev/null @@ -1,69 +0,0 @@ -import unittest -from knowledgebase_processor.analyzer.entity_recognizer import EntityRecognizer # Updated import -from knowledgebase_processor.models.entities import ExtractedEntity # Updated import - -@unittest.skip("Spacy entity recognition disabled - tests skipped") -class TestEntityRecognizer(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.recognizer = EntityRecognizer() - - def test_extract_person_org_gpe(self): - text = "Barack Obama was the president of the United States and worked with Microsoft." - entities = self.recognizer.analyze_text_for_entities(text) # Changed method call - labels = {e.label for e in entities} # entities is now List[ExtractedEntity], never None - self.assertIn("PERSON", labels) - self.assertIn("ORG", labels) - self.assertIn("GPE", labels) - - def test_no_entities(self): - text = "This is a sentence without any named entities." - entities = self.recognizer.analyze_text_for_entities(text) # Changed method call - self.assertEqual(len(entities), 0) # analyze_text_for_entities returns [] for no entities - - def test_multiple_entities(self): - text = "Apple is looking at buying U.K. startup for $1 billion." - entities = self.recognizer.analyze_text_for_entities(text) # Changed method call - self.assertIsNotNone(entities) # Should be a list - self.assertGreaterEqual(len(entities), 2) - - def test_entity_fields(self): - text = "Google was founded in California." 
- entities = self.recognizer.analyze_text_for_entities(text) # Changed method call - self.assertIsNotNone(entities) # Should be a list - for ent in entities: # ent is now ExtractedEntity - self.assertIsInstance(ent.text, str) - self.assertIsInstance(ent.label, str) - self.assertIsInstance(ent.start_char, int) - self.assertIsInstance(ent.end_char, int) - self.assertGreaterEqual(ent.end_char, ent.start_char) - - # def test_model_not_available(self): - # # The current EntityRecognizer constructor does not take model_name - # # and loads "en_core_web_sm" by default. If it fails, it raises an OSError. - # # This test needs to be re-evaluated or adapted if we want to test model loading failure. - # # For now, we assume the default model is available for other tests. - # # If we want to test this specific scenario, we'd need to mock spacy.load. - # # Let's skip this test for now as it's not compatible with the current EntityRecognizer. - # # Alternatively, we can try to catch the OSError if spacy.load is called with a bad model name. - # # The current EntityRecognizer loads the model in __init__. - # # with self.assertRaises(OSError): # Or potentially another specific spaCy error - # # EntityRecognizer(model_name="nonexistent_model_123") # This will fail if model_name is not a param - - # # Re-evaluating: The EntityRecognizer in analyzer.entity_recognizer.py does not accept model_name - # # It loads "en_core_web_sm" in its __init__. - # # This test as written is incompatible. - # # To test a model loading failure, one would typically mock spacy.load. - # # For now, I will comment out this test as it's not directly applicable to the refactored class. - # pass # Commenting out the original test logic. - -# def test_model_not_available(self): -# # This test was for the old EntityRecognizer that accepted a model_name. -# # The current one in entity_recognizer.py loads "en_core_web_sm" by default. -# # To test failure, one would mock spacy.load() within that class. 
-# # For now, this specific test case is not directly applicable. -# pass - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/analyzer/test_entity_recognizer.py b/tests/analyzer/test_entity_recognizer.py index 86d1e22..9cfa22f 100644 --- a/tests/analyzer/test_entity_recognizer.py +++ b/tests/analyzer/test_entity_recognizer.py @@ -1,153 +1,69 @@ import unittest -from typing import List - -from knowledgebase_processor.analyzer.entity_recognizer import EntityRecognizer -from knowledgebase_processor.models.metadata import DocumentMetadata, ExtractedEntity, BaseModel # Ensure BaseModel if needed +from knowledgebase_processor.analyzer.entity_recognizer import EntityRecognizer # Updated import +from knowledgebase_processor.models.entities import ExtractedEntity # Updated import @unittest.skip("Spacy entity recognition disabled - tests skipped") class TestEntityRecognizer(unittest.TestCase): - def setUp(self): - """ - Set up the test case by initializing the EntityRecognizer. - """ - self.analyzer = EntityRecognizer() - - def _create_metadata(self) -> DocumentMetadata: - """Helper to create a fresh DocumentMetadata instance.""" - # Assuming DocumentMetadata might require a path or other minimal setup - # Adjust if DocumentMetadata has mandatory constructor arguments - return DocumentMetadata(document_id="test.md", file_path="test.md") - - - def test_extract_person_entity(self): - metadata = self._create_metadata() - content = "Apple is looking at buying U.K. startup for $1 billion. Steve Jobs was a visionary." - self.analyzer.analyze(content, metadata) - - self.assertTrue(any(ent.text == "Steve Jobs" and ent.label == "PERSON" for ent in metadata.entities)) - - def test_extract_org_entity(self): - metadata = self._create_metadata() - content = "Apple is a technology company based in Cupertino." 
- self.analyzer.analyze(content, metadata) - - self.assertTrue(any(ent.text == "Apple" and ent.label == "ORG" for ent in metadata.entities)) - - def test_extract_loc_gpe_entity(self): - metadata = self._create_metadata() - content = "London is the capital of the United Kingdom." - self.analyzer.analyze(content, metadata) - - found_london = any(ent.text == "London" and (ent.label == "GPE" or ent.label == "LOC") for ent in metadata.entities) - self.assertTrue(found_london, "London entity not found or mislabelled") - - # Check for United Kingdom if found, but don't fail if not, - # acknowledging limitations of en_core_web_sm. - uk_entity = next((ent for ent in metadata.entities if ent.text == "United Kingdom"), None) - if uk_entity: - self.assertIn(uk_entity.label, ["GPE", "LOC"], - f"United Kingdom found with text '{uk_entity.text}' but label '{uk_entity.label}' is not GPE or LOC.") - - def test_extract_date_entity(self): - metadata = self._create_metadata() - content = "The event is scheduled for July 4th, 2024." - self.analyzer.analyze(content, metadata) - - self.assertTrue(any(ent.text == "July 4th, 2024" and ent.label == "DATE" for ent in metadata.entities)) - - def test_multiple_entities(self): - metadata = self._create_metadata() - content = "Alice went to Paris with Bob on January 1st." - self.analyzer.analyze(content, metadata) - - entities_found = { (ent.text, ent.label) for ent in metadata.entities } - expected_entities = { - ("Alice", "PERSON"), - ("Paris", "GPE"), # spaCy often labels cities as GPE - ("Bob", "PERSON"), - ("January 1st", "DATE") - } - # Check if all expected entities are a subset of what was found - # This is more flexible than checking exact counts if spaCy finds more (e.g. 
"January 1st" as part of a larger date) - self.assertTrue(expected_entities.issubset(entities_found), f"Expected {expected_entities}, but found {entities_found}") - + @classmethod + def setUpClass(cls): + cls.recognizer = EntityRecognizer() + + def test_extract_person_org_gpe(self): + text = "Barack Obama was the president of the United States and worked with Microsoft." + entities = self.recognizer.analyze_text_for_entities(text) # Changed method call + labels = {e.label for e in entities} # entities is now List[ExtractedEntity], never None + self.assertIn("PERSON", labels) + self.assertIn("ORG", labels) + self.assertIn("GPE", labels) def test_no_entities(self): - metadata = self._create_metadata() - content = "This is a simple sentence without any special names." - self.analyzer.analyze(content, metadata) - self.assertEqual(len(metadata.entities), 0) + text = "This is a sentence without any named entities." + entities = self.recognizer.analyze_text_for_entities(text) # Changed method call + self.assertEqual(len(entities), 0) # analyze_text_for_entities returns [] for no entities - def test_empty_content(self): - metadata = self._create_metadata() - content = "" - self.analyzer.analyze(content, metadata) - self.assertEqual(len(metadata.entities), 0) - - def test_unicode_content(self): - metadata = self._create_metadata() - content = "これは日本語のテキストです。東京は日本の首都です。" # "This is Japanese text. Tokyo is the capital of Japan." - # Note: en_core_web_sm is primarily for English. For robust multilingual support, - # a multilingual model or language-specific models would be needed. - # This test primarily checks if it handles unicode without crashing. - # We don't expect accurate entity recognition for Japanese with an English model. - self.analyzer.analyze(content, metadata) - # We are not asserting specific entities here, just that it runs. - # Depending on the model, it might find "Tokyo" if it's in its English vocab. 
- self.assertTrue(isinstance(metadata.entities, list)) - - - def test_analyze_text_for_entities_john_doe_acme_new_york(self): - """Test analyze_text_for_entities with a sentence containing multiple entities.""" - text_to_analyze = "John Doe works at Acme Corp in New York." - entities: List[ExtractedEntity] = self.analyzer.analyze_text_for_entities(text_to_analyze) - - expected_entities_data = [ - {"text": "John Doe", "label": "PERSON", "start_char": 0, "end_char": 8}, - {"text": "Acme Corp", "label": "ORG", "start_char": 18, "end_char": 27}, - {"text": "New York", "label": "GPE", "start_char": 31, "end_char": 39}, - ] - - # Convert to a set of tuples for easier comparison if order doesn't matter - # or if spaCy might find them in a different order. - # For this specific case, order is likely preserved but comparing sets is robust. - - actual_entities_set = {(e.text, e.label, e.start_char, e.end_char) for e in entities} - expected_entities_set = {(d["text"], d["label"], d["start_char"], d["end_char"]) for d in expected_entities_data} - - self.assertEqual(actual_entities_set, expected_entities_set, - f"Expected entities {expected_entities_set} but got {actual_entities_set}") - - def test_analyze_text_for_entities_jane_smith_wikilink_alias(self): - """Test analyze_text_for_entities with a wikilink alias.""" - text_to_analyze = "Dr. Smith" # Simulating the text part of "[[Jane Smith|Dr. Smith]]" - entities: List[ExtractedEntity] = self.analyzer.analyze_text_for_entities(text_to_analyze) - - self.assertEqual(len(entities), 1, f"Expected 1 entity, got {len(entities)}") - entity = entities[0] - self.assertEqual(entity.text, "Smith") # spaCy model 'en_core_web_sm' extracts "Smith" - self.assertEqual(entity.label, "PERSON") - self.assertEqual(entity.start_char, 4) # "Smith" in "Dr. 
Smith" (D=0,r=1,.=2, =3,S=4) - self.assertEqual(entity.end_char, 9) # "Smith" - - def test_analyze_text_for_entities_simple_phrase_no_entities(self): - """Test analyze_text_for_entities with a phrase containing no entities.""" - text_to_analyze = "A simple phrase" - entities: List[ExtractedEntity] = self.analyzer.analyze_text_for_entities(text_to_analyze) - self.assertEqual(len(entities), 0) - - def test_analyze_text_for_entities_london_wikilink(self): - """Test analyze_text_for_entities with a GPE from a wikilink.""" - text_to_analyze = "London" # Simulating the text part of "[[London]]" - entities: List[ExtractedEntity] = self.analyzer.analyze_text_for_entities(text_to_analyze) - - self.assertEqual(len(entities), 1, f"Expected 1 entity, got {len(entities)}") - entity = entities[0] - self.assertEqual(entity.text, "London") - self.assertIn(entity.label, ["GPE", "LOC"], f"Expected GPE or LOC, got {entity.label}") # spaCy usually labels cities as GPE - self.assertEqual(entity.start_char, 0) - self.assertEqual(entity.end_char, 6) - - -if __name__ == '__main__': + def test_multiple_entities(self): + text = "Apple is looking at buying U.K. startup for $1 billion." + entities = self.recognizer.analyze_text_for_entities(text) # Changed method call + self.assertIsNotNone(entities) # Should be a list + self.assertGreaterEqual(len(entities), 2) + + def test_entity_fields(self): + text = "Google was founded in California." 
+ entities = self.recognizer.analyze_text_for_entities(text) # Changed method call + self.assertIsNotNone(entities) # Should be a list + for ent in entities: # ent is now ExtractedEntity + self.assertIsInstance(ent.text, str) + self.assertIsInstance(ent.label, str) + self.assertIsInstance(ent.start_char, int) + self.assertIsInstance(ent.end_char, int) + self.assertGreaterEqual(ent.end_char, ent.start_char) + + # def test_model_not_available(self): + # # The current EntityRecognizer constructor does not take model_name + # # and loads "en_core_web_sm" by default. If it fails, it raises an OSError. + # # This test needs to be re-evaluated or adapted if we want to test model loading failure. + # # For now, we assume the default model is available for other tests. + # # If we want to test this specific scenario, we'd need to mock spacy.load. + # # Let's skip this test for now as it's not compatible with the current EntityRecognizer. + # # Alternatively, we can try to catch the OSError if spacy.load is called with a bad model name. + # # The current EntityRecognizer loads the model in __init__. + # # with self.assertRaises(OSError): # Or potentially another specific spaCy error + # # EntityRecognizer(model_name="nonexistent_model_123") # This will fail if model_name is not a param + + # # Re-evaluating: The EntityRecognizer in analyzer.entity_recognizer.py does not accept model_name + # # It loads "en_core_web_sm" in its __init__. + # # This test as written is incompatible. + # # To test a model loading failure, one would typically mock spacy.load. + # # For now, I will comment out this test as it's not directly applicable to the refactored class. + # pass # Commenting out the original test logic. + +# def test_model_not_available(self): +# # This test was for the old EntityRecognizer that accepted a model_name. +# # The current one in entity_recognizer.py loads "en_core_web_sm" by default. +# # To test failure, one would mock spacy.load() within that class. 
+# # For now, this specific test case is not directly applicable. +# pass + + +if __name__ == "__main__": unittest.main() \ No newline at end of file diff --git a/tests/extractor/__init__.py b/tests/extractor/__init__.py deleted file mode 100644 index 0fd74b4..0000000 --- a/tests/extractor/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Extractor tests package.""" \ No newline at end of file diff --git a/tests/extractor/test_code_quote_extractor.py b/tests/extractor/test_code_quote_extractor.py deleted file mode 100644 index 7dc5544..0000000 --- a/tests/extractor/test_code_quote_extractor.py +++ /dev/null @@ -1,201 +0,0 @@ -"""Tests for the code block and blockquote extractor.""" - -import unittest -from pathlib import Path -from unittest.mock import Mock - -from knowledgebase_processor.models.content import Document -from knowledgebase_processor.models.markdown import CodeBlock, Blockquote -from knowledgebase_processor.extractor.code_quote import CodeQuoteExtractor - - -class TestCodeQuoteExtractor(unittest.TestCase): - """Test cases for the code block and blockquote extractor.""" - - def setUp(self): - """Set up the test environment.""" - self.extractor = CodeQuoteExtractor() - - def test_extract_empty_document(self): - """Test extracting from an empty document.""" - document = Document(path="test.md", content="", title="Test") - elements = self.extractor.extract(document) - self.assertEqual(len(elements), 0) - - def test_extract_code_block_without_language(self): - """Test extracting a code block without language specification.""" - content = "```\nprint('Hello, world!')\n```" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - self.assertEqual(len(elements), 1) - self.assertIsInstance(elements[0], CodeBlock) - self.assertIsNone(elements[0].language) - self.assertEqual(elements[0].code, "print('Hello, world!')") - - def test_extract_code_block_with_language(self): - """Test extracting a code block 
with language specification.""" - content = "```python\nprint('Hello, world!')\n```" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - self.assertEqual(len(elements), 1) - self.assertIsInstance(elements[0], CodeBlock) - self.assertEqual(elements[0].language, "python") - self.assertEqual(elements[0].code, "print('Hello, world!')") - - def test_extract_multiple_code_blocks(self): - """Test extracting multiple code blocks.""" - content = """ - # Code Examples - - Python example: - - ```python - def hello(): - print('Hello, world!') - ``` - - JavaScript example: - - ```javascript - function hello() { - console.log('Hello, world!'); - } - ``` - """ - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - self.assertEqual(len(elements), 2) - self.assertIsInstance(elements[0], CodeBlock) - self.assertIsInstance(elements[1], CodeBlock) - - self.assertEqual(elements[0].language, "python") - self.assertEqual(elements[0].code, "def hello():\n print('Hello, world!')") - - self.assertEqual(elements[1].language, "javascript") - self.assertEqual(elements[1].code, "function hello() {\n console.log('Hello, world!');\n}") - - def test_extract_simple_blockquote(self): - """Test extracting a simple blockquote.""" - content = "> This is a blockquote." 
- document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - self.assertEqual(len(elements), 1) - self.assertIsInstance(elements[0], Blockquote) - self.assertEqual(elements[0].level, 1) - self.assertEqual(elements[0].content, "This is a blockquote.") - - def test_extract_multiline_blockquote(self): - """Test extracting a multiline blockquote.""" - content = """> This is a blockquote -> with multiple lines -> spanning three lines.""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - self.assertEqual(len(elements), 1) - self.assertIsInstance(elements[0], Blockquote) - self.assertEqual(elements[0].level, 1) - self.assertEqual(elements[0].content, "This is a blockquote\nwith multiple lines\nspanning three lines.") - - def test_extract_nested_blockquotes(self): - """Test extracting nested blockquotes.""" - content = """> Level 1 blockquote ->> Level 2 blockquote ->>> Level 3 blockquote ->> Back to level 2 -> Back to level 1""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - self.assertEqual(len(elements), 5) - - # Check levels - levels = [e.level for e in elements] - self.assertEqual(levels, [1, 2, 3, 2, 1]) - - # Check content - self.assertEqual(elements[0].content, "Level 1 blockquote") - self.assertEqual(elements[1].content, "Level 2 blockquote") - self.assertEqual(elements[2].content, "Level 3 blockquote") - self.assertEqual(elements[3].content, "Back to level 2") - self.assertEqual(elements[4].content, "Back to level 1") - - def test_extract_mixed_content(self): - """Test extracting a mix of code blocks and blockquotes.""" - content = """ - # Mixed Content Example - - > This is a blockquote - - ```python - def hello(): - print('Hello, world!') - ``` - - > Another blockquote - >> With nesting - - ```javascript - console.log('Hello!'); - ``` - """ - document = 
Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # We should have 2 code blocks and 3 blockquotes - code_blocks = [e for e in elements if isinstance(e, CodeBlock)] - blockquotes = [e for e in elements if isinstance(e, Blockquote)] - - self.assertEqual(len(code_blocks), 2) - self.assertEqual(len(blockquotes), 3) - - # Check code blocks - self.assertEqual(code_blocks[0].language, "python") - self.assertEqual(code_blocks[1].language, "javascript") - - # Check blockquotes - self.assertEqual(blockquotes[0].level, 1) - self.assertEqual(blockquotes[1].level, 1) - self.assertEqual(blockquotes[2].level, 2) - - @unittest.skip("Processor now requires arguments") - def test_integration_with_processor(self): - """Test integration with the processor.""" - from knowledgebase_processor.processor.processor import Processor - - processor = Processor(Mock(), Mock()) - processor.register_extractor(self.extractor) - - content = """ - # Test Document - - > This is a blockquote - - ```python - def hello(): - print('Hello, world!') - ``` - - > Another blockquote - >> With nesting - """ - document = Document(path="test.md", content=content, title="Test") - processed_doc = processor.process_document(document) - - # Check that elements were added to the document - self.assertGreater(len(processed_doc.elements), 0) - - # Check that we have code blocks and blockquotes - code_blocks = [e for e in processed_doc.elements if isinstance(e, CodeBlock)] - blockquotes = [e for e in processed_doc.elements if isinstance(e, Blockquote)] - - self.assertEqual(len(code_blocks), 1) - self.assertEqual(len(blockquotes), 3) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/extractor/test_frontmatter_extractor.py b/tests/extractor/test_frontmatter_extractor.py deleted file mode 100644 index 566b495..0000000 --- a/tests/extractor/test_frontmatter_extractor.py +++ /dev/null @@ -1,142 +0,0 @@ -"""Tests for the 
FrontmatterExtractor.""" - -import unittest -from datetime import datetime - -from knowledgebase_processor.models.content import Document -from knowledgebase_processor.extractor.frontmatter import FrontmatterExtractor - - -class TestFrontmatterExtractor(unittest.TestCase): - """Test cases for the FrontmatterExtractor.""" - - def setUp(self): - """Set up the test environment.""" - self.extractor = FrontmatterExtractor() - - def test_extract_yaml_frontmatter(self): - """Test extracting YAML frontmatter.""" - content = """--- -title: Test Document -date: 2023-01-01 -tags: [tag1, tag2] ---- - -# Content here -""" - document = Document(path="test.md", content=content) - elements = self.extractor.extract(document) - - self.assertEqual(len(elements), 1) - self.assertEqual(elements[0].element_type, "frontmatter") - self.assertEqual(elements[0].content.strip(), "title: Test Document\ndate: 2023-01-01\ntags: [tag1, tag2]") - self.assertEqual(elements[0].metadata["format"], "yaml") - - def test_extract_toml_frontmatter(self): - """Test extracting TOML frontmatter.""" - content = """+++ -title = "Test Document" -date = 2023-01-01 -tags = ["tag1", "tag2"] -+++ - -# Content here -""" - document = Document(path="test.md", content=content) - elements = self.extractor.extract(document) - - self.assertEqual(len(elements), 1) - self.assertEqual(elements[0].element_type, "frontmatter") - self.assertEqual(elements[0].content.strip(), 'title = "Test Document"\ndate = 2023-01-01\ntags = ["tag1", "tag2"]') - self.assertEqual(elements[0].metadata["format"], "toml") - - def test_no_frontmatter(self): - """Test document with no frontmatter.""" - content = "# Content here\nNo frontmatter in this document." 
- document = Document(path="test.md", content=content) - elements = self.extractor.extract(document) - - self.assertEqual(len(elements), 0) - - def test_parse_yaml_frontmatter(self): - """Test parsing YAML frontmatter.""" - yaml_content = """ -title: Test Document -date: 2023-01-01 -tags: [tag1, tag2] -custom: value -""" - result = self.extractor.parse_frontmatter(yaml_content, "yaml") - - self.assertEqual(result["title"], "Test Document") - self.assertIsInstance(result["date"], str) - self.assertEqual(result["tags"], ["tag1", "tag2"]) - self.assertEqual(result["custom"], "value") - - def test_parse_toml_frontmatter(self): - """Test parsing TOML frontmatter.""" - toml_content = """ -title = "Test Document" -date = 2023-01-01 -tags = ["tag1", "tag2"] -custom = "value" -""" - result = self.extractor.parse_frontmatter(toml_content, "toml") - - self.assertEqual(result["title"], "Test Document") - self.assertIsInstance(result["date"], str) - self.assertEqual(result["tags"], ["tag1", "tag2"]) - self.assertEqual(result["custom"], "value") - - def test_create_frontmatter_model(self): - """Test creating a Frontmatter model from a dictionary.""" - frontmatter_dict = { - "title": "Test Document", - "date": "2023-01-01", - "tags": ["tag1", "tag2"], - "custom": "value" - } - - model = self.extractor.create_frontmatter_model(frontmatter_dict) - - self.assertEqual(model.title, "Test Document") - self.assertIsInstance(model.date, datetime) - self.assertEqual(model.date.year, 2023) - self.assertEqual(model.date.month, 1) - self.assertEqual(model.date.day, 1) - self.assertEqual(model.tags, ["tag1", "tag2"]) - self.assertEqual(model.custom_fields["custom"], "value") - - def test_extract_tags_from_frontmatter(self): - """Test extracting tags from frontmatter dictionary.""" - # Test list format - dict_with_list = {"tags": ["tag1", "tag2", "tag3"]} - tags = self.extractor._extract_tags_from_frontmatter(dict_with_list) - self.assertEqual(tags, ["tag1", "tag2", "tag3"]) - - # Test 
comma-separated string - dict_with_comma = {"tags": "tag1, tag2, tag3"} - tags = self.extractor._extract_tags_from_frontmatter(dict_with_comma) - self.assertEqual(tags, ["tag1", "tag2", "tag3"]) - - # Test space-separated string - dict_with_space = {"tags": "tag1 tag2 tag3"} - tags = self.extractor._extract_tags_from_frontmatter(dict_with_space) - self.assertEqual(tags, ["tag1", "tag2", "tag3"]) - - # Test categories - dict_with_categories = {"categories": ["cat1", "cat2"]} - tags = self.extractor._extract_tags_from_frontmatter(dict_with_categories) - self.assertEqual(tags, ["cat1", "cat2"]) - - # Test both tags and categories - dict_with_both = { - "tags": ["tag1", "tag2"], - "categories": ["cat1", "cat2"] - } - tags = self.extractor._extract_tags_from_frontmatter(dict_with_both) - self.assertEqual(set(tags), {"tag1", "tag2", "cat1", "cat2"}) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/extractor/test_heading_section_extractor.py b/tests/extractor/test_heading_section_extractor.py deleted file mode 100644 index f219a5b..0000000 --- a/tests/extractor/test_heading_section_extractor.py +++ /dev/null @@ -1,219 +0,0 @@ -"""Tests for the heading and section extractor.""" - -import unittest -from pathlib import Path -from unittest.mock import Mock - -from knowledgebase_processor.models.content import Document -from knowledgebase_processor.models.markdown import Heading, Section -from knowledgebase_processor.extractor.heading_section import HeadingSectionExtractor - - -class TestHeadingSectionExtractor(unittest.TestCase): - """Test cases for the heading and section extractor.""" - - def setUp(self): - """Set up the test environment.""" - self.extractor = HeadingSectionExtractor() - - def test_extract_empty_document(self): - """Test extracting from an empty document.""" - document = Document(path="test.md", content="", title="Test") - elements = self.extractor.extract(document) - self.assertEqual(len(elements), 0) - - def 
test_extract_single_heading(self): - """Test extracting a single heading.""" - content = "# Heading 1\n\nSome content." - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # We should have 1 heading and 1 section - headings = [e for e in elements if isinstance(e, Heading)] - sections = [e for e in elements if isinstance(e, Section)] - - self.assertEqual(len(headings), 1) - self.assertEqual(len(sections), 1) - self.assertEqual(headings[0].level, 1) - self.assertEqual(headings[0].text, "Heading 1") - self.assertEqual(sections[0].content, "Some content.") - self.assertEqual(sections[0].heading_id, headings[0].id) - self.assertEqual(sections[0].parent_id, headings[0].id) - - def test_extract_multiple_headings(self): - """Test extracting multiple headings.""" - content = """# Heading 1 - -Some content for heading 1. - -## Heading 2 - -Content for heading 2. - -### Heading 3 - -Content for heading 3. - -## Another Heading 2 - -Content for another heading 2. 
-""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # We should have 4 headings and 4 sections - headings = [e for e in elements if isinstance(e, Heading)] - sections = [e for e in elements if isinstance(e, Section)] - - self.assertEqual(len(headings), 4) - self.assertEqual(len(sections), 4) - - # Check heading levels - self.assertEqual(headings[0].level, 1) - self.assertEqual(headings[1].level, 2) - self.assertEqual(headings[2].level, 3) - self.assertEqual(headings[3].level, 2) - - # Check heading text - self.assertEqual(headings[0].text, "Heading 1") - self.assertEqual(headings[1].text, "Heading 2") - self.assertEqual(headings[2].text, "Heading 3") - self.assertEqual(headings[3].text, "Another Heading 2") - - # Check hierarchical relationships - self.assertIsNone(headings[0].parent_id) # H1 has no parent - self.assertEqual(headings[1].parent_id, headings[0].id) # H2 is child of H1 - self.assertEqual(headings[2].parent_id, headings[1].id) # H3 is child of H2 - self.assertEqual(headings[3].parent_id, headings[0].id) # Another H2 is child of H1 - - # Check sections are linked to their headings - for i, section in enumerate(sections): - self.assertEqual(section.heading_id, headings[i].id) - self.assertEqual(section.parent_id, headings[i].id) - - def test_extract_complex_hierarchy(self): - """Test extracting a complex heading hierarchy.""" - content = """# H1 -Content 1 - -## H2-A -Content 2A - -### H3-A -Content 3A - -#### H4 -Content 4 - -### H3-B -Content 3B - -## H2-B -Content 2B - -# Another H1 -Content for another H1 -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # We should have 7 headings and 7 sections - headings = [e for e in elements if isinstance(e, Heading)] - sections = [e for e in elements if isinstance(e, Section)] - - self.assertEqual(len(headings), 7) - self.assertEqual(len(sections), 7) - - # Check 
heading levels - heading_levels = [h.level for h in headings] - self.assertEqual(heading_levels, [1, 2, 3, 4, 3, 2, 1]) - - # Check parent-child relationships - # H1 (index 0) has no parent - self.assertIsNone(headings[0].parent_id) - - # H2-A (index 1) is child of first H1 - self.assertEqual(headings[1].parent_id, headings[0].id) - - # H3-A (index 2) is child of H2-A - self.assertEqual(headings[2].parent_id, headings[1].id) - - # H4 (index 3) is child of H3-A - self.assertEqual(headings[3].parent_id, headings[2].id) - - # H3-B (index 4) is child of H2-A - self.assertEqual(headings[4].parent_id, headings[1].id) - - # H2-B (index 5) is child of first H1 - self.assertEqual(headings[5].parent_id, headings[0].id) - - # Another H1 (index 6) has no parent - self.assertIsNone(headings[6].parent_id) - - def test_extract_non_sequential_headings(self): - """Test extracting headings that skip levels (e.g., H1 to H3).""" - content = """# H1 -Content 1 - -### H3 (skipping H2) -Content 3 - -##### H5 (skipping H4) -Content 5 -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # We should have 3 headings and 3 sections - headings = [e for e in elements if isinstance(e, Heading)] - - self.assertEqual(len(headings), 3) - - # Check heading levels - self.assertEqual(headings[0].level, 1) - self.assertEqual(headings[1].level, 3) - self.assertEqual(headings[2].level, 5) - - # Check parent-child relationships - self.assertIsNone(headings[0].parent_id) # H1 has no parent - self.assertEqual(headings[1].parent_id, headings[0].id) # H3 is child of H1 (skipping H2) - self.assertEqual(headings[2].parent_id, headings[1].id) # H5 is child of H3 (skipping H4) - - @unittest.skip("Processor now requires arguments") - def test_integration_with_processor(self): - """Test integration with the processor.""" - from knowledgebase_processor.processor.processor import Processor - - processor = Processor(Mock(), Mock()) - 
processor.register_extractor(self.extractor) - - content = """# Test Document - -## Section 1 - -Content for section 1. - -## Section 2 - -Content for section 2. - -### Subsection 2.1 - -Content for subsection 2.1. -""" - document = Document(path="test.md", content=content, title="Test") - processed_doc = processor.process_document(document) - - # Check that elements were added to the document - self.assertGreater(len(processed_doc.elements), 0) - - # Check that we have headings and sections - headings = [e for e in processed_doc.elements if isinstance(e, Heading)] - sections = [e for e in processed_doc.elements if isinstance(e, Section)] - - self.assertEqual(len(headings), 4) # H1, H2, H2, H3 - self.assertEqual(len(sections), 4) # One section for each heading - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/extractor/test_link_reference_extractor.py b/tests/extractor/test_link_reference_extractor.py deleted file mode 100644 index a352c9e..0000000 --- a/tests/extractor/test_link_reference_extractor.py +++ /dev/null @@ -1,188 +0,0 @@ -"""Tests for the link and reference extractor.""" - -import unittest -from pathlib import Path -from unittest.mock import Mock - -from knowledgebase_processor.models.content import Document -from knowledgebase_processor.models.links import Link, Reference, Citation -from knowledgebase_processor.extractor.link_reference import LinkReferenceExtractor - - -class TestLinkReferenceExtractor(unittest.TestCase): - """Test cases for the link and reference extractor.""" - - def setUp(self): - """Set up the test environment.""" - self.extractor = LinkReferenceExtractor() - - def test_extract_empty_document(self): - """Test extracting from an empty document.""" - document = Document(path="test.md", content="", title="Test") - elements = self.extractor.extract(document) - self.assertEqual(len(elements), 0) - - def test_extract_inline_links(self): - """Test extracting inline links.""" - content = """ 
-# Test Document - -This is a [link to Google](https://www.google.com) in a paragraph. -Here's [another link](https://example.com "Example Site") with a title. -And here's an [internal link](../path/to/file.md) to a local file. -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # Filter out only Link elements - links = [e for e in elements if isinstance(e, Link)] - - # We should have 3 links - self.assertEqual(len(links), 3) - - # Check the first link (Google) - self.assertEqual(links[0].text, "link to Google") - self.assertEqual(links[0].url, "https://www.google.com") - self.assertFalse(links[0].is_internal) - - # Check the second link (Example with title) - self.assertEqual(links[1].text, "another link") - self.assertEqual(links[1].url, "https://example.com") - self.assertFalse(links[1].is_internal) - - # Check the third link (internal) - self.assertEqual(links[2].text, "internal link") - self.assertEqual(links[2].url, "../path/to/file.md") - self.assertTrue(links[2].is_internal) - - def test_extract_reference_links(self): - """Test extracting reference-style links.""" - content = """ -# Reference Links - -This is a [reference link][ref1] in a paragraph. -Here's [another reference][ref2] in the same paragraph. - -You can also use [shorthand][] references. -Or even just use the [text itself]. 
- -[ref1]: https://www.example.com -[ref2]: https://www.example.org "Example.org" -[shorthand]: https://shorthand.example.com -[text itself]: https://text.example.com -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # Filter out Link and Reference elements - links = [e for e in elements if isinstance(e, Link)] - references = [e for e in elements if isinstance(e, Reference)] - - # We should have links and references - self.assertGreater(len(links), 0) - self.assertEqual(len(references), 4) - - # Check reference definitions - ref_keys = [r.key for r in references] - self.assertIn("ref1", ref_keys) - self.assertIn("ref2", ref_keys) - self.assertIn("shorthand", ref_keys) - self.assertIn("text itself", ref_keys) - - # Find ref2 and check its title - ref2 = next(r for r in references if r.key == "ref2") - self.assertEqual(ref2.url, "https://www.example.org") - self.assertEqual(ref2.title, "Example.org") - - # Check that links have correct URLs from references - ref1_link = next(l for l in links if l.text == "reference link") - self.assertEqual(ref1_link.url, "https://www.example.com") - - # Check reference links - ref1_link = next(l for l in links if l.text == "reference link") - self.assertEqual(ref1_link.url, "https://www.example.com") - - ref2_link = next(l for l in links if l.text == "another reference") - self.assertEqual(ref2_link.url, "https://www.example.org") - - def test_extract_citations(self): - """Test extracting citations.""" - content = """ -# Citations - -According to (Smith, 2020), this is an important finding. -Another study [@johnson2019] showed similar results. -Multiple citations (Smith, 2020; Johnson, 2019) support this claim. 
-""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # Filter out Citation elements - citations = [e for e in elements if isinstance(e, Citation)] - - # We should have 3 citations - self.assertEqual(len(citations), 3) - - # Check citation texts - citation_texts = [c.text for c in citations] - self.assertIn("Smith, 2020", citation_texts) - self.assertIn("johnson2019", citation_texts) - self.assertIn("Smith, 2020; Johnson, 2019", citation_texts) - - def test_mixed_link_types(self): - """Test extracting a mix of link types in the same document.""" - content = """ -# Mixed Link Types - -This document has [inline links](https://example.com) and [reference links][ref1]. -It also has citations (Author, 2023) and [@citation-key]. - -[ref1]: https://reference.example.com "Reference Example" -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # Count each type of element - links = [e for e in elements if isinstance(e, Link)] - references = [e for e in elements if isinstance(e, Reference)] - citations = [e for e in elements if isinstance(e, Citation)] - - self.assertGreater(len(links), 0) # At least one link - self.assertEqual(len(references), 1) - self.assertEqual(len(citations), 2) - - # Check that we have links - self.assertGreater(len(links), 0) - - @unittest.skip("Processor now requires arguments") - def test_integration_with_processor(self): - """Test integration with the processor.""" - from knowledgebase_processor.processor.processor import Processor - - processor = Processor(Mock(), Mock()) - processor.register_extractor(self.extractor) - - content = """ -# Test Document - -This is a [link to Google](https://www.google.com) in a paragraph. -Here's a [reference link][ref1] in the same paragraph. 
- -[ref1]: https://www.example.com "Example Site" -""" - document = Document(path="test.md", content=content, title="Test") - processed_doc = processor.process_document(document) - - # Check that elements were added to the document - self.assertGreater(len(processed_doc.elements), 0) - - # Check that we have links and references - links = [e for e in processed_doc.elements if isinstance(e, Link)] - references = [e for e in processed_doc.elements if isinstance(e, Reference)] - - self.assertGreater(len(links), 0) # At least one link - self.assertEqual(len(references), 1) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/extractor/test_list_table_extractor.py b/tests/extractor/test_list_table_extractor.py deleted file mode 100644 index 6f38d58..0000000 --- a/tests/extractor/test_list_table_extractor.py +++ /dev/null @@ -1,167 +0,0 @@ -"""Tests for the list and table extractor.""" - -import unittest -from pathlib import Path - -from knowledgebase_processor.models.content import Document -from knowledgebase_processor.extractor.list_table import ListTableExtractor - - -class TestListTableExtractor(unittest.TestCase): - """Test cases for the list and table extractor.""" - - def setUp(self): - """Set up the test environment.""" - self.extractor = ListTableExtractor() - - def test_extract_unordered_list(self): - """Test extracting an unordered list.""" - content = """ -# Test Document - -This is a test document with an unordered list: - -- Item 1 -- Item 2 -- Item 3 -""" - document = Document(path="test.md", content=content) - elements = self.extractor.extract(document) - - # Check that we extracted a list - self.assertEqual(len(elements), 1) - self.assertEqual(elements[0].element_type, "list") - self.assertFalse(elements[0].ordered) - - # Check the list items - self.assertEqual(len(elements[0].items), 3) - self.assertEqual(elements[0].items[0].text, "Item 1") - self.assertEqual(elements[0].items[1].text, "Item 2") - 
self.assertEqual(elements[0].items[2].text, "Item 3") - - def test_extract_ordered_list(self): - """Test extracting an ordered list.""" - content = """ -# Test Document - -This is a test document with an ordered list: - -1. First item -2. Second item -3. Third item -""" - document = Document(path="test.md", content=content) - elements = self.extractor.extract(document) - - # Check that we extracted a list - self.assertEqual(len(elements), 1) - self.assertEqual(elements[0].element_type, "list") - self.assertTrue(elements[0].ordered) - - # Check the list items - self.assertEqual(len(elements[0].items), 3) - self.assertEqual(elements[0].items[0].text, "First item") - self.assertEqual(elements[0].items[1].text, "Second item") - self.assertEqual(elements[0].items[2].text, "Third item") - - def test_extract_nested_list(self): - """Test extracting a nested list.""" - content = """ -# Test Document - -This is a test document with a nested list: - -- Item 1 - - Nested item 1.1 - - Nested item 1.2 -- Item 2 - 1. Nested ordered item 2.1 - 2. 
Nested ordered item 2.2 -- Item 3 -""" - document = Document(path="test.md", content=content) - elements = self.extractor.extract(document) - - # We should have extracted at least one list - lists = [e for e in elements if e.element_type == "list"] - self.assertGreater(len(lists), 0, "No lists found") - - # Check that we have at least one list with "Item 1" in it - has_item1 = False - for l in lists: - for item in l.items: - if "Item 1" in item.text: - has_item1 = True - break - if has_item1: - break - - self.assertTrue(has_item1, "No list with 'Item 1' found") - - # Check that we have at least one list with a nested item - has_nested_item = False - for l in lists: - for item in l.items: - if "Nested" in item.text: - has_nested_item = True - break - if has_nested_item: - break - - self.assertTrue(has_nested_item, "No list with nested items found") - - def test_extract_table(self): - """Test extracting a table.""" - content = """ -# Test Document - -This is a test document with a table: - -| Header 1 | Header 2 | Header 3 | -|----------|----------|----------| -| Cell 1,1 | Cell 1,2 | Cell 1,3 | -| Cell 2,1 | Cell 2,2 | Cell 2,3 | -""" - document = Document(path="test.md", content=content) - elements = self.extractor.extract(document) - - # Check that we extracted at least one element - self.assertGreater(len(elements), 0, "No elements extracted") - - # Find the table element - tables = [e for e in elements if e.element_type == "table"] - self.assertGreater(len(tables), 0, "No table elements found") - - table = tables[0] - - # Check the table headers - self.assertEqual(len(table.headers), 3, "Expected 3 headers") - self.assertIn("Header 1", table.headers) - self.assertIn("Header 2", table.headers) - self.assertIn("Header 3", table.headers) - - # Check the table rows - self.assertEqual(len(table.rows), 2, "Expected 2 rows") - - # Check for expected cell content - cell_values = [] - for row in table.rows: - cell_values.extend(row) - - self.assertIn("Cell 1,1", 
cell_values) - self.assertIn("Cell 1,2", cell_values) - self.assertIn("Cell 1,3", cell_values) - self.assertIn("Cell 2,1", cell_values) - self.assertIn("Cell 2,2", cell_values) - self.assertIn("Cell 2,3", cell_values) - - # Check the table cells - header_cells = [c for c in table.cells if c.is_header] - data_cells = [c for c in table.cells if not c.is_header] - - self.assertEqual(len(header_cells), 3, "Expected 3 header cells") - self.assertEqual(len(data_cells), 6, "Expected 6 data cells") - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/extractor/test_markdown_extractor.py b/tests/extractor/test_markdown_extractor.py deleted file mode 100644 index 410139a..0000000 --- a/tests/extractor/test_markdown_extractor.py +++ /dev/null @@ -1,141 +0,0 @@ -"""Tests for the markdown extractor.""" - -import unittest -from pathlib import Path -from unittest.mock import Mock - -from knowledgebase_processor.models.content import Document -from knowledgebase_processor.models.markdown import ( - Heading, Section, MarkdownList, ListItem, TodoItem, Table, CodeBlock, Blockquote -) -from knowledgebase_processor.extractor.markdown import MarkdownExtractor - - -class TestMarkdownExtractor(unittest.TestCase): - """Test cases for the markdown extractor.""" - - def setUp(self): - """Set up the test environment.""" - self.extractor = MarkdownExtractor() - - def test_extract_empty_document(self): - """Test extracting from an empty document.""" - document = Document(path="test.md", content="", title="Test") - elements = self.extractor.extract(document) - self.assertEqual(len(elements), 0) - - def test_extract_headings(self): - """Test extracting headings.""" - content = """# Heading 1 - -## Heading 2 - -### Heading 3 -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # We should have 3 headings and 3 sections - headings = [e for e in elements if e.element_type == "heading"] 
- sections = [e for e in elements if e.element_type == "section"] - - self.assertEqual(len(headings), 3) - self.assertEqual(len(sections), 3) - - def test_extract_lists_and_todos(self): - """Test extracting lists and todo items.""" - content = """ -- Item 1 -- [ ] Todo item 1 -- [x] Completed todo item -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # We should have lists and todo items - lists = [e for e in elements if e.element_type == "list"] - list_items = [e for e in elements if e.element_type == "list_item"] - todo_items = [e for e in elements if e.element_type == "todo_item"] - - self.assertGreaterEqual(len(lists), 1) - self.assertGreaterEqual(len(list_items) + len(todo_items), 3) - - def test_extract_code_blocks(self): - """Test extracting code blocks.""" - content = """ -```python -def hello_world(): - print("Hello, world!") -``` -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # We should have a code block - code_blocks = [e for e in elements if e.element_type == "code_block"] - self.assertEqual(len(code_blocks), 1) - self.assertEqual(code_blocks[0].language, "python") - - @unittest.skip("Table parsing not yet fully implemented") - def test_extract_tables(self): - """Test extracting tables.""" - content = """ -Header 1 | Header 2 --------- | -------- -Cell 1 | Cell 2 -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # We should have a table - tables = [e for e in elements if e.element_type == "table"] - self.assertEqual(len(tables), 1) - - def test_extract_blockquotes(self): - """Test extracting blockquotes.""" - content = """ -> This is a blockquote -> With multiple lines -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # We should have a blockquote - 
blockquotes = [e for e in elements if e.element_type == "blockquote"] - self.assertGreaterEqual(len(blockquotes), 1) - - @unittest.skip("Processor now requires arguments") - def test_integration_with_processor(self): - """Test integration with the processor.""" - from knowledgebase_processor.processor.processor import Processor - - processor = Processor(Mock(), Mock()) - processor.register_extractor(self.extractor) - - content = """# Test Document - -## Section 1 - -- List item 1 -- [ ] Todo item - -## Section 2 - -```python -def test(): - pass -``` -""" - document = Document(path="test.md", content=content, title="Test") - processed_doc = processor.process_document(document) - - # Check that elements were added to the document - self.assertGreater(len(processed_doc.elements), 0) - - # Check that we have different element types - element_types = set(e.element_type for e in processed_doc.elements) - self.assertGreaterEqual(len(element_types), 4) # At least heading, section, list, code_block - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/extractor/test_tag_extractor.py b/tests/extractor/test_tag_extractor.py deleted file mode 100644 index 9711134..0000000 --- a/tests/extractor/test_tag_extractor.py +++ /dev/null @@ -1,289 +0,0 @@ -"""Tests for the TagExtractor.""" - -import unittest -from unittest.mock import patch, MagicMock - -from knowledgebase_processor.models.content import Document, ContentElement -from knowledgebase_processor.models.metadata import Tag -from knowledgebase_processor.extractor.tags import TagExtractor - - -class TestTagExtractor(unittest.TestCase): - """Test cases for the TagExtractor.""" - - def setUp(self): - """Set up the test environment.""" - self.extractor = TagExtractor() - - def test_extract_hashtags(self): - """Test extracting hashtags.""" - content = """ -# Document Title - -This is a document with #tag1 and #tag2 hashtags. 
-""" - document = Document(path="test.md", content=content) - elements = self.extractor.extract(document) - - # Filter only hashtag elements - hashtags = [e for e in elements if e.element_type == "tag" and "source" not in e.position] - - self.assertEqual(len(hashtags), 2) - self.assertEqual(hashtags[0].content, "tag1") - self.assertEqual(hashtags[1].content, "tag2") - - def test_extract_inline_tags(self): - """Test that inline tags in square brackets are NOT extracted.""" - content = """ -# Document Title - -This is a document with [tag1] and [tag2] inline tags. -""" - document = Document(path="test.md", content=content) - elements = self.extractor.extract(document) - - # Filter only tag elements - inline_tags = [e for e in elements if e.element_type == "tag" and "source" not in e.position] - - self.assertEqual(len(inline_tags), 0) - - def test_extract_category_tags(self): - """Test extracting category tags.""" - content = """ -# Document Title - -This is a document with @category1/tag1 and @category2/tag2 category tags. 
-""" - document = Document(path="test.md", content=content) - elements = self.extractor.extract(document) - - # Filter only category tag elements - category_tags = [e for e in elements if e.element_type == "tag" and e.metadata.get("category")] - - self.assertEqual(len(category_tags), 2) - self.assertEqual(category_tags[0].content, "tag1") - self.assertEqual(category_tags[0].metadata["category"], "category1") - self.assertEqual(category_tags[1].content, "tag2") - self.assertEqual(category_tags[1].metadata["category"], "category2") - - def test_extract_frontmatter_tags(self): - """Test extracting tags from frontmatter.""" - # Create a mock frontmatter element - mock_frontmatter_element = ContentElement( - element_type="frontmatter", - content="tags: [tag1, tag2]", - position={"start": 0, "end": 20}, - metadata={"format": "yaml"} - ) - - # Mock the frontmatter extractor to return our mock element - with patch.object(self.extractor.frontmatter_extractor, 'extract') as mock_extract: - mock_extract.return_value = [mock_frontmatter_element] - - # Mock the parse_frontmatter method - with patch.object(self.extractor.frontmatter_extractor, 'parse_frontmatter') as mock_parse: - mock_parse.return_value = {"tags": ["tag1", "tag2"]} - - document = Document(path="test.md", content="---\ntags: [tag1, tag2]\n---\n") - elements = self.extractor.extract(document) - - # Filter only frontmatter tag elements - frontmatter_tags = [e for e in elements if e.element_type == "tag" and e.metadata.get("source") == "frontmatter"] - - self.assertEqual(len(frontmatter_tags), 2) - self.assertEqual(frontmatter_tags[0].content, "tag1") - self.assertEqual(frontmatter_tags[1].content, "tag2") - - def test_extract_mixed_tags(self): - """Test extracting mixed tag types.""" - content = """--- -title: Test Document -tags: [fm1, fm2] ---- - -# Document Title - -This is a document with #hashtag1 and [inline1] tags. -It also has @category/categorized tags. 
-""" - document = Document(path="test.md", content=content) - - # Mock the frontmatter extraction - with patch.object(self.extractor.frontmatter_extractor, 'extract') as mock_extract: - mock_frontmatter_element = ContentElement( - element_type="frontmatter", - content="title: Test Document\ntags: [fm1, fm2]", - position={"start": 0, "end": 50}, - metadata={"format": "yaml"} - ) - mock_extract.return_value = [mock_frontmatter_element] - - # Mock the parse_frontmatter method - with patch.object(self.extractor.frontmatter_extractor, 'parse_frontmatter') as mock_parse: - mock_parse.return_value = {"title": "Test Document", "tags": ["fm1", "fm2"]} - - elements = self.extractor.extract(document) - - # We should have 4 tags: 2 from frontmatter, 1 hashtag, 1 category (no inline tag) - self.assertEqual(len([e for e in elements if e.element_type == "tag"]), 4) - - def test_get_all_tags(self): - """Test getting all unique tags from a document.""" - content = """--- -title: Test Document -tags: [tag1, tag2] ---- - -# Document Title - -This is a document with #tag1 and [tag3] tags. -It also has @category/tag4 tags. 
-""" - document = Document(path="test.md", content=content) - - # Mock the frontmatter extraction - with patch.object(self.extractor.frontmatter_extractor, 'extract') as mock_extract: - mock_frontmatter_element = ContentElement( - element_type="frontmatter", - content="title: Test Document\ntags: [tag1, tag2]", - position={"start": 0, "end": 50}, - metadata={"format": "yaml"} - ) - mock_extract.return_value = [mock_frontmatter_element] - - # Mock the parse_frontmatter method - with patch.object(self.extractor.frontmatter_extractor, 'parse_frontmatter') as mock_parse: - mock_parse.return_value = {"title": "Test Document", "tags": ["tag1", "tag2"]} - - # Mock the extract method to return predefined elements - with patch.object(self.extractor, 'extract') as mock_extract_tags: - mock_extract_tags.return_value = [ - ContentElement( - element_type="tag", - content="tag1", - position={"start": 0, "end": 10}, - metadata={"source": "frontmatter"} - ), - ContentElement( - element_type="tag", - content="tag2", - position={"start": 0, "end": 10}, - metadata={"source": "frontmatter"} - ), - ContentElement( - element_type="tag", - content="tag1", - position={"start": 100, "end": 110} - ), - ContentElement( - element_type="tag", - content="tag3", - position={"start": 120, "end": 130} - ), - ContentElement( - element_type="tag", - content="tag4", - position={"start": 140, "end": 150}, - metadata={"category": "category"} - ) - ] - - tags = self.extractor.get_all_tags(document) - - # We should have 4 unique tags - self.assertEqual(len(tags), 4) - - # Convert to list for easier assertion - tag_list = list(tags) - - # Check that we have the expected tags - tag_names = {tag.name for tag in tag_list} - self.assertEqual(tag_names, {"tag1", "tag2", "tag3", "tag4"}) - - # Check that tag4 has the correct category - tag4 = next(tag for tag in tag_list if tag.name == "tag4") - self.assertEqual(tag4.category, "category") - -def test_bracketed_text_not_tag(self): - """Test that text like ' 
[Conversion] ' does NOT produce a tag.""" - content = " [Conversion] " - document = Document(path="test.md", content=content) - elements = self.extractor.extract(document) - # Filter only tag elements - tags = [e for e in elements if e.element_type == "tag" and "source" not in e.position] - # --- Step 1: Assert current (buggy) behavior: this will pass if bug exists --- - # self.assertEqual(len(tags), 1) - # self.assertEqual(tags[0].content, "Conversion") - # --- Step 2: Assert correct behavior: should NOT extract any tags --- - self.assertEqual(len(tags), 0) - -def test_hashtag_preceded_by_non_whitespace(self): - """Hashtags should only be extracted if preceded by whitespace or start-of-line.""" - content = ( - "#tag1 word #tag2 word#notatag\n" - "another#notatag2\n" - " #tag3\n" - "word\t#tag4" - ) - document = Document(path="test.md", content=content) - elements = self.extractor.extract(document) - tags = [e.content for e in elements if e.element_type == "tag" and "source" not in e.position] - self.assertIn("tag1", tags) - self.assertIn("tag2", tags) - self.assertIn("tag3", tags) - self.assertIn("tag4", tags) - self.assertNotIn("notatag", tags) - self.assertNotIn("notatag2", tags) - def test_fixture_tag_cases(self): - """Test tag extraction against the markdown-to-html fixture cases.""" - cases = [ - # (input, expected_tags) - ("#hashtag", ["hashtag"]), - ("#HasHTAg", ["HasHTAg"]), - ("#hash #tag", ["hash", "tag"]), - ("multiple #hash #tags in one #line", ["hash", "tags", "line"]), - ("preceeding #space", ["space"]), - ("missing preceeding#space", []), - ("#test#tag", ["test"]), - ("#t-a_g", ["t"]), - ("#äöüß", []), - ("#<3", []), - ("#<3 and other #hashtags", ["hashtags"]), - ("#0", ["0"]), - ("there is no # hashtag", []), - ("still no #", []), - ("##notag", []), - ("hashtag #hashtag", ["hashtag"]), - ("not a#hashtag #hashtag", ["hashtag"]), - ("#tag1\n#tag2\n#tag3", ["tag1", "tag2", "tag3"]), - ] - extractor = TagExtractor() - for text, expected in cases: - doc 
= Document(path="test.md", content=text) - elements = extractor.extract(doc) - tags = [e.content for e in elements if e.element_type == "tag" and "source" not in e.position] - self.assertEqual(tags, expected, f"Failed for input: {text}") - - def test_fixture_ignores_code_and_links(self): - """Test that hashtags in code, code blocks, links, images, and titles are ignored.""" - extractor = TagExtractor() - code_cases = [ - ("`don't render #hashtags in inline code`", []), - ("```\ndon't render #hashtags in code blocks\n```", []), - ("[#hashtag](http://awe.so.me)", []), - ("[there is a #hashtag](http://awe.so.me)", []), - ("[link](http://awe.so.me \"#title\")", []), - ("[link](http://awe.so.me \"there is a #title\")", []), - ("![a #hashtag](http://awe.so.me/image.gif)", []), - ("![image](http://awe.so.me/image.gif \"a #title\")", []), - ("# hashtag", ["hashtag"]), # heading with space is a tag - ("

hashtag

", []), - ("
#
", []), - ] - for text, expected in code_cases: - doc = Document(path="test.md", content=text) - elements = extractor.extract(doc) - tags = [e.content for e in elements if e.element_type == "tag" and "source" not in e.position] - self.assertEqual(tags, expected, f"Failed for input: {text}") -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/extractor/test_todo_item_extractor.py b/tests/extractor/test_todo_item_extractor.py deleted file mode 100644 index 3469067..0000000 --- a/tests/extractor/test_todo_item_extractor.py +++ /dev/null @@ -1,206 +0,0 @@ -"""Tests for the todo item extractor.""" - -import unittest -from pathlib import Path -from unittest.mock import Mock - -from knowledgebase_processor.models.content import Document -from knowledgebase_processor.models.markdown import TodoItem -from knowledgebase_processor.extractor.todo_item import TodoItemExtractor - - -class TestTodoItemExtractor(unittest.TestCase): - """Test cases for the todo item extractor.""" - - def setUp(self): - """Set up the test environment.""" - self.extractor = TodoItemExtractor() - - def test_extract_empty_document(self): - """Test extracting from an empty document.""" - document = Document(path="test.md", content="", title="Test") - elements = self.extractor.extract(document) - self.assertEqual(len(elements), 0) - - def test_extract_no_todos(self): - """Test extracting from a document with no todo items.""" - content = """# Test Document - -This is a test document with no todo items. 
- -- Regular list item -- Another regular item -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - self.assertEqual(len(elements), 0) - - def test_extract_unchecked_todos(self): - """Test extracting unchecked todo items.""" - content = """# Test Document - -- [ ] Todo item 1 -- [ ] Todo item 2 -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # We should have 2 todo items - self.assertEqual(len(elements), 2) - - # All should be unchecked - for element in elements: - self.assertIsInstance(element, TodoItem) - self.assertEqual(element.element_type, "todo_item") - self.assertFalse(element.is_checked) - - def test_extract_checked_todos(self): - """Test extracting checked todo items.""" - content = """# Test Document - -- [x] Completed todo item 1 -- [X] Completed todo item 2 -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # We should have 2 todo items - self.assertEqual(len(elements), 2) - - # All should be checked - for element in elements: - self.assertIsInstance(element, TodoItem) - self.assertEqual(element.element_type, "todo_item") - self.assertTrue(element.is_checked) - - def test_extract_mixed_todos(self): - """Test extracting a mix of checked and unchecked todo items.""" - content = """# Test Document - -- [ ] Todo item 1 -- [x] Completed todo item -- [ ] Todo item 2 -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # We should have 3 todo items - self.assertEqual(len(elements), 3) - - # Check the completion status - self.assertFalse(elements[0].is_checked) - self.assertTrue(elements[1].is_checked) - self.assertFalse(elements[2].is_checked) - - def test_extract_todo_text(self): - """Test extracting the text content of todo items.""" - content = """# Test Document - -- [ ] 
Buy milk -- [x] Write code -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # Check the text content - self.assertEqual(elements[0].text, "Buy milk") - self.assertEqual(elements[1].text, "Write code") - - def test_extract_todo_with_context(self): - """Test extracting todo items with their context.""" - # Create a document with a section - document = Document( - path="test.md", - content="# Shopping List\n\n- [ ] Buy milk\n- [x] Buy eggs", - title="Test" - ) - - # Add a section element to the document - from knowledgebase_processor.models.markdown import Section - import uuid - - section_id = str(uuid.uuid4()) - section = Section( - id=section_id, - element_type="section", - content="# Shopping List\n\n- [ ] Buy milk\n- [x] Buy eggs", - position={"start": 0, "end": 4}, - heading_id=None - ) - document.elements.append(section) - - # Extract todo items - elements = self.extractor.extract(document) - - # Check that the todo items have the section as parent - self.assertEqual(len(elements), 2) - self.assertEqual(elements[0].parent_id, section_id) - self.assertEqual(elements[1].parent_id, section_id) - - def test_extract_todos_with_leading_whitespace(self): - """Test extracting todo items with leading whitespace.""" - content = """# Test Document - - - [ ] Single space indent - - [x] Two space indent - - [ ] Four space indent - - [x] Tab indent -- [ ] No indent -""" - document = Document(path="test.md", content=content, title="Test") - elements = self.extractor.extract(document) - - # We should have 5 todo items - self.assertEqual(len(elements), 5) - - # Check all are TodoItem instances - for element in elements: - self.assertIsInstance(element, TodoItem) - self.assertEqual(element.element_type, "todo_item") - - # Check specific items - self.assertEqual(elements[0].text, "Single space indent") - self.assertFalse(elements[0].is_checked) - - self.assertEqual(elements[1].text, "Two space indent") - 
self.assertTrue(elements[1].is_checked) - - self.assertEqual(elements[2].text, "Four space indent") - self.assertFalse(elements[2].is_checked) - - self.assertEqual(elements[3].text, "Tab indent") - self.assertTrue(elements[3].is_checked) - - self.assertEqual(elements[4].text, "No indent") - self.assertFalse(elements[4].is_checked) - - @unittest.skip("Processor now requires arguments") - def test_integration_with_processor(self): - """Test integration with the processor.""" - from knowledgebase_processor.processor.processor import Processor - - processor = Processor(Mock(), Mock()) - processor.register_extractor(self.extractor) - - content = """# Test Document - -## Tasks -- [ ] Task 1 -- [x] Task 2 -- [ ] Task 3 -""" - document = Document(path="test.md", content=content, title="Test") - processed_doc = processor.process_document(document) - - # Check that todo items were added to the document - todo_items = [e for e in processed_doc.elements if e.element_type == "todo_item"] - self.assertEqual(len(todo_items), 3) - - # Check completion status - self.assertFalse(todo_items[0].is_checked) - self.assertTrue(todo_items[1].is_checked) - self.assertFalse(todo_items[2].is_checked) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/extractor/test_wikilink_extractor.py b/tests/extractor/test_wikilink_extractor.py deleted file mode 100644 index e67a1d3..0000000 --- a/tests/extractor/test_wikilink_extractor.py +++ /dev/null @@ -1,82 +0,0 @@ -import unittest -from unittest.mock import Mock -from knowledgebase_processor.extractor.wikilink_extractor import WikiLinkExtractor -from knowledgebase_processor.models.content import Document - -class TestWikiLinkExtractor(unittest.TestCase): - def setUp(self): - # Create mocks for document_registry and id_generator - self.mock_registry = Mock() - self.mock_registry.find_document_by_path.return_value = None - - self.mock_id_generator = Mock() - 
self.mock_id_generator.generate_wikilink_id.return_value = "wikilink_123" - - self.extractor = WikiLinkExtractor(self.mock_registry, self.mock_id_generator) - - def test_basic_wikilink(self): - doc = Document(path="test.md", content="This is a link to [[Page One]].") - result = self.extractor.extract(doc, "doc_id") - self.assertEqual(len(result), 1) - self.assertEqual(result[0].target_path, "Page One") - self.assertEqual(result[0].label, "Page One") - self.assertIsNone(result[0].alias) - - def test_wikilink_with_display_text(self): - doc = Document(path="test.md", content="See [[Page Two|Custom Text]].") - result = self.extractor.extract(doc, "doc_id") - self.assertEqual(len(result), 1) - self.assertEqual(result[0].target_path, "Page Two") - self.assertEqual(result[0].alias, "Custom Text") - self.assertEqual(result[0].label, "Custom Text") - - def test_multiple_wikilinks(self): - doc = Document(path="test.md", content="[[A]] and [[B|Bee]] are both links.") - result = self.extractor.extract(doc, "doc_id") - self.assertEqual(len(result), 2) - self.assertEqual(result[0].target_path, "A") - self.assertEqual(result[0].label, "A") - self.assertEqual(result[1].target_path, "B") - self.assertEqual(result[1].alias, "Bee") - self.assertEqual(result[1].label, "Bee") - - def test_wikilink_at_line_edges(self): - doc = Document(path="test.md", content="[[Start]] middle [[End|Finish]]") - result = self.extractor.extract(doc, "doc_id") - self.assertEqual(len(result), 2) - self.assertEqual(result[0].target_path, "Start") - self.assertEqual(result[1].target_path, "End") - self.assertEqual(result[1].alias, "Finish") - - def test_no_wikilinks(self): - doc = Document(path="test.md", content="No links here.") - result = self.extractor.extract(doc, "doc_id") - self.assertEqual(result, []) - - def test_nested_or_broken_wikilinks(self): - doc = Document(path="test.md", content="[[Not closed or [[Nested|Display]]]]") - result = self.extractor.extract(doc, "doc_id") - # Should extract only 
the valid [[Nested|Display]] - self.assertEqual(len(result), 1) - self.assertEqual(result[0].target_path, "Nested") - self.assertEqual(result[0].alias, "Display") - - def test_original_text_preservation(self): - doc = Document(path="test.md", content="Link: [[Some Page|Custom Display]]") - result = self.extractor.extract(doc, "doc_id") - self.assertEqual(len(result), 1) - self.assertEqual(result[0].original_text, "[[Some Page|Custom Display]]") - - def test_document_resolution(self): - # Mock a resolved document - mock_resolved_doc = Mock() - mock_resolved_doc.kb_id = "doc_456" - self.mock_registry.find_document_by_path.return_value = mock_resolved_doc - - doc = Document(path="test.md", content="[[Existing Page]]") - result = self.extractor.extract(doc, "doc_id") - self.assertEqual(len(result), 1) - self.assertEqual(result[0].resolved_document_uri, "doc_456") - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tests/fixtures/link_01_inline_links/expected.ttl b/tests/fixtures/link_01_inline_links/expected.ttl new file mode 100644 index 0000000..7d1f0d6 --- /dev/null +++ b/tests/fixtures/link_01_inline_links/expected.ttl @@ -0,0 +1,18 @@ +@prefix kb: . +@prefix rdf: . +@prefix rdfs: . + + a kb:Link ; + kb:text "link to Google" ; + kb:url "https://www.google.com" ; + kb:isInternal false . + + a kb:Link ; + kb:text "another link" ; + kb:url "https://example.com" ; + kb:isInternal false . + + a kb:Link ; + kb:text "internal link" ; + kb:url "../path/to/file.md" ; + kb:isInternal true . \ No newline at end of file diff --git a/tests/fixtures/link_01_inline_links/input.md b/tests/fixtures/link_01_inline_links/input.md new file mode 100644 index 0000000..8d23200 --- /dev/null +++ b/tests/fixtures/link_01_inline_links/input.md @@ -0,0 +1,5 @@ +# Test Document + +This is a [link to Google](https://www.google.com) in a paragraph. +Here's [another link](https://example.com "Example Site") with a title. 
+And here's an [internal link](../path/to/file.md) to a local file. \ No newline at end of file diff --git a/tests/fixtures/link_02_reference_links/expected.ttl b/tests/fixtures/link_02_reference_links/expected.ttl new file mode 100644 index 0000000..db322d6 --- /dev/null +++ b/tests/fixtures/link_02_reference_links/expected.ttl @@ -0,0 +1,40 @@ +@prefix kb: . +@prefix rdf: . +@prefix rdfs: . + + a kb:Link ; + kb:text "reference link" ; + kb:url "https://www.example.com" ; + kb:isInternal false . + + a kb:Link ; + kb:text "another reference" ; + kb:url "https://www.example.org" ; + kb:isInternal false . + + a kb:Link ; + kb:text "shorthand" ; + kb:url "https://shorthand.example.com" ; + kb:isInternal false . + + a kb:Link ; + kb:text "text itself" ; + kb:url "https://text.example.com" ; + kb:isInternal false . + + a kb:Reference ; + kb:key "ref1" ; + kb:url "https://www.example.com" . + + a kb:Reference ; + kb:key "ref2" ; + kb:url "https://www.example.org" ; + kb:title "Example.org" . + + a kb:Reference ; + kb:key "shorthand" ; + kb:url "https://shorthand.example.com" . + + a kb:Reference ; + kb:key "text itself" ; + kb:url "https://text.example.com" . \ No newline at end of file diff --git a/tests/fixtures/link_02_reference_links/input.md b/tests/fixtures/link_02_reference_links/input.md new file mode 100644 index 0000000..d85d939 --- /dev/null +++ b/tests/fixtures/link_02_reference_links/input.md @@ -0,0 +1,12 @@ +# Reference Links + +This is a [reference link][ref1] in a paragraph. +Here's [another reference][ref2] in the same paragraph. + +You can also use [shorthand][] references. +Or even just use the [text itself]. 
+ +[ref1]: https://www.example.com +[ref2]: https://www.example.org "Example.org" +[shorthand]: https://shorthand.example.com +[text itself]: https://text.example.com \ No newline at end of file diff --git a/tests/fixtures/link_03_citations/expected.ttl b/tests/fixtures/link_03_citations/expected.ttl new file mode 100644 index 0000000..da1d4d0 --- /dev/null +++ b/tests/fixtures/link_03_citations/expected.ttl @@ -0,0 +1,12 @@ +@prefix kb: . +@prefix rdf: . +@prefix rdfs: . + + a kb:Citation ; + kb:text "Smith, 2020" . + + a kb:Citation ; + kb:text "johnson2019" . + + a kb:Citation ; + kb:text "Smith, 2020; Johnson, 2019" . \ No newline at end of file diff --git a/tests/fixtures/link_03_citations/input.md b/tests/fixtures/link_03_citations/input.md new file mode 100644 index 0000000..564ec94 --- /dev/null +++ b/tests/fixtures/link_03_citations/input.md @@ -0,0 +1,5 @@ +# Citations + +According to (Smith, 2020), this is an important finding. +Another study [@johnson2019] showed similar results. +Multiple citations (Smith, 2020; Johnson, 2019) support this claim. \ No newline at end of file diff --git a/tests/fixtures/link_04_mixed_types/expected.ttl b/tests/fixtures/link_04_mixed_types/expected.ttl new file mode 100644 index 0000000..b3c5e71 --- /dev/null +++ b/tests/fixtures/link_04_mixed_types/expected.ttl @@ -0,0 +1,24 @@ +@prefix kb: . +@prefix rdf: . +@prefix rdfs: . + + a kb:Link ; + kb:text "inline links" ; + kb:url "https://example.com" ; + kb:isInternal false . + + a kb:Link ; + kb:text "reference links" ; + kb:url "https://reference.example.com" ; + kb:isInternal false . + + a kb:Reference ; + kb:key "ref1" ; + kb:url "https://reference.example.com" ; + kb:title "Reference Example" . + + a kb:Citation ; + kb:text "Author, 2023" . + + a kb:Citation ; + kb:text "citation-key" . 
\ No newline at end of file diff --git a/tests/fixtures/link_04_mixed_types/input.md b/tests/fixtures/link_04_mixed_types/input.md new file mode 100644 index 0000000..bc085f7 --- /dev/null +++ b/tests/fixtures/link_04_mixed_types/input.md @@ -0,0 +1,6 @@ +# Mixed Link Types + +This document has [inline links](https://example.com) and [reference links][ref1]. +It also has citations (Author, 2023) and [@citation-key]. + +[ref1]: https://reference.example.com "Reference Example" \ No newline at end of file diff --git a/tests/fixtures/link_05_empty_document/expected.ttl b/tests/fixtures/link_05_empty_document/expected.ttl new file mode 100644 index 0000000..0591b6f --- /dev/null +++ b/tests/fixtures/link_05_empty_document/expected.ttl @@ -0,0 +1,5 @@ +@prefix kb: . +@prefix rdf: . +@prefix rdfs: . + +# Empty document - no links, references, or citations expected \ No newline at end of file diff --git a/tests/fixtures/link_05_empty_document/input.md b/tests/fixtures/link_05_empty_document/input.md new file mode 100644 index 0000000..e69de29 diff --git a/tests/fixtures/link_06_integration/expected.ttl b/tests/fixtures/link_06_integration/expected.ttl new file mode 100644 index 0000000..ffa1cbc --- /dev/null +++ b/tests/fixtures/link_06_integration/expected.ttl @@ -0,0 +1,18 @@ +@prefix kb: . +@prefix rdf: . +@prefix rdfs: . + + a kb:Link ; + kb:text "link to Google" ; + kb:url "https://www.google.com" ; + kb:isInternal false . + + a kb:Link ; + kb:text "reference link" ; + kb:url "https://www.example.com" ; + kb:isInternal false . + + a kb:Reference ; + kb:key "ref1" ; + kb:url "https://www.example.com" ; + kb:title "Example Site" . \ No newline at end of file diff --git a/tests/fixtures/link_06_integration/input.md b/tests/fixtures/link_06_integration/input.md new file mode 100644 index 0000000..962faf0 --- /dev/null +++ b/tests/fixtures/link_06_integration/input.md @@ -0,0 +1,6 @@ +# Test Document + +This is a [link to Google](https://www.google.com) in a paragraph. 
+Here's a [reference link][ref1] in the same paragraph. + +[ref1]: https://www.example.com "Example Site" \ No newline at end of file diff --git a/tests/fixtures/list_01_unordered/expected.ttl b/tests/fixtures/list_01_unordered/expected.ttl new file mode 100644 index 0000000..071497c --- /dev/null +++ b/tests/fixtures/list_01_unordered/expected.ttl @@ -0,0 +1,17 @@ +@prefix kb: . +@prefix rdf: . +@prefix rdfs: . + + a kb:List ; + kb:elementType "list" ; + kb:ordered false ; + kb:hasItem , , . + + a kb:ListItem ; + kb:text "Item 1" . + + a kb:ListItem ; + kb:text "Item 2" . + + a kb:ListItem ; + kb:text "Item 3" . \ No newline at end of file diff --git a/tests/fixtures/list_01_unordered/input.md b/tests/fixtures/list_01_unordered/input.md new file mode 100644 index 0000000..b91b891 --- /dev/null +++ b/tests/fixtures/list_01_unordered/input.md @@ -0,0 +1,7 @@ +# Test Document + +This is a test document with an unordered list: + +- Item 1 +- Item 2 +- Item 3 \ No newline at end of file diff --git a/tests/fixtures/list_02_ordered/expected.ttl b/tests/fixtures/list_02_ordered/expected.ttl new file mode 100644 index 0000000..0ea596d --- /dev/null +++ b/tests/fixtures/list_02_ordered/expected.ttl @@ -0,0 +1,17 @@ +@prefix kb: . +@prefix rdf: . +@prefix rdfs: . + + a kb:List ; + kb:elementType "list" ; + kb:ordered true ; + kb:hasItem , , . + + a kb:ListItem ; + kb:text "First item" . + + a kb:ListItem ; + kb:text "Second item" . + + a kb:ListItem ; + kb:text "Third item" . \ No newline at end of file diff --git a/tests/fixtures/list_02_ordered/input.md b/tests/fixtures/list_02_ordered/input.md new file mode 100644 index 0000000..031fc48 --- /dev/null +++ b/tests/fixtures/list_02_ordered/input.md @@ -0,0 +1,7 @@ +# Test Document + +This is a test document with an ordered list: + +1. First item +2. Second item +3. 
Third item \ No newline at end of file diff --git a/tests/fixtures/list_03_nested/expected.ttl b/tests/fixtures/list_03_nested/expected.ttl new file mode 100644 index 0000000..d04567f --- /dev/null +++ b/tests/fixtures/list_03_nested/expected.ttl @@ -0,0 +1,41 @@ +@prefix kb: . +@prefix rdf: . +@prefix rdfs: . + + a kb:List ; + kb:elementType "list" ; + kb:ordered false ; + kb:hasItem , , . + + a kb:ListItem ; + kb:text "Item 1" ; + kb:hasNestedList . + + a kb:List ; + kb:elementType "list" ; + kb:ordered false ; + kb:hasItem , . + + a kb:ListItem ; + kb:text "Nested item 1.1" . + + a kb:ListItem ; + kb:text "Nested item 1.2" . + + a kb:ListItem ; + kb:text "Item 2" ; + kb:hasNestedList . + + a kb:List ; + kb:elementType "list" ; + kb:ordered true ; + kb:hasItem , . + + a kb:ListItem ; + kb:text "Nested ordered item 2.1" . + + a kb:ListItem ; + kb:text "Nested ordered item 2.2" . + + a kb:ListItem ; + kb:text "Item 3" . \ No newline at end of file diff --git a/tests/fixtures/list_03_nested/input.md b/tests/fixtures/list_03_nested/input.md new file mode 100644 index 0000000..78fb944 --- /dev/null +++ b/tests/fixtures/list_03_nested/input.md @@ -0,0 +1,11 @@ +# Test Document + +This is a test document with a nested list: + +- Item 1 + - Nested item 1.1 + - Nested item 1.2 +- Item 2 + 1. Nested ordered item 2.1 + 2. Nested ordered item 2.2 +- Item 3 \ No newline at end of file diff --git a/tests/fixtures/table_01_basic/expected.ttl b/tests/fixtures/table_01_basic/expected.ttl new file mode 100644 index 0000000..9544b8f --- /dev/null +++ b/tests/fixtures/table_01_basic/expected.ttl @@ -0,0 +1,53 @@ +@prefix kb: . +@prefix rdf: . +@prefix rdfs: . + + a kb:Table ; + kb:elementType "table" ; + kb:hasHeader "Header 1", "Header 2", "Header 3" ; + kb:hasRow , ; + kb:hasCell , , , + , , , + , , . + + a kb:TableRow ; + kb:hasCell , , . + + a kb:TableRow ; + kb:hasCell , , . + + a kb:TableCell ; + kb:text "Header 1" ; + kb:isHeader true . 
+ + a kb:TableCell ; + kb:text "Header 2" ; + kb:isHeader true . + + a kb:TableCell ; + kb:text "Header 3" ; + kb:isHeader true . + + a kb:TableCell ; + kb:text "Cell 1,1" ; + kb:isHeader false . + + a kb:TableCell ; + kb:text "Cell 1,2" ; + kb:isHeader false . + + a kb:TableCell ; + kb:text "Cell 1,3" ; + kb:isHeader false . + + a kb:TableCell ; + kb:text "Cell 2,1" ; + kb:isHeader false . + + a kb:TableCell ; + kb:text "Cell 2,2" ; + kb:isHeader false . + + a kb:TableCell ; + kb:text "Cell 2,3" ; + kb:isHeader false . \ No newline at end of file diff --git a/tests/fixtures/table_01_basic/input.md b/tests/fixtures/table_01_basic/input.md new file mode 100644 index 0000000..03fc054 --- /dev/null +++ b/tests/fixtures/table_01_basic/input.md @@ -0,0 +1,8 @@ +# Test Document + +This is a test document with a table: + +| Header 1 | Header 2 | Header 3 | +|----------|----------|----------| +| Cell 1,1 | Cell 1,2 | Cell 1,3 | +| Cell 2,1 | Cell 2,2 | Cell 2,3 | \ No newline at end of file diff --git a/tests/models/test_entities.py b/tests/models/test_entities.py new file mode 100644 index 0000000..86d1e22 --- /dev/null +++ b/tests/models/test_entities.py @@ -0,0 +1,153 @@ +import unittest +from typing import List + +from knowledgebase_processor.analyzer.entity_recognizer import EntityRecognizer +from knowledgebase_processor.models.metadata import DocumentMetadata, ExtractedEntity, BaseModel # Ensure BaseModel if needed + +@unittest.skip("Spacy entity recognition disabled - tests skipped") +class TestEntityRecognizer(unittest.TestCase): + def setUp(self): + """ + Set up the test case by initializing the EntityRecognizer. 
+ """ + self.analyzer = EntityRecognizer() + + def _create_metadata(self) -> DocumentMetadata: + """Helper to create a fresh DocumentMetadata instance.""" + # Assuming DocumentMetadata might require a path or other minimal setup + # Adjust if DocumentMetadata has mandatory constructor arguments + return DocumentMetadata(document_id="test.md", file_path="test.md") + + + def test_extract_person_entity(self): + metadata = self._create_metadata() + content = "Apple is looking at buying U.K. startup for $1 billion. Steve Jobs was a visionary." + self.analyzer.analyze(content, metadata) + + self.assertTrue(any(ent.text == "Steve Jobs" and ent.label == "PERSON" for ent in metadata.entities)) + + def test_extract_org_entity(self): + metadata = self._create_metadata() + content = "Apple is a technology company based in Cupertino." + self.analyzer.analyze(content, metadata) + + self.assertTrue(any(ent.text == "Apple" and ent.label == "ORG" for ent in metadata.entities)) + + def test_extract_loc_gpe_entity(self): + metadata = self._create_metadata() + content = "London is the capital of the United Kingdom." + self.analyzer.analyze(content, metadata) + + found_london = any(ent.text == "London" and (ent.label == "GPE" or ent.label == "LOC") for ent in metadata.entities) + self.assertTrue(found_london, "London entity not found or mislabelled") + + # Check for United Kingdom if found, but don't fail if not, + # acknowledging limitations of en_core_web_sm. + uk_entity = next((ent for ent in metadata.entities if ent.text == "United Kingdom"), None) + if uk_entity: + self.assertIn(uk_entity.label, ["GPE", "LOC"], + f"United Kingdom found with text '{uk_entity.text}' but label '{uk_entity.label}' is not GPE or LOC.") + + def test_extract_date_entity(self): + metadata = self._create_metadata() + content = "The event is scheduled for July 4th, 2024." 
+ self.analyzer.analyze(content, metadata) + + self.assertTrue(any(ent.text == "July 4th, 2024" and ent.label == "DATE" for ent in metadata.entities)) + + def test_multiple_entities(self): + metadata = self._create_metadata() + content = "Alice went to Paris with Bob on January 1st." + self.analyzer.analyze(content, metadata) + + entities_found = { (ent.text, ent.label) for ent in metadata.entities } + expected_entities = { + ("Alice", "PERSON"), + ("Paris", "GPE"), # spaCy often labels cities as GPE + ("Bob", "PERSON"), + ("January 1st", "DATE") + } + # Check if all expected entities are a subset of what was found + # This is more flexible than checking exact counts if spaCy finds more (e.g. "January 1st" as part of a larger date) + self.assertTrue(expected_entities.issubset(entities_found), f"Expected {expected_entities}, but found {entities_found}") + + + def test_no_entities(self): + metadata = self._create_metadata() + content = "This is a simple sentence without any special names." + self.analyzer.analyze(content, metadata) + self.assertEqual(len(metadata.entities), 0) + + def test_empty_content(self): + metadata = self._create_metadata() + content = "" + self.analyzer.analyze(content, metadata) + self.assertEqual(len(metadata.entities), 0) + + def test_unicode_content(self): + metadata = self._create_metadata() + content = "これは日本語のテキストです。東京は日本の首都です。" # "This is Japanese text. Tokyo is the capital of Japan." + # Note: en_core_web_sm is primarily for English. For robust multilingual support, + # a multilingual model or language-specific models would be needed. + # This test primarily checks if it handles unicode without crashing. + # We don't expect accurate entity recognition for Japanese with an English model. + self.analyzer.analyze(content, metadata) + # We are not asserting specific entities here, just that it runs. + # Depending on the model, it might find "Tokyo" if it's in its English vocab. 
+ self.assertTrue(isinstance(metadata.entities, list)) + + + def test_analyze_text_for_entities_john_doe_acme_new_york(self): + """Test analyze_text_for_entities with a sentence containing multiple entities.""" + text_to_analyze = "John Doe works at Acme Corp in New York." + entities: List[ExtractedEntity] = self.analyzer.analyze_text_for_entities(text_to_analyze) + + expected_entities_data = [ + {"text": "John Doe", "label": "PERSON", "start_char": 0, "end_char": 8}, + {"text": "Acme Corp", "label": "ORG", "start_char": 18, "end_char": 27}, + {"text": "New York", "label": "GPE", "start_char": 31, "end_char": 39}, + ] + + # Convert to a set of tuples for easier comparison if order doesn't matter + # or if spaCy might find them in a different order. + # For this specific case, order is likely preserved but comparing sets is robust. + + actual_entities_set = {(e.text, e.label, e.start_char, e.end_char) for e in entities} + expected_entities_set = {(d["text"], d["label"], d["start_char"], d["end_char"]) for d in expected_entities_data} + + self.assertEqual(actual_entities_set, expected_entities_set, + f"Expected entities {expected_entities_set} but got {actual_entities_set}") + + def test_analyze_text_for_entities_jane_smith_wikilink_alias(self): + """Test analyze_text_for_entities with a wikilink alias.""" + text_to_analyze = "Dr. Smith" # Simulating the text part of "[[Jane Smith|Dr. Smith]]" + entities: List[ExtractedEntity] = self.analyzer.analyze_text_for_entities(text_to_analyze) + + self.assertEqual(len(entities), 1, f"Expected 1 entity, got {len(entities)}") + entity = entities[0] + self.assertEqual(entity.text, "Smith") # spaCy model 'en_core_web_sm' extracts "Smith" + self.assertEqual(entity.label, "PERSON") + self.assertEqual(entity.start_char, 4) # "Smith" in "Dr. 
Smith" (D=0,r=1,.=2, =3,S=4) + self.assertEqual(entity.end_char, 9) # "Smith" + + def test_analyze_text_for_entities_simple_phrase_no_entities(self): + """Test analyze_text_for_entities with a phrase containing no entities.""" + text_to_analyze = "A simple phrase" + entities: List[ExtractedEntity] = self.analyzer.analyze_text_for_entities(text_to_analyze) + self.assertEqual(len(entities), 0) + + def test_analyze_text_for_entities_london_wikilink(self): + """Test analyze_text_for_entities with a GPE from a wikilink.""" + text_to_analyze = "London" # Simulating the text part of "[[London]]" + entities: List[ExtractedEntity] = self.analyzer.analyze_text_for_entities(text_to_analyze) + + self.assertEqual(len(entities), 1, f"Expected 1 entity, got {len(entities)}") + entity = entities[0] + self.assertEqual(entity.text, "London") + self.assertIn(entity.label, ["GPE", "LOC"], f"Expected GPE or LOC, got {entity.label}") # spaCy usually labels cities as GPE + self.assertEqual(entity.start_char, 0) + self.assertEqual(entity.end_char, 6) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/parser/test_markdown_parser.py b/tests/models/test_markdown_parser.py similarity index 100% rename from tests/parser/test_markdown_parser.py rename to tests/models/test_markdown_parser.py diff --git a/tests/parser/__init__.py b/tests/parser/__init__.py deleted file mode 100644 index 044a155..0000000 --- a/tests/parser/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Parser tests package.""" \ No newline at end of file diff --git a/tests/test_reference_corpus.py b/tests/test_reference_corpus.py new file mode 100644 index 0000000..c54faaa --- /dev/null +++ b/tests/test_reference_corpus.py @@ -0,0 +1,183 @@ +# tests/test_reference_corpus.py +""" +Integration tests for the reference corpus. + +This test file implements the specification-driven testing approach outlined in REFACTOR.md. 
+It validates that the processor continues to generate the same RDF output for all files +in the reference corpus, ensuring consistency and preventing regressions. +""" + +import pytest +import sys +from pathlib import Path +from typing import List + +# Add the src directory to the path to import our modules +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from knowledgebase_processor.processor.processor import Processor +from knowledgebase_processor.utils.document_registry import DocumentRegistry +from knowledgebase_processor.utils.id_generator import EntityIdGenerator +from knowledgebase_processor.extractor.markdown import MarkdownExtractor +from knowledgebase_processor.extractor.frontmatter import FrontmatterExtractor +from knowledgebase_processor.extractor.heading_section import HeadingSectionExtractor +from knowledgebase_processor.extractor.link_reference import LinkReferenceExtractor +from knowledgebase_processor.extractor.code_quote import CodeQuoteExtractor +from knowledgebase_processor.extractor.todo_item import TodoItemExtractor +from knowledgebase_processor.extractor.tags import TagExtractor +from knowledgebase_processor.extractor.list_table import ListTableExtractor + +from rdflib import Graph, Namespace +from rdflib.compare import isomorphic +from rdflib.namespace import SDO as SCHEMA + + +def setup_processor() -> Processor: + """Setup a processor with all necessary extractors for testing.""" + document_registry = DocumentRegistry() + id_generator = EntityIdGenerator(base_url="http://example.org/kb/") + + processor = Processor( + document_registry=document_registry, + id_generator=id_generator, + config=None, + ) + + # Register all extractors (same as in main.py and generation script) + processor.register_extractor(MarkdownExtractor()) + processor.register_extractor(FrontmatterExtractor()) + processor.register_extractor(HeadingSectionExtractor()) + processor.register_extractor(LinkReferenceExtractor()) + 
processor.register_extractor(CodeQuoteExtractor()) + processor.register_extractor(TodoItemExtractor()) + processor.register_extractor(TagExtractor()) + processor.register_extractor(ListTableExtractor()) + + return processor + + +def remove_timestamps_from_graph(graph: Graph) -> Graph: + """ + Remove timestamp triples from the graph to allow for consistent comparison. + + This removes schema:dateCreated and schema:dateModified triples that change + every time the processor runs, making the graphs non-isomorphic. + """ + cleaned_graph = Graph() + + # Copy all namespaces from the original graph + for prefix, namespace in graph.namespaces(): + cleaned_graph.bind(prefix, namespace) + + # Copy all triples except timestamp triples + for subj, pred, obj in graph: + if pred not in [SCHEMA.dateCreated, SCHEMA.dateModified]: + cleaned_graph.add((subj, pred, obj)) + + return cleaned_graph + + +def run_corpus_test(markdown_path: Path): + """ + Run a corpus test for a single markdown file. + + This function: + 1. Reads the markdown content + 2. Processes it through the processor to get an RDF graph + 3. Compares it with the expected TTL file for isomorphism + 4. 
Asserts that they are equivalent + """ + expected_ttl_path = markdown_path.with_suffix(".ttl") + + # Ensure the expected TTL file exists + if not expected_ttl_path.exists(): + pytest.fail(f"Expected TTL file not found: {expected_ttl_path}") + + # Read the markdown content + input_content = markdown_path.read_text(encoding='utf-8') + + # Setup processor and process content + processor = setup_processor() + + # Generate document ID (same logic as generation script) + clean_stem = markdown_path.stem.replace(" ", "_").replace("-", "_").replace(":", "_") + document_id = f"test_corpus/{clean_stem}" + + # Process content to get "as-is" RDF graph + as_is_graph = processor.process_content_to_graph(input_content, document_id=document_id) + + # Read the expected RDF graph + expected_graph = Graph() + expected_graph.parse(str(expected_ttl_path), format="turtle") + + # Remove timestamps from both graphs for comparison + as_is_clean = remove_timestamps_from_graph(as_is_graph) + expected_clean = remove_timestamps_from_graph(expected_graph) + + # Compare the two RDF graphs for isomorphism + # This checks if they contain the same triples regardless of ordering + if not isomorphic(as_is_clean, expected_clean): + # Provide helpful debugging information + as_is_triples = len(as_is_clean) + expected_triples = len(expected_clean) + + pytest.fail( + f"RDF graphs are not isomorphic for {markdown_path.name}!\n" + f"As-is graph: {as_is_triples} triples (excluding timestamps)\n" + f"Expected graph: {expected_triples} triples (excluding timestamps)\n" + f"This indicates the processor output has changed." + ) + + +def get_corpus_files() -> List[Path]: + """ + Get all markdown files in the reference corpus directory. 
+ + Returns: + List of Path objects for markdown files + """ + corpus_dir = Path(__file__).parent.parent / "specs" / "reference_corpus" + + if not corpus_dir.exists(): + return [] + + return list(corpus_dir.glob("*.md")) + + +@pytest.mark.parametrize("markdown_path", get_corpus_files()) +def test_reference_corpus(markdown_path: Path): + """ + Parametrized test that runs for each markdown file in the reference corpus. + + This test ensures that the processor generates the same RDF output for each + file as when the reference TTL files were generated. This serves as a + regression test to catch any unintended changes to the processor behavior. + + Args: + markdown_path: Path to the markdown file to test + """ + run_corpus_test(markdown_path) + + +def test_corpus_directory_exists(): + """ + Test to ensure the reference corpus directory exists and contains files. + """ + corpus_dir = Path(__file__).parent.parent / "specs" / "reference_corpus" + assert corpus_dir.exists(), f"Reference corpus directory not found: {corpus_dir}" + + md_files = list(corpus_dir.glob("*.md")) + assert len(md_files) > 0, "No markdown files found in reference corpus" + + ttl_files = list(corpus_dir.glob("*.ttl")) + assert len(ttl_files) > 0, "No TTL files found in reference corpus" + + # Each markdown file should have a corresponding TTL file + for md_file in md_files: + ttl_file = md_file.with_suffix(".ttl") + assert ttl_file.exists(), f"Missing TTL file for {md_file.name}: {ttl_file}" + + +if __name__ == "__main__": + # Allow running this test file directly for debugging + pytest.main([__file__, "-v"]) \ No newline at end of file diff --git a/tests/test_specifications.py b/tests/test_specifications.py new file mode 100644 index 0000000..7ee0573 --- /dev/null +++ b/tests/test_specifications.py @@ -0,0 +1,181 @@ +# tests/test_specifications.py +""" +Specification-driven tests for individual test cases. 
+ +This test file implements the specification-driven testing approach outlined in REFACTOR.md. +It runs tests for all test cases in the specs/test_cases/ directory, where each test case +contains an input.md file and an expected_output.ttl file. +""" + +import pytest +import sys +from pathlib import Path +from typing import List + +# Add the src directory to the path to import our modules +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from knowledgebase_processor.processor.processor import Processor +from knowledgebase_processor.utils.document_registry import DocumentRegistry +from knowledgebase_processor.utils.id_generator import EntityIdGenerator +from knowledgebase_processor.extractor.markdown import MarkdownExtractor +from knowledgebase_processor.extractor.frontmatter import FrontmatterExtractor +from knowledgebase_processor.extractor.heading_section import HeadingSectionExtractor +from knowledgebase_processor.extractor.link_reference import LinkReferenceExtractor +from knowledgebase_processor.extractor.code_quote import CodeQuoteExtractor +from knowledgebase_processor.extractor.todo_item import TodoItemExtractor +from knowledgebase_processor.extractor.tags import TagExtractor +from knowledgebase_processor.extractor.list_table import ListTableExtractor + +from rdflib import Graph +from rdflib.compare import isomorphic +from rdflib.namespace import SDO as SCHEMA + + +def setup_processor() -> Processor: + """Setup a processor with all necessary extractors for testing.""" + document_registry = DocumentRegistry() + id_generator = EntityIdGenerator(base_url="http://example.org/kb/") + + processor = Processor( + document_registry=document_registry, + id_generator=id_generator, + config=None, + ) + + # Register all extractors (same as in reference corpus tests) + processor.register_extractor(MarkdownExtractor()) + processor.register_extractor(FrontmatterExtractor()) + processor.register_extractor(HeadingSectionExtractor()) + 
processor.register_extractor(LinkReferenceExtractor()) + processor.register_extractor(CodeQuoteExtractor()) + processor.register_extractor(TodoItemExtractor()) + processor.register_extractor(TagExtractor()) + processor.register_extractor(ListTableExtractor()) + + return processor + + +def remove_timestamps_from_graph(graph: Graph) -> Graph: + """ + Remove timestamp triples from the graph to allow for consistent comparison. + + This removes schema:dateCreated and schema:dateModified triples that change + every time the processor runs, making the graphs non-isomorphic. + """ + cleaned_graph = Graph() + + # Copy all namespaces from the original graph + for prefix, namespace in graph.namespaces(): + cleaned_graph.bind(prefix, namespace) + + # Copy all triples except timestamp triples + for subj, pred, obj in graph: + if pred not in [SCHEMA.dateCreated, SCHEMA.dateModified]: + cleaned_graph.add((subj, pred, obj)) + + return cleaned_graph + + +def run_spec_test(test_case_dir: Path): + """ + Runs a single specification-driven test. + + This function: + 1. Reads the input.md file + 2. Processes it through the processor to get an RDF graph + 3. Compares it with the expected_output.ttl file for isomorphism + 4. Asserts that they are equivalent + """ + input_md_path = test_case_dir / "input.md" + expected_output_ttl_path = test_case_dir / "expected_output.ttl" + + # Ensure required files exist + if not input_md_path.exists(): + pytest.fail(f"Input markdown file not found: {input_md_path}") + if not expected_output_ttl_path.exists(): + pytest.fail(f"Expected TTL file not found: {expected_output_ttl_path}") + + # 1. Read the input markdown file + input_md_content = input_md_path.read_text(encoding='utf-8') + + # 2. 
Run the processor to get the "as-is" RDF graph + processor = setup_processor() + + # Use test case directory name as document_id + document_id = f"test_cases/{test_case_dir.name}" + as_is_graph = processor.process_content_to_graph(input_md_content, document_id=document_id) + + # 3. Read the "to-be" (expected) RDF graph + expected_graph = Graph() + expected_graph.parse(str(expected_output_ttl_path), format="turtle") + + # 4. Remove timestamps from both graphs for comparison + as_is_clean = remove_timestamps_from_graph(as_is_graph) + expected_clean = remove_timestamps_from_graph(expected_graph) + + # 5. Compare the two RDF graphs for isomorphism (i.e., they are equivalent) + if not isomorphic(as_is_clean, expected_clean): + # Provide helpful debugging information + as_is_triples = len(as_is_clean) + expected_triples = len(expected_clean) + + pytest.fail( + f"RDF graphs are not isomorphic for {test_case_dir.name}!\n" + f"As-is graph: {as_is_triples} triples (excluding timestamps)\n" + f"Expected graph: {expected_triples} triples (excluding timestamps)\n" + f"This indicates the processor output has changed." + ) + +def get_test_cases() -> List[Path]: + """ + Get all test case directories in the specs/test_cases/ directory. + + Returns: + List of Path objects for test case directories + """ + specs_dir = Path(__file__).parent.parent / "specs" / "test_cases" + + if not specs_dir.exists(): + return [] + + return [d for d in specs_dir.iterdir() if d.is_dir()] + + +@pytest.mark.parametrize("test_case_dir", get_test_cases()) +def test_specifications(test_case_dir: Path): + """ + Parametrized test that runs for each test case directory in specs/test_cases/. + + This test ensures that the processor generates the same RDF output for each + test case as specified in the expected_output.ttl file. This serves as a + unit test to verify specific behaviors and catch any unintended changes. 
+ + Args: + test_case_dir: Path to the test case directory containing input.md and expected_output.ttl + """ + run_spec_test(test_case_dir) + + +def test_test_cases_directory_exists(): + """ + Test to ensure the test cases directory exists and contains test cases. + """ + test_cases_dir = Path(__file__).parent.parent / "specs" / "test_cases" + assert test_cases_dir.exists(), f"Test cases directory not found: {test_cases_dir}" + + test_case_dirs = [d for d in test_cases_dir.iterdir() if d.is_dir()] + assert len(test_case_dirs) > 0, "No test case directories found in specs/test_cases" + + # Each test case directory should have both input.md and expected_output.ttl + for test_dir in test_case_dirs: + input_file = test_dir / "input.md" + expected_file = test_dir / "expected_output.ttl" + + assert input_file.exists(), f"Missing input.md in {test_dir.name}: {input_file}" + assert expected_file.exists(), f"Missing expected_output.ttl in {test_dir.name}: {expected_file}" + + +if __name__ == "__main__": + # Allow running this test file directly for debugging + pytest.main([__file__, "-v"]) \ No newline at end of file