feat: add document tokens for SMILES (#176)

lucas-morin · web-flow · commit 32398b8ac0c9 · 2025-03-10T11:04:21.000+01:00
Signed-off-by: lum <lum@zurich.ibm.com>
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -1003,6 +1003,20 @@ def export_to_document_tokens(
             predicted_class = classifications[0].predicted_classes[0].class_name
             body += DocumentToken.get_picture_classification_token(predicted_class)
 
+        smiles_annotations = [
+            ann for ann in self.annotations if isinstance(ann, PictureMoleculeData)
+        ]
+        if len(smiles_annotations) > 0:
+            body += (
+                "<"
+                + DocumentToken.SMILES.value
+                + ">"
+                + smiles_annotations[0].smi
+                + "</"
+                + DocumentToken.SMILES.value
+                + ">"
+            )
+
         if add_caption and len(self.captions):
             text = self.caption_text(doc)
 
diff --git a/docling_core/types/doc/tokens.py b/docling_core/types/doc/tokens.py
@@ -50,6 +50,7 @@ class DocumentToken(Enum):
     UNORDERED_LIST = "unordered_list"
     LOC = "loc_"
     PAGE_BREAK = "page_break"
+    SMILES = "smiles"
 
     @classmethod
     def get_special_tokens(
diff --git a/test/data/doc/dummy_doc.yaml b/test/data/doc/dummy_doc.yaml
@@ -144,7 +144,7 @@ pictures: # All pictures...
         text: "..."
         provenance: "model2"
       - kind: molecule_data
-        smi: "..."
+        smi: "CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1"
         confidence: 0.98
         class_name: "chemistry_molecular_structure"
         segmentation: [[0,0],[1,0],[0,1],[1,1]]
diff --git a/test/data/doc/dummy_doc.yaml.dt b/test/data/doc/dummy_doc.yaml.dt
@@ -1,4 +1,4 @@
 <doctag><title><loc_42><loc_26><loc_406><loc_46>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</title>
-<picture><loc_297><loc_125><loc_457><loc_500><illustration><caption><loc_210><loc_196><loc_245><loc_213>Figure 1: Four examples of complex page layouts across different document categories</caption></picture>
+<picture><loc_297><loc_125><loc_457><loc_500><illustration><smiles>CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1</smiles><caption><loc_210><loc_196><loc_245><loc_213>Figure 1: Four examples of complex page layouts across different document categories</caption></picture>
 <otsl><loc_210><loc_196><loc_245><loc_213></otsl>
 </doctag>