Skip to content

Commit 32398b8

Browse files
authored
feat: add document tokens for SMILES (#176)
Signed-off-by: lum <[email protected]>
1 parent a4a4e61 commit 32398b8

File tree

4 files changed

+17
-2
lines changed

4 files changed

+17
-2
lines changed

docling_core/types/doc/document.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1003,6 +1003,20 @@ def export_to_document_tokens(
10031003
predicted_class = classifications[0].predicted_classes[0].class_name
10041004
body += DocumentToken.get_picture_classification_token(predicted_class)
10051005

1006+
smiles_annotations = [
1007+
ann for ann in self.annotations if isinstance(ann, PictureMoleculeData)
1008+
]
1009+
if len(smiles_annotations) > 0:
1010+
body += (
1011+
"<"
1012+
+ DocumentToken.SMILES.value
1013+
+ ">"
1014+
+ smiles_annotations[0].smi
1015+
+ "</"
1016+
+ DocumentToken.SMILES.value
1017+
+ ">"
1018+
)
1019+
10061020
if add_caption and len(self.captions):
10071021
text = self.caption_text(doc)
10081022

docling_core/types/doc/tokens.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,7 @@ class DocumentToken(Enum):
5050
UNORDERED_LIST = "unordered_list"
5151
LOC = "loc_"
5252
PAGE_BREAK = "page_break"
53+
SMILES = "smiles"
5354

5455
@classmethod
5556
def get_special_tokens(

test/data/doc/dummy_doc.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -144,7 +144,7 @@ pictures: # All pictures...
144144
text: "..."
145145
provenance: "model2"
146146
- kind: molecule_data
147-
smi: "..."
147+
smi: "CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1"
148148
confidence: 0.98
149149
class_name: "chemistry_molecular_structure"
150150
segmentation: [[0,0],[1,0],[0,1],[1,1]]

test/data/doc/dummy_doc.yaml.dt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
<doctag><title><loc_42><loc_26><loc_406><loc_46>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</title>
2-
<picture><loc_297><loc_125><loc_457><loc_500><illustration><caption><loc_210><loc_196><loc_245><loc_213>Figure 1: Four examples of complex page layouts across different document categories</caption></picture>
2+
<picture><loc_297><loc_125><loc_457><loc_500><illustration><smiles>CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1</smiles><caption><loc_210><loc_196><loc_245><loc_213>Figure 1: Four examples of complex page layouts across different document categories</caption></picture>
33
<otsl><loc_210><loc_196><loc_245><loc_213></otsl>
44
</doctag>

0 commit comments

Comments
 (0)