|
| 1 | +import tempfile |
| 2 | +import dataclasses |
| 3 | + |
| 4 | +from dotenv import load_dotenv |
| 5 | +from marker.converters.pdf import PdfConverter |
| 6 | +from marker.models import create_model_dict |
| 7 | +from marker.output import text_from_rendered |
| 8 | +from marker.config.parser import ConfigParser |
| 9 | + |
| 10 | +import cocoindex |
| 11 | + |
class PdfToMarkdown(cocoindex.op.FunctionSpec):
    """Function spec for turning PDF content into markdown text.

    The spec carries no parameters of its own.
    """
| 14 | + |
@cocoindex.op.executor_class(gpu=True, cache=True, behavior_version=1)
class PdfToMarkdownExecutor:
    """Executor for PdfToMarkdown.

    Holds a single marker `PdfConverter` that is built once in `prepare()`
    and reused for every call.

    NOTE(review): the op is registered with `cache=True` and
    `behavior_version=1`; presumably the version must be bumped whenever the
    produced markdown changes so cached results are not reused -- confirm
    against the cocoindex documentation.
    """

    spec: PdfToMarkdown
    _converter: PdfConverter

    def prepare(self):
        """Create the PDF converter from marker's default (empty) configuration."""
        config_parser = ConfigParser({})
        self._converter = PdfConverter(
            create_model_dict(),
            config=config_parser.generate_config_dict(),
        )

    def __call__(self, content: bytes) -> str:
        """Convert the raw bytes of one PDF document to markdown text.

        Args:
            content: The complete PDF file as bytes.

        Returns:
            The text extracted from the rendered document.
        """
        import os

        # The converter takes a file path, so the bytes must be written to
        # disk first. The file is closed *before* the converter opens it:
        # re-opening a NamedTemporaryFile by name while it is still open does
        # not work on Windows. The directory (and the file in it) is removed
        # when the `with` block exits, even if conversion raises.
        with tempfile.TemporaryDirectory() as temp_dir:
            pdf_path = os.path.join(temp_dir, "input.pdf")
            with open(pdf_path, "wb") as pdf_file:
                pdf_file.write(content)
            text, _, _ = text_from_rendered(self._converter(pdf_path))
        return text
| 32 | + |
@dataclasses.dataclass
class ArgInfo:
    """One argument of a documented method."""

    # Name of the argument.
    name: str
    # Free-text description of the argument.
    description: str
| 37 | + |
@dataclasses.dataclass
class MethodInfo:
    """One documented method: its name, its arguments and a description."""

    # Name of the method.
    name: str
    # Arguments of the method, one ArgInfo each.
    args: list[ArgInfo]
    # Free-text description of the method.
    description: str
| 43 | + |
@dataclasses.dataclass
class ClassInfo:
    """One documented class: its name, a description and its methods."""

    # Name of the class.
    name: str
    # Free-text description of the class.
    description: str
    # Methods of the class, one MethodInfo each.
    methods: list[MethodInfo]
| 49 | + |
@dataclasses.dataclass
class ManualInfo:
    """Structured content of one manual."""

    # Title of the manual.
    title: str
    # Free-text description of the manual.
    description: str
    # Classes described in the manual.
    classes: list[ClassInfo]
    # Methods listed directly on the manual (not nested in a ClassInfo).
    methods: list[MethodInfo]
| 56 | + |
| 57 | + |
class ExtractManual(cocoindex.op.FunctionSpec):
    """Function spec for extracting manual information from markdown text.

    The spec carries no parameters of its own.
    """
| 60 | + |
@cocoindex.op.executor_class()
class ExtractManualExecutor:
    """Executor for ExtractManual.

    Stub implementation: the markdown input is not read, and every call
    returns a ManualInfo filled with fixed placeholder strings.
    """

    spec: ExtractManual

    def __call__(self, _markdown: str) -> ManualInfo:
        """Return a placeholder ManualInfo; `_markdown` is ignored."""

        def placeholder_method() -> MethodInfo:
            # Build a fresh instance on every call so the class-level and the
            # top-level method lists never share objects.
            placeholder_arg = ArgInfo(
                name="arg_name_placeholder",
                description="arg_description_placeholder",
            )
            return MethodInfo(
                name="method_name_placeholder",
                args=[placeholder_arg],
                description="method_description_placeholder",
            )

        placeholder_class = ClassInfo(
            name="class_name_placeholder",
            description="class_description_placeholder",
            methods=[placeholder_method()],
        )

        return ManualInfo(
            title="title_placeholder",
            description="description_placeholder",
            classes=[placeholder_class],
            methods=[placeholder_method()],
        )
| 92 | + |
@cocoindex.flow_def(name="ManualExtraction")
def manual_extraction_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    """
    Define an example flow that converts PDF files to markdown, extracts
    manual information from the markdown and exports it to Postgres.
    """
    # Source: files under "pdf_files", read with binary=True.
    pdf_source = cocoindex.sources.LocalFile(path="pdf_files", binary=True)
    data_scope["documents"] = flow_builder.add_source(pdf_source)

    collector = data_scope.add_collector()

    # Per document: content -> markdown -> manual_info, then collect one row.
    with data_scope["documents"].row() as document:
        document["markdown"] = document["content"].transform(PdfToMarkdown())
        document["manual_info"] = document["markdown"].transform(ExtractManual())
        collector.collect(
            filename=document["filename"],
            manual_info=document["manual_info"],
        )

    # Export the collected rows to the "manual_infos" target, keyed by filename.
    storage = cocoindex.storages.Postgres()
    collector.export("manual_infos", storage, primary_key_fields=["filename"])
| 112 | + |
@cocoindex.main_fn()
def _run():
    """Entry point wrapped by `cocoindex.main_fn`; the body itself does nothing."""
| 116 | + |
# Script entry: only runs when the file is executed directly, not on import.
if __name__ == "__main__":
    # Load environment variables from a .env file before starting.
    # NOTE(review): override=True presumably lets .env values replace
    # variables that are already set -- confirm against python-dotenv docs.
    load_dotenv(override=True)
    _run()
0 commit comments