@@ -50,9 +50,23 @@ The example repository includes a test PDF:
5050```
5151You can also replace it with any math textbook or exercise collection PDF.
5252
53- ## 4 Write the Execution Script
53+ ## 4 Initialize and Modify the Script
5454
55- In the project’s root directory, create ` generate_question_extract_api.py ` with the following content as an example:
55+ First, create a new `run_dataflow` folder in any location, change into it, and initialize a Dataflow project:
56+
57+ ``` shell
58+ mkdir run_dataflow
59+ cd run_dataflow
60+ dataflow init
61+ ```
62+
63+ After initialization is complete, the following file will appear in the project directory:
64+
65+ ``` shell
66+ run_dataflow/playground/mathbook_extract.py
67+ ```
68+
69+ The contents of that script are as follows:
5670
5771``` python
5872from dataflow.operators.generate import MathBookQuestionExtract
@@ -61,7 +75,7 @@ from dataflow.serving.APIVLMServing_openai import APIVLMServing_openai
6175class QuestionExtractPipeline :
6276 def __init__ (self , llm_serving : APIVLMServing_openai):
6377 self .extractor = MathBookQuestionExtract(llm_serving)
64- self .test_pdf = " ./dataflow /example/KBCleaningPipeline/questionextract_test.pdf"
78+ self .test_pdf = " .. /example/KBCleaningPipeline/questionextract_test.pdf"
6579
6680 def forward (
6781 self ,
@@ -87,11 +101,11 @@ if __name__ == "__main__":
87101 # 1. Initialize LLM Serving
88102 llm_serving = APIVLMServing_openai(
89103 api_url = " https://api.openai.com/v1/chat/completions" ,
90- model_name = " o4-mini" , # Strong reasoning model recommended
104+ model_name = " o4-mini" , # It is recommended to use a strong reasoning model
91105 max_workers = 20 # Number of concurrent requests
92106 )
93107
94- # 2. Build and run the pipeline
108+ # 2. Construct and run the extraction pipeline
95109 pipeline = QuestionExtractPipeline(llm_serving)
96110 pipeline.forward(
97111 pdf_path = pipeline.test_pdf,
0 commit comments