@@ -108,8 +108,22 @@ def main(lookback: int = 5, wait_for_results: bool = True):
108108# For both extraction and transformation, we use
109109# [`.map`](https://modal.com/docs/guide/scale),
110110# which fans out inputs over containers in parallel.
111- # Each invocation handles one day's worth of data --
112- # the same granularity offered by the data source.
111+ # Each invocation handles at most 1,500 rows,
112+ # which leads to runtimes of about five minutes per call.
113+ # By parallelizing the calls, we finish processing everything in about five minutes.
114+
115+ # "Rechunking" our data from a list of filings by day
116+ # into a list of filings of fixed size requires a little
117+ # helper function:
118+
119+
def rechunk(lists, size: int = 1_500):
    """Flatten an iterable of lists and re-yield the items in fixed-size chunks.

    Args:
        lists: An iterable of iterables (e.g. per-day filing batches).
        size: Maximum number of items per yielded chunk (default 1,500).

    Yields:
        list: Consecutive chunks of at most ``size`` items, preserving
        the original item order. The final chunk may be shorter.
    """
    from itertools import chain, islice

    # chain.from_iterable already returns an iterator, so no extra
    # iter() wrapper is needed.
    flat = chain.from_iterable(lists)
    # islice yields an empty list once the iterator is exhausted,
    # which terminates the walrus-driven loop.
    while chunk := list(islice(flat, size)):
        yield chunk
126+
113127
114128# For the LLM call, we use
115129# [`.spawn`](https://modal.com/docs/guide/job-queue),
@@ -143,9 +157,10 @@ def orchestrate(lookback: int) -> list[modal.FunctionCall]:
143157 print ("Transforming raw SEC filings for these dates:" , * folders )
144158 filing_batches = list (transform .map (folders ))
145159 n_filings = sum (map (len , filing_batches ))
160+ submission_batches_gen = rechunk (filing_batches )
146161
147162 print (f"Submitting { n_filings } SEC filings to LLM for summarization" )
148- jobs = list (llm .process .spawn (batch ) for batch in filing_batches )
163+ jobs = list (llm .process .spawn (batch ) for batch in submission_batches_gen )
149164 if jobs :
150165 print ("FunctionCall IDs:" , * [job .object_id for job in jobs ], sep = "\n \t " )
151166
0 commit comments