1313ALGOLIA_INDEX_NAME = 'logfire-docs'
1414ALGOLIA_APP_ID = 'KPPUDTIAVX'
1515ALGOLIA_WRITE_API_KEY = os .environ .get ('ALGOLIA_WRITE_API_KEY' )
16+ # Algolia accepts 100k, leaving some room for other fields
17+ MAX_CONTENT_SIZE = 90_000
1618
1719
1820def on_page_content (html : str , page : Page , config : Config , files : Files ) -> str :
@@ -24,6 +26,30 @@ def on_page_content(html: str, page: Page, config: Config, files: Files) -> str:
2426
2527 soup = BeautifulSoup (html , 'html.parser' )
2628
29+ # Clean up presentational and UI elements
30+ for element in soup .find_all (['autoref' ]):
31+ element .decompose ()
32+
33+ # this removes the large source code embeds from GitHub
34+ for element in soup .find_all ('details' ):
35+ element .decompose ()
36+
37+ for el_with_class in soup .find_all (class_ = ['doc-section-item' , 'doc-section-title' , 'doc-md-description' , 'doc' ]):
38+ # delete the class attribute
39+ del el_with_class ['class' ]
40+
41+ # Cleanup code examples
42+ for extra in soup .find_all ('div' , attrs = {'class' : ['language-py highlight' , 'language-python highlight' ]}):
43+ extra .replace_with (BeautifulSoup (f'<pre>{ extra .find ("code" ).get_text ()} </pre>' , 'html.parser' ))
44+
45+ # Cleanup code examples, part 2
46+ for extra in soup .find_all ('div' , attrs = {'class' : 'language-python doc-signature highlight' }):
47+ extra .replace_with (BeautifulSoup (f'<pre>{ extra .find ("code" ).get_text ()} </pre>' , 'html.parser' ))
48+
49+ # The API reference generates HTML tables with line numbers, this strips the line numbers cell and goes back to a code block
50+ for extra in soup .find_all ('table' , attrs = {'class' : 'highlighttable' }):
51+ extra .replace_with (BeautifulSoup (f'<pre>{ extra .find ("code" ).get_text ()} </pre>' , 'html.parser' ))
52+
2753 # Find all h1 and h2 headings
2854 headings = soup .find_all (['h1' , 'h2' ])
2955
@@ -65,7 +91,12 @@ def on_post_build(config: Config) -> None:
6591
6692 client = SearchClient .create (ALGOLIA_APP_ID , ALGOLIA_WRITE_API_KEY )
6793 index = client .init_index (ALGOLIA_INDEX_NAME )
68- # temporary filter the records from the index if the content is bigger than 10k characters
69- filtered_records = list (filter (lambda record : len (record ['content' ]) < 9000 , records ))
94+
95+ for large_record in list (filter (lambda record : len (record ['content' ]) >= MAX_CONTENT_SIZE , records )):
96+ print (f'Content for { large_record ["abs_url" ]} is too large to be indexed. Skipping...' )
97+ print (f'Content : { large_record ["content" ]} characters' )
98+
99+ # keep only the records whose content is shorter than MAX_CONTENT_SIZE characters
100+ filtered_records = list (filter (lambda record : len (record ['content' ]) < MAX_CONTENT_SIZE , records ))
70101 print (f'Uploading { len (filtered_records )} out of { len (records )} records to Algolia...' )
71102 index .replace_all_objects (filtered_records , {'createIfNotExists' : True }).wait () # type: ignore[reportUnknownMemberType]
0 commit comments