@@ -11,29 +11,71 @@ def initialize(filter, max_tokens_per_batch:, tokenizer:)
1111 @to_process = filter_to_hash
1212 end
1313
# Yields batches of formatted topic text that are easy for the LLM to consume.
#
# Every yielded value is a Hash of the form
#   { post_count: Integer, topic_count: Integer, text: String }
# where +text+ is the output of format_topic for one or more topics
# (topic header, notices about omitted earlier/later posts, and the
# selected posts themselves).
#
# Whole topics are packed into a batch for as long as the running token
# estimate stays within @max_tokens_per_batch. When the next topic does not
# fit, the pending batch is yielded and that topic starts the next batch.
# A topic that is larger than the limit on its own is yielded separately,
# split on line boundaries into chunks that each fit the limit.
#
# Returns nil without yielding when there is nothing to process. Otherwise
# @to_process is cleared after all topics have been visited.
def each_chunk
  return nil if @to_process.empty?

  result = { post_count: 0, topic_count: 0, text: +"" }
  estimated_tokens = 0

  @to_process.each do |topic_id, topic_data|
    topic = Topic.find_by(id: topic_id)
    next unless topic

    topic_text, topic_tokens, post_count = format_topic(topic, topic_data[:posts])

    # The pending batch cannot take this topic: hand the batch off and start
    # a fresh one. The current topic is not skipped; it is placed below.
    if estimated_tokens > 0 && estimated_tokens + topic_tokens > @max_tokens_per_batch
      yield result if result[:text].present?
      result = { post_count: 0, topic_count: 0, text: +"" }
      estimated_tokens = 0
    end

    if topic_tokens > @max_tokens_per_batch
      # The topic can never fit in a single batch, so emit it on its own,
      # split line by line in one pass.
      chunk = +""
      chunk_tokens = 0

      topic_text.each_line do |line|
        line_tokens = estimate_tokens(line)

        if chunk_tokens + line_tokens > @max_tokens_per_batch
          # post_count is the topic's full count on every chunk, so it
          # overcounts when a topic is split (kept from the original logic).
          yield({ text: chunk, post_count: post_count, topic_count: 1 }) unless chunk.empty?
          chunk = +""
          chunk_tokens = 0

          # NOTE(review): a single line that is over budget by itself cannot
          # be placed; as before, the remainder of this topic is abandoned.
          break if line_tokens > @max_tokens_per_batch
        end

        chunk << line
        chunk_tokens += line_tokens
      end

      yield({ text: chunk, post_count: post_count, topic_count: 1 }) unless chunk.empty?
      next
    end

    # The topic fits: append it to the pending batch.
    result[:text] << topic_text
    result[:post_count] += post_count
    result[:topic_count] += 1
    estimated_tokens += topic_tokens
  end

  yield result if result[:text].present?

  @to_process.clear
end
2971
3072 private
3173
3274 def filter_to_hash
3375 hash = { }
34- filter
76+ @ filter
3577 . search
36- . pluck ( :topic_id , :post_id , :post_number )
78+ . pluck ( :topic_id , :id , :post_number )
3779 . each do |topic_id , post_id , post_number |
3880 hash [ topic_id ] ||= { posts : [ ] }
3981 hash [ topic_id ] [ :posts ] << [ post_id , post_number ]
@@ -42,6 +84,90 @@ def filter_to_hash
4284 hash . each_value { |topic | topic [ :posts ] . sort_by! { |_ , post_number | post_number } }
4385 hash
4486 end
87+
# Builds the LLM-facing text for one topic and the posts selected from it.
#
# topic      - record responding to #posts; also handed to format_topic_header.
# posts_data - Array of [post_id, post_number] pairs, ordered by post_number.
#
# The text is the topic header, an "earlier posts omitted" notice when the
# first selected post number is above 1, each selected post that can still
# be loaded, and a "later posts omitted" notice when the topic's highest
# post number is above the last selected one.
#
# Returns [text, total_tokens, post_count]. total_tokens is the sum of the
# per-piece estimates and post_count is the number of posts actually rendered.
def format_topic(topic, posts_data)
  text = +""
  total_tokens = 0
  post_count = 0

  # Each piece is rendered exactly once and reused for both the text and
  # the token estimate.
  append = lambda do |piece|
    text << piece
    total_tokens += estimate_tokens(piece)
  end

  append.call(format_topic_header(topic))

  # Highest post number in the topic; nil when the topic has no posts.
  highest_post_number = topic.posts.pluck(:post_number).max

  # nil when posts_data is empty, so both notices below are skipped.
  first_post_number = posts_data.dig(0, 1)
  last_post_number = posts_data.dig(-1, 1)

  # Posts before our selection
  if first_post_number && first_post_number > 1
    append.call(format_omitted_posts(first_post_number - 1, "before"))
  end

  # The selected posts; ids that no longer resolve are skipped.
  posts_data.each do |post_id, _post_number|
    post = Post.find_by(id: post_id)
    next unless post

    append.call(format_post(post))
    post_count += 1
  end

  # Posts after our selection
  if last_post_number && highest_post_number && last_post_number < highest_post_number
    append.call(format_omitted_posts(highest_post_number - last_post_number, "after"))
  end

  [text, total_tokens, post_count]
end
130+
# Builds the header text placed at the top of a formatted topic.
#
# Always emits the "# <title>" line, the "Created:" line (via format_date) and
# the "Topic url:" line built from the topic id. The "Category:" line is added
# only when topic.category is truthy, and the "Tags:" line (tag names joined
# with ", ") only when topic.tags is present.
#
# Returns the assembled String.
#
# NOTE(review): the whitespace around "\n" and "#{...}" inside the literals
# below looks like an artifact of how this text was extracted - confirm the
# exact literals against the real file before relying on them.
131+ def format_topic_header ( topic )
132+ header = +"# #{ topic . title } \n "
133+
134+ # Add category
135+ header << "Category: #{ topic . category . name } \n " if topic . category
136+
137+ # Add tags
138+ header << "Tags: #{ topic . tags . map ( &:name ) . join ( ", " ) } \n " if topic . tags . present?
139+
140+ # Add creation date
141+ header << "Created: #{ format_date ( topic . created_at ) } \n "
142+ header << "Topic url: /t/#{ topic . id } \n \n "
143+
144+ header
145+ end
146+
# Renders a single post as text for the LLM.
#
# Output, in order: a "---" separator line, a "## Post by <username> - <date>"
# heading (date via format_date on post.created_at), the post's raw content,
# a "Likes:" line only when post.like_count.to_i is above zero, and a
# "Post url:" line built from the post's topic_id and post_number.
#
# Returns the assembled String.
#
# NOTE(review): post.user.username raises if post.user is nil - confirm that
# every post reaching this method still has a user.
# NOTE(review): the whitespace around "\n" and "#{...}" inside the literals
# below looks like an artifact of how this text was extracted - confirm.
147+ def format_post ( post )
148+ text = +"---\n "
149+ text << "## Post by #{ post . user . username } - #{ format_date ( post . created_at ) } \n \n "
150+ text << "#{ post . raw } \n "
151+ text << "Likes: #{ post . like_count } \n " if post . like_count . to_i > 0
152+ text << "Post url: /t/-/#{ post . topic_id } /#{ post . post_number } \n \n "
153+ text
154+ end
155+
# Returns the notice describing posts left out of the formatted topic.
#
# count    - number of omitted posts; selects singular "post" when it is 1,
#            otherwise "posts".
# position - "before" produces the "earlier ... omitted" wording; any other
#            value produces the "later ... omitted" wording.
#
# NOTE(review): the whitespace around "\n" and "#{...}" inside the literals
# below looks like an artifact of how this text was extracted - confirm.
156+ def format_omitted_posts ( count , position )
157+ if position == "before"
158+ "#{ count } earlier #{ count == 1 ? "post" : "posts" } omitted\n \n "
159+ else
160+ "#{ count } later #{ count == 1 ? "post" : "posts" } omitted\n \n "
161+ end
162+ end
163+
# Renders a time-like value as "YYYY-MM-DD HH:MM" using its own strftime.
def format_date(date)
  pattern = "%Y-%m-%d %H:%M"
  date.strftime(pattern)
end
167+
# Token estimate for +text+: the length of whatever @tokenizer.tokenize returns.
def estimate_tokens(text)
  tokens = @tokenizer.tokenize(text)
  tokens.length
end
45171 end
46172 end
47173 end
0 commit comments