Skip to content

Commit dbe2ac1

Browse files
committed
optimize telegram data collector to include group name and user name
1 parent 69eb01b commit dbe2ac1

File tree

1 file changed

+124
-100
lines changed

1 file changed

+124
-100
lines changed
Lines changed: 124 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,81 @@
1+
require "ffi"
2+
require "json"
3+
4+
# Minimal TDLib FFI wrapper, tdlib-ruby is conflict with
5+
# telegram-bot-ruby as they depends on dry-core
6+
module TDJson
7+
extend FFI::Library
8+
lib_name = "tdjson"
9+
if FFI::Platform.windows?
10+
ffi_lib File.join(ENV.fetch("TDLIB_PATH"), "#{lib_name}.dll")
11+
elsif FFI::Platform.mac?
12+
ffi_lib File.join(ENV.fetch("TDLIB_PATH"), "lib#{lib_name}.dylib")
13+
else
14+
ffi_lib File.join(ENV.fetch("TDLIB_PATH"), "lib#{lib_name}.so")
15+
end
16+
17+
attach_function :td_json_client_create, [], :pointer
18+
attach_function :td_json_client_send, [ :pointer, :string ], :void
19+
attach_function :td_json_client_receive, [ :pointer, :double ], :string
20+
attach_function :td_json_client_execute, [ :pointer, :string ], :string
21+
attach_function :td_json_client_destroy, [ :pointer ], :void
22+
end
23+
24+
class TDClient
25+
def initialize
26+
@client = TDJson.td_json_client_create
27+
@request_queue = {}
28+
end
29+
30+
def send_async(query, &block)
31+
request_id = SecureRandom.uuid
32+
@request_queue[request_id] = block
33+
query["@extra"] = { request_id: request_id }.to_json
34+
TDJson.td_json_client_send(@client, JSON.dump(query))
35+
end
36+
37+
def receive(timeout = 1.0)
38+
raw = TDJson.td_json_client_receive(@client, timeout)
39+
return unless raw
40+
41+
update = JSON.parse(raw)
42+
if update["@extra"]
43+
extra = JSON.parse(update["@extra"])
44+
if extra["request_id"]
45+
callback = @request_queue.delete(extra["request_id"])
46+
callback.call(update) if callback
47+
end
48+
end
49+
update
50+
end
51+
52+
def execute(query)
53+
raw = TDJson.td_json_client_execute(@client, JSON.dump(query))
54+
raw && JSON.parse(raw)
55+
end
56+
57+
def send(query)
58+
TDJson.td_json_client_send(@client, JSON.dump(query))
59+
end
60+
61+
def close
62+
TDJson.td_json_client_destroy(@client)
63+
end
64+
65+
def get_chat(chat_id)
66+
execute({
67+
"@type" => "getChat",
68+
"chat_id" => chat_id
69+
})
70+
end
71+
72+
def get_user(user_id)
73+
execute({
74+
"@type" => "getUser",
75+
"user_id" => user_id
76+
})
77+
end
78+
end
179

280
namespace :telegram do
381
desc "Starts the TDLib client to listen for telegram messages"
@@ -23,53 +101,6 @@ namespace :telegram do
23101
next
24102
end
25103

26-
require "ffi"
27-
require "json"
28-
29-
# Minimal TDLib FFI wrapper, tdlib-ruby is conflict with
30-
# telegram-bot-ruby as they depends on dry-core
31-
module TDJson
32-
extend FFI::Library
33-
lib_name = "tdjson"
34-
if FFI::Platform.windows?
35-
ffi_lib File.join(ENV.fetch("TDLIB_PATH"), "#{lib_name}.dll")
36-
elsif FFI::Platform.mac?
37-
ffi_lib File.join(ENV.fetch("TDLIB_PATH"), "lib#{lib_name}.dylib")
38-
else
39-
ffi_lib File.join(ENV.fetch("TDLIB_PATH"), "lib#{lib_name}.so")
40-
end
41-
42-
attach_function :td_json_client_create, [], :pointer
43-
attach_function :td_json_client_send, [ :pointer, :string ], :void
44-
attach_function :td_json_client_receive, [ :pointer, :double ], :string
45-
attach_function :td_json_client_execute, [ :pointer, :string ], :string
46-
attach_function :td_json_client_destroy, [ :pointer ], :void
47-
end
48-
49-
class TDClient
50-
def initialize
51-
@client = TDJson.td_json_client_create
52-
end
53-
54-
def send(query)
55-
TDJson.td_json_client_send(@client, JSON.dump(query))
56-
end
57-
58-
def receive(timeout = 1.0)
59-
raw = TDJson.td_json_client_receive(@client, timeout)
60-
raw && JSON.parse(raw)
61-
end
62-
63-
def execute(query)
64-
raw = TDJson.td_json_client_execute(@client, JSON.dump(query))
65-
raw && JSON.parse(raw)
66-
end
67-
68-
def close
69-
TDJson.td_json_client_destroy(@client)
70-
end
71-
end
72-
73104
client = TDClient.new
74105

75106
# Set log level
@@ -131,34 +162,8 @@ namespace :telegram do
131162
else
132163
puts "Unhandled authorization state: #{new_state}"
133164
end
134-
135165
when "updateNewMessage"
136-
message = update["message"]
137-
chat_id = message["chat_id"]
138-
content = message["content"]
139-
140-
if content["@type"] == "messageText"
141-
message_content = content["text"]["text"]
142-
process_message(message_content)
143-
puts "Chat ID: #{chat_id} | Text: #{message_content}"
144-
else
145-
puts "Chat ID: #{chat_id} | Type: #{content['@type']}"
146-
end
147-
puts "----------------------"
148-
149-
when "updateMessageContent"
150-
chat_id = update["chat_id"]
151-
new_content = update["new_content"]
152-
153-
if new_content["@type"] == "messageText"
154-
message_content = new_content["text"]["text"]
155-
process_message(message_content)
156-
puts "Chat ID: #{chat_id} | New Text: #{message_content}"
157-
else
158-
puts "Chat ID: #{chat_id} | New Type: #{new_content['@type']}"
159-
end
160-
puts "----------------------"
161-
166+
handleUpdateNewMessage(update, client)
162167
else
163168
# ignore other updates
164169
end
@@ -172,9 +177,7 @@ namespace :telegram do
172177
end
173178
end
174179

175-
def process_message(message_content)
176-
group_id = GroupClassifierState::TELEGRAM_DATA_COLLECTOR_GROUP_ID
177-
group_name = GroupClassifierState::TELEGRAM_DATA_COLLECTOR_GROUP_NAME
180+
def process_message(message_content, group_id, group_name, user_id, user_name)
178181
classifier = SpamClassifierService.new(group_id, group_name)
179182
message_hash = Digest::SHA256.hexdigest(message_content.to_s)
180183
existing_message = TrainedMessage.find_by(message_hash: message_hash)
@@ -190,30 +193,51 @@ def process_message(message_content)
190193
# reduces bias and improves accuracy
191194
is_spam, spam_score, ham_score = classifier.classify(message_content)
192195
puts "classified result: #{is_spam ? "maybe_spam": "maybe_ham"}"
193-
if spam_count > ham_count
194-
# only interested in ham
195-
if !is_spam
196-
TrainedMessage.create!(
197-
group_id: group_id,
198-
group_name: group_name,
199-
message: message_content,
200-
message_type: :maybe_ham,
201-
sender_chat_id: 0,
202-
sender_user_name: "Telegram collector"
203-
)
204-
end
196+
if (spam_count > ham_count && !is_spam) || (spam_count <= ham_count && is_spam)
197+
TrainedMessage.create!(
198+
group_id: group_id,
199+
group_name: group_name,
200+
message: message_content,
201+
message_type: is_spam ? :maybe_spam : :maybe_ham,
202+
sender_user_name: user_name || "Telegram collector",
203+
sender_chat_id: user_id
204+
)
205+
end
206+
end
205207

206-
else
207-
# interested in spam
208-
if is_spam
209-
TrainedMessage.create!(
210-
group_id: group_id,
211-
group_name: group_name,
212-
message: message_content,
213-
message_type: :maybe_spam,
214-
sender_chat_id: 0,
215-
sender_user_name: "Telegram collector"
216-
)
208+
def handleUpdateNewMessage(update, client)
209+
message = update["message"]
210+
chat_id = message["chat_id"]
211+
sender_id = message["sender_id"]["user_id"] rescue nil
212+
content = message["content"]
213+
user_name = "Unknown User"
214+
message_content = content["text"]["text"] if content["@type"] == "messageText"
215+
216+
# Send asynchronous requests for chat and user data
217+
client.send_async({ "@type" => "getChat", "chat_id" => chat_id }) do |chat_update|
218+
chat = chat_update
219+
group_name = chat["title"] || "Unknown Group"
220+
221+
if sender_id
222+
client.send_async({ "@type" => "getUser", "user_id" => sender_id }) do |user_update|
223+
user = user_update
224+
user_name = user["first_name"]
225+
user_name += " " + user["last_name"] if user["last_name"]
226+
user_name = user["username"] if user["username"]
227+
228+
unless message_content.blank?
229+
process_message(message_content, chat_id, group_name, sender_id, user_name)
230+
puts "Group(#{chat_id}): #{group_name} | User: #{user_name} | Text: #{message_content}"
231+
puts "----------------------"
232+
end
233+
end
234+
else
235+
# Handle cases where there's no sender_id (e.g., channel posts)
236+
unless message_content.blank?
237+
process_message(message_content, chat_id, group_name, sender_id, user_name)
238+
puts "Group: #{group_name} | User: #{user_name} | Text: #{message_content}"
239+
puts "----------------------"
240+
end
217241
end
218242
end
219243
end

0 commit comments

Comments
 (0)