Skip to content

Commit b04a117

Browse files
committed
refactor the structure of telegram_spam_sniper_bot
1. Move the TDJson module and TDClient class 2. Classify user_name in message handler as well
1 parent 1c59495 commit b04a117

File tree

2 files changed

+120
-90
lines changed

2 files changed

+120
-90
lines changed

app/services/tdlib_client.rb

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
require "ffi"
2+
require "json"
3+
4+
# Minimal FFI binding to TDLib's JSON interface. The tdlib-ruby gem is not
# used because it conflicts with telegram-bot-ruby: both depend on dry-core.
#
# The native library is bound only when Rails runs in development AND
# TDLIB_PATH names a non-empty directory. In every other case the module body
# does nothing and none of the td_json_client_* functions are defined.
module TDJson
  tdlib_dir = ENV["TDLIB_PATH"].to_s

  if Rails.env.development? && !tdlib_dir.empty?
    extend FFI::Library

    # The shared-library file name depends on the host platform.
    library_file =
      if FFI::Platform.windows?
        "tdjson.dll"
      elsif FFI::Platform.mac?
        "libtdjson.dylib"
      else
        "libtdjson.so"
      end
    ffi_lib File.join(tdlib_dir, library_file)

    # function name => [ argument types, return type ]
    {
      td_json_client_create: [ [], :pointer ],
      td_json_client_send: [ [ :pointer, :string ], :void ],
      td_json_client_receive: [ [ :pointer, :double ], :string ],
      td_json_client_execute: [ [ :pointer, :string ], :string ],
      td_json_client_destroy: [ [ :pointer ], :void ]
    }.each do |function_name, (argument_types, return_type)|
      attach_function function_name, argument_types, return_type
    end
  end
end
26+
27+
# Object wrapper around a single TDLib JSON client handle obtained through
# TDJson. Queries are plain Hashes serialised with JSON.dump; answers are
# parsed back into Hashes.
#
# NOTE(review): #send shadows Object#send on instances of this class. It is
# kept for existing callers; use #__send__ or #public_send for dynamic dispatch.
class TDClient
  # Creates the native client handle and an empty table of pending
  # #send_async callbacks keyed by request id.
  def initialize
    @client = TDJson.td_json_client_create
    @request_queue = {}
  end

  # Sends +query+ and registers +block+ to be called by #receive with the
  # update that carries the same request id.
  #
  # The request id travels in the "@extra" field as a JSON-encoded object.
  # The caller's Hash is left untouched: the field is added to a copy.
  #
  # @param query [Hash] TDLib request
  # @yieldparam update [Hash] the parsed answer, passed from #receive
  def send_async(query, &block)
    request_id = SecureRandom.uuid
    @request_queue[request_id] = block
    tagged_query = query.merge("@extra" => JSON.generate({ "request_id" => request_id }))
    TDJson.td_json_client_send(@client, JSON.dump(tagged_query))
  end

  # Fetches the next incoming update, if any, and runs the callback registered
  # by #send_async for it.
  #
  # An "@extra" value that is not a JSON-encoded object with a "request_id"
  # key (for example one set by a caller of #send) is ignored instead of
  # raising, so the update is still returned.
  #
  # @param timeout [Float] passed through to td_json_client_receive
  # @return [Hash, nil] the parsed update, or nil when nothing was received
  def receive(timeout = 1.0)
    raw = TDJson.td_json_client_receive(@client, timeout)
    return unless raw

    update = JSON.parse(raw)
    request_id = pending_request_id(update["@extra"])
    callback = request_id && @request_queue.delete(request_id)
    callback&.call(update)
    update
  end

  # Runs +query+ through td_json_client_execute and parses the answer.
  #
  # @param query [Hash] TDLib request
  # @return [Hash, nil] the parsed answer, or nil when TDLib returned nothing
  def execute(query)
    raw = TDJson.td_json_client_execute(@client, JSON.dump(query))
    raw && JSON.parse(raw)
  end

  # Sends +query+ as-is; the answer, if any, arrives later through #receive.
  #
  # @param query [Hash] TDLib request
  def send(query)
    TDJson.td_json_client_send(@client, JSON.dump(query))
  end

  # Destroys the native client handle. Calling it again is a no-op, so the
  # handle is never destroyed twice.
  def close
    return unless @client

    TDJson.td_json_client_destroy(@client)
    @client = nil
  end

  # Looks up a chat by id through #execute.
  #
  # NOTE(review): TDLib documents td_json_client_execute as limited to requests
  # marked "can be called synchronously"; confirm getChat qualifies, otherwise
  # this returns an error object and should go through #send_async instead.
  def get_chat(chat_id)
    execute({
      "@type" => "getChat",
      "chat_id" => chat_id
    })
  end

  # Looks up a user by id through #execute.
  #
  # NOTE(review): same caveat as #get_chat; confirm getUser may be executed
  # synchronously.
  def get_user(user_id)
    execute({
      "@type" => "getUser",
      "user_id" => user_id
    })
  end

  private

  # Extracts the request id from an "@extra" value written by #send_async.
  # Returns nil for anything else: a missing value, a non-String value, a
  # String that is not valid JSON, or JSON that is not an object.
  def pending_request_id(extra)
    return unless extra.is_a?(String)

    parsed = JSON.parse(extra)
    parsed["request_id"] if parsed.is_a?(Hash)
  rescue JSON::ParserError
    nil
  end
end

lib/tasks/telegram_data_collector.rake

Lines changed: 39 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -1,85 +1,3 @@
1-
require "ffi"
2-
require "json"
3-
4-
# Minimal TDLib FFI wrapper, tdlib-ruby is conflict with
5-
# telegram-bot-ruby as they depends on dry-core
6-
module TDJson
7-
tdlib_path = ENV["TDLIB_PATH"]
8-
if Rails.env.development? && tdlib_path && !tdlib_path.empty?
9-
extend FFI::Library
10-
lib_name = "tdjson"
11-
if FFI::Platform.windows?
12-
ffi_lib File.join(ENV.fetch("TDLIB_PATH"), "#{lib_name}.dll")
13-
elsif FFI::Platform.mac?
14-
ffi_lib File.join(ENV.fetch("TDLIB_PATH"), "lib#{lib_name}.dylib")
15-
else
16-
ffi_lib File.join(ENV.fetch("TDLIB_PATH"), "lib#{lib_name}.so")
17-
end
18-
19-
attach_function :td_json_client_create, [], :pointer
20-
attach_function :td_json_client_send, [ :pointer, :string ], :void
21-
attach_function :td_json_client_receive, [ :pointer, :double ], :string
22-
attach_function :td_json_client_execute, [ :pointer, :string ], :string
23-
attach_function :td_json_client_destroy, [ :pointer ], :void
24-
end
25-
end
26-
27-
class TDClient
28-
def initialize
29-
@client = TDJson.td_json_client_create
30-
@request_queue = {}
31-
end
32-
33-
def send_async(query, &block)
34-
request_id = SecureRandom.uuid
35-
@request_queue[request_id] = block
36-
query["@extra"] = { request_id: request_id }.to_json
37-
TDJson.td_json_client_send(@client, JSON.dump(query))
38-
end
39-
40-
def receive(timeout = 1.0)
41-
raw = TDJson.td_json_client_receive(@client, timeout)
42-
return unless raw
43-
44-
update = JSON.parse(raw)
45-
if update["@extra"]
46-
extra = JSON.parse(update["@extra"])
47-
if extra["request_id"]
48-
callback = @request_queue.delete(extra["request_id"])
49-
callback.call(update) if callback
50-
end
51-
end
52-
update
53-
end
54-
55-
def execute(query)
56-
raw = TDJson.td_json_client_execute(@client, JSON.dump(query))
57-
raw && JSON.parse(raw)
58-
end
59-
60-
def send(query)
61-
TDJson.td_json_client_send(@client, JSON.dump(query))
62-
end
63-
64-
def close
65-
TDJson.td_json_client_destroy(@client)
66-
end
67-
68-
def get_chat(chat_id)
69-
execute({
70-
"@type" => "getChat",
71-
"chat_id" => chat_id
72-
})
73-
end
74-
75-
def get_user(user_id)
76-
execute({
77-
"@type" => "getUser",
78-
"user_id" => user_id
79-
})
80-
end
81-
end
82-
831
namespace :telegram do
842
desc "Starts the TDLib client to listen for telegram messages"
853
task listen: :environment do
@@ -181,28 +99,59 @@ namespace :telegram do
18199
end
182100

183101
# Feeds one incoming Telegram message into the training-data collector.
#
# A message whose SHA-256 content hash is already stored in TrainedMessage is
# skipped entirely. Otherwise the message text, and then the sender's user
# name, are each handed to train_message_if_needed with a single shared
# SpamClassifierService instance.
#
# @param message_content [String, nil] message text; hashed via #to_s
# @param group_id group identifier forwarded to the classifier and trainer
# @param group_name group name forwarded to the classifier and trainer
# @param user_id sender identifier forwarded to the trainer
# @param user_name [String, nil] sender name; not trained on when nil or blank
def process_message(message_content, group_id, group_name, user_id, user_name)
  message_hash = Digest::SHA256.hexdigest(message_content.to_s)
  if TrainedMessage.exists?(message_hash: message_hash)
    puts "Message already exists, skipping"
    return
  end

  # Built only after the duplicate check so skipped messages cost nothing,
  # and reused for both training passes below.
  classifier = SpamClassifierService.new(group_id, group_name)

  train_message_if_needed(
    classifier,
    message_content,
    :message_content,
    group_id,
    group_name,
    user_id,
    user_name
  )

  # A sender without a usable name gives nothing to classify or store.
  return if user_name.to_s.strip.empty?

  train_message_if_needed(
    classifier,
    user_name,
    :user_name,
    group_id,
    group_name,
    user_id,
    user_name
  )
end
135+
136+
def train_message_if_needed(classifier, text_to_classify, training_target, group_id, group_name, user_id, user_name)
137+
is_spam, _, _ = classifier.classify(text_to_classify)
138+
139+
puts "#{training_target} classified result: #{is_spam ? 'maybe_spam' : 'maybe_ham'}"
140+
141+
spam_count = TrainedMessage.where(message_type: [ :spam, :maybe_spam ], training_target: training_target).count
142+
ham_count = TrainedMessage.where(message_type: [ :ham, :maybe_ham ], training_target: training_target).count
143+
144+
# Logic to balance the dataset
145+
should_create = (spam_count > ham_count && !is_spam) || (spam_count <= ham_count && is_spam)
194146

195-
# Having reasonably balanced datasets is generally beneficial for
196-
# reduces bias and improves accuracy
197-
is_spam, spam_score, ham_score = classifier.classify(message_content)
198-
puts "classified result: #{is_spam ? "maybe_spam": "maybe_ham"}"
199-
if (spam_count > ham_count && !is_spam) || (spam_count <= ham_count && is_spam)
147+
if should_create
200148
TrainedMessage.create!(
201149
group_id: group_id,
202150
group_name: group_name,
203-
message: message_content,
151+
message: text_to_classify,
204152
message_type: is_spam ? :maybe_spam : :maybe_ham,
205153
sender_user_name: user_name || "Telegram collector",
154+
training_target: training_target,
206155
sender_chat_id: user_id
207156
)
208157
end

0 commit comments

Comments
 (0)