|
1 |
| -require "ffi" |
2 |
| -require "json" |
3 |
| - |
4 |
| -# Minimal TDLib FFI wrapper; tdlib-ruby conflicts with |
5 |
| -# telegram-bot-ruby because both depend on dry-core |
6 |
# Raw FFI bindings to TDLib's JSON client interface.
#
# The bindings are only set up when Rails runs in the development environment
# and the TDLIB_PATH environment variable holds a non-empty directory path.
# In every other case the module is left empty and none of the
# td_json_client_* functions are defined.
module TDJson
  tdlib_path = ENV["TDLIB_PATH"]

  if Rails.env.development? && tdlib_path && !tdlib_path.empty?
    extend FFI::Library

    lib_name = "tdjson"

    # Pick the platform-specific shared-library file name.
    library_file =
      if FFI::Platform.windows?
        "#{lib_name}.dll"
      elsif FFI::Platform.mac?
        "lib#{lib_name}.dylib"
      else
        "lib#{lib_name}.so"
      end

    ffi_lib File.join(tdlib_path, library_file)

    # Native function name => [argument types, return type].
    {
      td_json_client_create:  [ [], :pointer ],
      td_json_client_send:    [ [ :pointer, :string ], :void ],
      td_json_client_receive: [ [ :pointer, :double ], :string ],
      td_json_client_execute: [ [ :pointer, :string ], :string ],
      td_json_client_destroy: [ [ :pointer ], :void ]
    }.each do |function_name, (argument_types, return_type)|
      attach_function function_name, argument_types, return_type
    end
  end
end
26 |
| - |
27 |
# Small object wrapper around the TDJson bindings.
#
# Owns one native client handle (created in #initialize, released in #close)
# and a table of pending callbacks keyed by a generated request id, which is
# round-tripped through the request's "@extra" field.
class TDClient
  def initialize
    @client = TDJson.td_json_client_create
    @request_queue = {}
  end

  # Sends +query+ and registers +block+ to be called from #receive with the
  # update whose "@extra" carries the same request id.
  #
  # The caller's hash is left untouched; the "@extra" marker is added to a
  # copy.
  #
  # @param query [Hash] request payload (string keys, e.g. "@type")
  # @yieldparam update [Hash] the parsed response
  def send_async(query, &block)
    request_id = SecureRandom.uuid
    @request_queue[request_id] = block
    payload = query.merge("@extra" => { request_id: request_id }.to_json)
    TDJson.td_json_client_send(@client, JSON.dump(payload))
  end

  # Polls for the next incoming update and dispatches the matching
  # #send_async callback, if any.
  #
  # @param timeout [Float] value passed straight to td_json_client_receive
  # @return [Hash, nil] the parsed update, or nil when nothing was received
  def receive(timeout = 1.0)
    raw = TDJson.td_json_client_receive(@client, timeout)
    return unless raw

    update = JSON.parse(raw)
    request_id = extract_request_id(update["@extra"])
    callback = request_id && @request_queue.delete(request_id)
    callback&.call(update)
    update
  end

  # Runs +query+ through td_json_client_execute and parses the reply.
  #
  # @param query [Hash] request payload
  # @return [Hash, nil] parsed response, or nil when the native call returned nil
  def execute(query)
    raw = TDJson.td_json_client_execute(@client, JSON.dump(query))
    raw && JSON.parse(raw)
  end

  # Sends +query+ without registering a callback.
  #
  # NOTE: this shadows Object#send on TDClient instances; use #__send__ or
  # #public_send for dynamic dispatch on a client.
  #
  # @param query [Hash] request payload
  def send(query)
    TDJson.td_json_client_send(@client, JSON.dump(query))
  end

  # Releases the native client handle. Safe to call more than once: the handle
  # is destroyed only the first time.
  def close
    return unless @client

    TDJson.td_json_client_destroy(@client)
    @client = nil
  end

  # NOTE(review): TDLib documents that only a few requests may be run through
  # the synchronous execute call; confirm that getChat is one of them, or
  # switch to #send_async.
  def get_chat(chat_id)
    execute({
      "@type" => "getChat",
      "chat_id" => chat_id
    })
  end

  # NOTE(review): same caveat as #get_chat regarding synchronous execution.
  def get_user(user_id)
    execute({
      "@type" => "getUser",
      "user_id" => user_id
    })
  end

  private

  # Pulls the request id out of an "@extra" value produced by #send_async.
  # Any other shape (missing, non-string, non-JSON, or JSON that is not an
  # object) belongs to some other sender and yields nil instead of raising.
  def extract_request_id(extra)
    return unless extra.is_a?(String)

    decoded = JSON.parse(extra)
    decoded["request_id"] if decoded.is_a?(Hash)
  rescue JSON::ParserError
    nil
  end
end
82 |
| - |
83 | 1 | namespace :telegram do
|
84 | 2 | desc "Starts the TDLib client to listen for telegram messages"
|
85 | 3 | task listen: :environment do
|
@@ -181,28 +99,59 @@ namespace :telegram do
|
181 | 99 | end
|
182 | 100 |
|
# Offers one incoming message to the training-set collector.
#
# A message whose SHA-256 content hash already exists in TrainedMessage is
# skipped entirely. Otherwise the message text, and then the sender's user
# name, are each handed to train_message_if_needed with the same classifier.
#
# @param message_content [#to_s] text of the message
# @param group_id   id of the originating group
# @param group_name name of the originating group
# @param user_id    id of the sender
# @param user_name [String, nil] sender's display name; may be absent
def process_message(message_content, group_id, group_name, user_id, user_name)
  message_hash = Digest::SHA256.hexdigest(message_content.to_s)

  if TrainedMessage.find_by(message_hash: message_hash)
    puts "Message already exists, skipping"
    return
  end

  # Built only after the duplicate check, and shared by both passes below.
  classifier = SpamClassifierService.new(group_id, group_name)

  # Process message content
  train_message_if_needed(
    classifier,
    message_content,
    :message_content,
    group_id,
    group_name,
    user_id,
    user_name
  )

  # A missing or blank user name gives the classifier nothing to work with
  # and must not be stored as training text.
  return if user_name.to_s.strip.empty?

  # Process user name
  train_message_if_needed(
    classifier,
    user_name,
    :user_name,
    group_id,
    group_name,
    user_id,
    user_name
  )
end
| 135 | + |
| 136 | +def train_message_if_needed(classifier, text_to_classify, training_target, group_id, group_name, user_id, user_name) |
| 137 | + is_spam, _, _ = classifier.classify(text_to_classify) |
| 138 | + |
| 139 | + puts "#{training_target} classified result: #{is_spam ? 'maybe_spam' : 'maybe_ham'}" |
| 140 | + |
| 141 | + spam_count = TrainedMessage.where(message_type: [ :spam, :maybe_spam ], training_target: training_target).count |
| 142 | + ham_count = TrainedMessage.where(message_type: [ :ham, :maybe_ham ], training_target: training_target).count |
| 143 | + |
| 144 | + # Logic to balance the dataset |
| 145 | + should_create = (spam_count > ham_count && !is_spam) || (spam_count <= ham_count && is_spam) |
194 | 146 |
|
195 |
| - # Having reasonably balanced datasets is generally beneficial: it |
196 |
| - # reduces bias and improves accuracy |
197 |
| - is_spam, spam_score, ham_score = classifier.classify(message_content) |
198 |
| - puts "classified result: #{is_spam ? "maybe_spam": "maybe_ham"}" |
199 |
| - if (spam_count > ham_count && !is_spam) || (spam_count <= ham_count && is_spam) |
| 147 | + if should_create |
200 | 148 | TrainedMessage.create!(
|
201 | 149 | group_id: group_id,
|
202 | 150 | group_name: group_name,
|
203 |
| - message: message_content, |
| 151 | + message: text_to_classify, |
204 | 152 | message_type: is_spam ? :maybe_spam : :maybe_ham,
|
205 | 153 | sender_user_name: user_name || "Telegram collector",
|
| 154 | + training_target: training_target, |
206 | 155 | sender_chat_id: user_id
|
207 | 156 | )
|
208 | 157 | end
|
|
0 commit comments