Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
581e725
Initial implementation of raw mail importer
amatsuda Oct 16, 2025
352f5f5
Extract Message.from_mail to a method
amatsuda Oct 16, 2025
0917468
Mail file might not exist or might be a blank file
amatsuda Oct 16, 2025
0ad4d14
Let's assume that all mail body are encoded in ISO-2022-JP
amatsuda Oct 16, 2025
4e1c01e
Use unparsed From value
amatsuda Oct 16, 2025
4e62345
Mail body may sometimes be broken, but let's proceed anyway
amatsuda Oct 16, 2025
a4d6066
Fall back to Mail's default encoding handling
amatsuda Oct 16, 2025
1e1122c
Turned out that it's hard to do this without Kconv...
amatsuda Oct 16, 2025
6365a1f
Decode subject
amatsuda Oct 16, 2025
5e4ea8d
Retrieve parent message-id from in-reply-to, references header
amatsuda Oct 16, 2025
4110f59
rails g migration add_index_messages_message_id_header
amatsuda Oct 17, 2025
ca6f150
Workaround "string contains null byte" on ruby-dev: 13859
amatsuda Oct 17, 2025
7281ed4
Some more null bytes
amatsuda Oct 17, 2025
7feef35
Properly encode from to UTF-8
amatsuda Oct 17, 2025
e9d9a77
Mail#from_address can be nil
amatsuda Oct 17, 2025
5b5a144
Work around ruby-core: 161 mojibake
amatsuda Oct 17, 2025
d5ffd4a
There can be emails without a subject
amatsuda Oct 17, 2025
59f4ee8
Kconv does everything almost properly, indeed
amatsuda Oct 21, 2025
ff38b43
Mail#from_address could return nil
amatsuda Oct 21, 2025
6ecace8
subject can be nil
amatsuda Oct 18, 2025
ff5ff6a
Work around "Encoding::CompatibilityError: incompatible character enc…
amatsuda Oct 18, 2025
12e446a
Workaround broken subject
amatsuda Oct 18, 2025
ebf6152
message_id could include a broken mojibake char
amatsuda Oct 18, 2025
2fed221
message-id can be nil
amatsuda Oct 20, 2025
48bee57
Perf improvement by not creating AR objects
amatsuda Oct 19, 2025
2d44e5c
Report errors at once
amatsuda Oct 19, 2025
c92551a
Perf improvement by not logging debug logs
amatsuda Oct 19, 2025
7fdc658
Mail converts linebreaks to CRLF
amatsuda Oct 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion app/helpers/messages_helper.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module MessagesHelper
# Strips a leading mailing-list tag such as "[ruby-list:1] " from a subject.
#
# @param subject [String, nil] the message subject; may be nil because some
#   mails carry no Subject header
# @return [String, nil] the subject without its leading "[...]" tag, or nil
#   when no subject was given
def without_list_prefix(subject)
  # Safe navigation keeps subject-less messages from raising NoMethodError.
  subject&.sub(/^\[.+?\]\s*/, '')
end

MARGIN = 50
Expand Down
39 changes: 39 additions & 0 deletions app/models/message.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,45 @@ class Message < ApplicationRecord
self.skip_time_zone_conversion_for_attributes = [:published_at]

class << self
# Builds an unsaved Message from a parsed mail.
#
# Body, subject and From are converted to UTF-8 with Kconv. A handful of
# archived mails are special-cased by list name and sequence number
# (NUL bytes in the body, subjects that need trimming or re-encoding,
# a From header in KOI8-R).
#
# The parent message is looked up by Message-ID: first from the raw
# In-Reply-To header, then from the References header.
#
# @param mail [Mail::Message] parsed mail (as returned by Mail.read_from_string)
# @param list [List] the mailing list; only #id and #name are read here
# @param list_seq [Integer] sequence number of the mail within the list
# @return [Message] a new, unsaved record
def from_mail(mail, list, list_seq)
  body = Kconv.toutf8 mail.body.raw_source

  # These mails contain NUL bytes in their bodies; strip them out.
  has_null_bytes =
    ((list.name == 'ruby-dev') && list_seq.in?([13859, 26229, 39731, 39734])) ||
    ((list.name == 'ruby-core') && list_seq.in?([5231])) ||
    ((list.name == 'ruby-list') && list_seq.in?([29637, 29711, 30148])) ||
    ((list.name == 'ruby-talk') && list_seq.in?([5198, 61316]))
  body.gsub!("\u0000", '') if has_null_bytes

  # Per-message subject workarounds: drop the trailing character ...
  if (list.name == 'ruby-list') && list_seq.in?([37565, 38116, 43106])
    mail.header[:subject].value.chop!
  end
  # ... or convert the raw header value to UTF-8 before Mail decodes it.
  if (list.name == 'ruby-list') && list_seq.in?([41850, 43710])
    mail.header[:subject].value = Kconv.toutf8 mail.header[:subject].value
  end

  subject = mail.subject
  subject = Kconv.toutf8 subject if subject

  # The From address can be missing. It is only converted when present
  # (Kconv.toutf8 expects a String), so a missing address no longer aborts
  # the import before the fallback below gets a chance to run.
  from = mail.from_address&.raw
  from = Kconv.toutf8 from if from
  if !from && (list.name == 'ruby-core') && (list_seq == 161)
    # NOTE(review): assumes mail.from is a String for this mail -- confirm.
    from = mail.from.encode Encoding::UTF_8, Encoding::KOI8_R
  end

  message_id = mail.message_id&.encode Encoding::UTF_8, invalid: :replace, undef: :replace

  # mail.in_reply_to returns strange Array object in some cases (?), so let's use the raw value
  parent_message_id_header = extract_message_id_from_in_reply_to(mail.header[:in_reply_to]&.value)
  parent_message_id = Message.where(message_id_header: parent_message_id_header).pick(:id) if parent_message_id_header

  # Fall back to References, which may be a single id or a list of ids.
  unless parent_message_id
    case (references = mail.references)
    when String
      parent_message_id = Message.where(message_id_header: references).pick(:id)
    when Array
      references.compact.each do |ref|
        parent_message_id = Message.where(message_id_header: ref).pick(:id)
        break if parent_message_id
      end
    end
  end

  new list_id: list.id, list_seq: list_seq, body: body, subject: subject, from: from, published_at: mail.date, message_id_header: message_id, parent_id: parent_message_id
end

# Pulls the first "<...>"-delimited message id out of a raw In-Reply-To
# header value.
#
# @param header [String, nil] raw header value
# @return [String, nil] the id without its angle brackets; nil when the
#   header is nil or contains no bracketed id
private def extract_message_id_from_in_reply_to(header)
  return header unless header

  trimmed = header.strip
  trimmed[/<([^>]+)>/, 1]
end

def from_s3(list_name, list_seq, s3_client = Aws::S3::Client.new(region: BLADE_BUCKET_REGION))
obj = s3_client.get_object(bucket: BLADE_BUCKET_NAME, key: "#{list_name}/#{list_seq}")

Expand Down
42 changes: 42 additions & 0 deletions bin/import_mails
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env ruby

require 'optparse'
require 'mail'

# Imports raw mail files into the messages table.
#
# Files are read from tmp/<list name>/<sequence number> for every sequence
# number in the inclusive range --from..--to. Missing or blank files are
# skipped. Failures are reported on STDERR as they happen and printed again
# in one batch at the end.

BASE_DIR = Rails.root.join('tmp')

params = {}
OptionParser.new do |opts|
  opts.on('--list LIST')
  opts.on('--from FROM', Integer)
  opts.on('--to TO', Integer)
end.parse!(into: params)

# All three options are required. Without --to the range below would be
# endless, and without --from it could not be iterated at all.
missing = %i[list from to].reject { |key| params.key?(key) }
abort "missing required option(s): #{missing.map { |key| "--#{key}" }.join(', ')}" if missing.any?

list = List.find_by_name(params[:list])
# Fail once up front instead of failing on every sequence number.
abort "unknown list: #{params[:list]}" unless list

errors = []

# Skip debug-level logging for the duration of the import.
Rails.logger.level = Logger::INFO

Message.transaction do
  (params[:from]..params[:to]).each do |seq|
    filepath = BASE_DIR.join(list.name, seq.to_s)
    next unless filepath.exist?

    str = File.binread filepath
    next if str.blank?

    mail = Mail.read_from_string str
    # Each message gets its own savepoint. On PostgreSQL a failed statement
    # (such as a unique-constraint violation) aborts the surrounding
    # transaction, so rescuing the error without rolling back to a savepoint
    # would make every later statement fail too.
    Message.transaction(requires_new: true) do
      message = Message.from_mail mail, list, seq
      message.save!
    end
  rescue ActiveRecord::RecordNotUnique
    STDERR.puts("#{list.name}:#{seq} already exists in Postgres")
  rescue StandardError => e
    errors << [seq, e]
    STDERR.puts("failed to import #{list.name}:#{seq}: #{e}")
  end
end

pp errors if errors.any?
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Adds an index on messages.message_id_header, the column used to look up
# a parent message by its Message-ID.
class AddIndexMessagesMessageIdHeader < ActiveRecord::Migration[8.0]
  def change
    # The index is non-unique: message ids can be nil.
    # NOTE(review): confirm whether present ids are unique before tightening.
    # if_not_exists keeps the migration safe where the index was already
    # created by hand.
    add_index :messages, :message_id_header, if_not_exists: true
  end
end
2 changes: 1 addition & 1 deletion db/schema.rb

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions test/models/message_test.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
require "test_helper"

class MessageTest < ActiveSupport::TestCase
# Message.from_mail should carry the body and the Date header of a parsed
# mail over to the new record.
test 'from_mail' do
  raw_mail = <<END_OF_BODY
Subject: [ruby-list:1] Hello
From: [email protected]
Date: 2005-12-15T19:32:40+09:00

Hello, world!
END_OF_BODY
  parsed_mail = Mail.read_from_string(raw_mail)
  ruby_list = List.find_by_name('ruby-list')
  message = Message.from_mail(parsed_mail, ruby_list, 1)

  # The body is expected to come back with a CRLF line ending, although the
  # source above uses LF.
  expected_body = "Hello, world!\r\n"
  assert_equal expected_body, message.body

  expected_time = DateTime.parse('2005-12-15T19:32:40+09:00')
  assert_equal expected_time, message.published_at
end

test 'from_string' do
m = Message.from_string(<<END_OF_BODY)
Subject: [ruby-list:1] Hello
Expand Down