Skip to content

Commit c70acb7

Browse files
authored
Merge pull request #86 from yokenzan/feature/master-csv-loading-optimization
perf(master): optimize loading and search caching
2 parents 99e365c + 8f48db9 commit c70acb7

File tree

9 files changed

+732
-5
lines changed

9 files changed

+732
-5
lines changed

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,10 @@
99
/vendor/
1010
Gemfile.lock
1111
/node_modules
12+
/sorbet/
13+
/bin/tapioca
14+
/.serena/
15+
/.vscode/
16+
/csv/master/*/utf8/
17+
/csv/master/*/.cache/
18+

Rakefile

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,33 @@
11
# frozen_string_literal: true
22

33
require 'bundler/gem_tasks'
4+
require 'fileutils'
45
require 'rspec/core/rake_task'
56
require 'rubocop/rake_task'
67

78
RSpec::Core::RakeTask.new(:test)
89
RuboCop::RakeTask.new(:lint)
910

1011
task default: :test
12+
13+
# Maintenance tasks for the bundled master CSV data.
namespace :master do
  desc 'Shift_JISのマスタCSV/TXTをUTF-8へ変換して保存する'
  task :convert_utf8 do
    master_root = File.expand_path('csv/master', __dir__)

    # One directory per master version year; skip stray files.
    Dir.glob(File.join(master_root, '*'))
       .select { |entry| File.directory?(entry) }
       .each do |year_dir|
      destination_dir = File.join(year_dir, 'utf8')
      FileUtils.mkdir_p(destination_dir)

      Dir.glob(File.join(year_dir, '*.{csv,txt,CSV,TXT}')).each do |sjis_path|
        raw         = File.binread(sjis_path).force_encoding('Shift_JIS')
        # Replace undecodable bytes rather than abort the whole conversion.
        converted   = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
        destination = File.join(destination_dir, File.basename(sjis_path))
        File.binwrite(destination, converted)
        puts "converted: #{sjis_path} -> #{destination}"
      end
    end
  end
end

benchmark/master_loading.rb

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
# frozen_string_literal: true
2+
3+
require 'benchmark'
4+
require 'logger'
5+
require 'open3'
6+
require 'pathname'
7+
require_relative '../lib/receiptisan'
8+
9+
# rubocop:disable Metrics/ModuleLength
10+
module MasterLoadingBenchmark
11+
VERSION = Receiptisan::Model::ReceiptComputer::Master::Version::V2024_R06
12+
MASTER = Receiptisan::Model::ReceiptComputer::Master
13+
LOADER_TYPES = %i[
14+
shinryou_koui
15+
iyakuhin
16+
tokutei_kizai
17+
comment
18+
shoubyoumei
19+
shuushokugo
20+
].freeze
21+
22+
# Accumulates per-loader timing totals reported by the instrumented
# LoaderTrait#foreach runs.
class ForeachCollector
  # One aggregated row per loader class name.
  Result = Struct.new(:loader, :io_and_encoding, :string_processing, :object_building, :lines, keyword_init: true)

  def initialize
    @results = {}
  end

  # Folds one foreach invocation's phase timings into the loader's totals.
  def record(loader_name, io_and_encoding:, string_processing:, object_building:, lines:)
    entry = result_for(loader_name)
    entry.io_and_encoding   += io_and_encoding
    entry.string_processing += string_processing
    entry.object_building   += object_building
    entry.lines             += lines
  end

  # @return [Array<Result>] aggregated rows ordered by loader name
  def to_a
    @results.values.sort_by(&:loader)
  end

  private

  # Fetches (or lazily creates) the zero-initialized Result for a loader.
  def result_for(loader_name)
    @results[loader_name] ||= Result.new(
      loader:            loader_name,
      io_and_encoding:   0.0,
      string_processing: 0.0,
      object_building:   0.0,
      lines:             0
    )
  end
end
49+
50+
# Process-wide slot through which the prepended instrumentation hands its
# timings to the currently active collector (nil disables recording).
module ForeachProfiler
  @collector = nil

  def self.collector
    @collector
  end

  def self.collector=(value)
    @collector = value
  end
end
55+
56+
# Prepended onto Master::Loader::LoaderTrait to time the three phases of
# CSV ingestion — file read + transcode, per-row string splitting, and the
# object building done by the caller's block — without changing the values
# yielded to that block. Relies on the host's `logger` and
# `resolve_load_path`.
module LoaderTraitInstrument
  def foreach(csv_paths)
    logger.info 'prepare to load following CSV %d files:' % csv_paths.length
    logger.info csv_paths.map(&:to_path)

    totals = { io_and_encoding: 0.0, string_processing: 0.0, object_building: 0.0 }
    row_count = 0

    csv_paths.each do |csv_path|
      load_path, read_encoding = resolve_load_path(csv_path)
      contents = nil
      totals[:io_and_encoding] += Benchmark.realtime do
        contents = File.read(load_path, mode: "r:#{read_encoding}:UTF-8")
      end

      rows = contents.split("\n")
      rows.each do |row|
        values = nil
        totals[:string_processing] += Benchmark.realtime do
          values = row.delete_suffix("\r").tr('"', '').split(',')
        end
        totals[:object_building] += Benchmark.realtime do
          yield values
        end
        row_count += 1
      end

      logger.info "#{load_path}(#{rows.length} lines) was loaded."
    end

    # Recording is a no-op unless a benchmark installed a collector.
    ForeachProfiler.collector&.record(
      self.class.name.split('::').last,
      io_and_encoding:   totals[:io_and_encoding],
      string_processing: totals[:string_processing],
      object_building:   totals[:object_building],
      lines:             row_count
    )
  end
end
97+
98+
module_function
99+
100+
# Entry point: installs the foreach instrumentation, prints the header,
# and runs every benchmark section in order.
def run
  Receiptisan::Model::ReceiptComputer::Master::Loader::LoaderTrait.prepend(LoaderTraitInstrument)
  puts '=== Master CSV Loading Benchmark ===', "Version: #{VERSION}", ''

  benchmark_full_loading
  benchmark_by_loader_type
  benchmark_search_command
end
110+
111+
# Times three consecutive Loader#load runs and reports cold/warm wall
# clock plus the RSS delta of each run (KB, via rss_kb).
def benchmark_full_loading
  puts '[1] Full load benchmark (Loader#load)'
  elapsed_times = []
  rss_deltas = []

  (1..3).each do |run_number|
    rss_before = rss_kb
    elapsed = Benchmark.realtime { build_loader.load(VERSION) }
    rss_delta = rss_kb - rss_before
    elapsed_times << elapsed
    rss_deltas << rss_delta
    puts format(' run%-2d: %.3fs (RSS %+d KB)', run_number, elapsed, rss_delta)
  end

  # First run is the cold (cache-less) case; later runs hit the cache.
  puts format(' cold run: %.3fs', elapsed_times.first)
  puts format(' warm avg: %.3fs', average(elapsed_times.drop(1)))
  puts format(' RSS diff avg: %.1f KB', average(rss_deltas))
  puts
end
130+
131+
# Times Loader#load_type per loader type and prints the foreach phase
# breakdown (io+encoding / split / build) captured by ForeachCollector.
def benchmark_by_loader_type
  puts '[2][3] Breakdown by loader type and foreach phases'
  LOADER_TYPES.each do |loader_type|
    begin
      timings = []
      phase_collector = ForeachCollector.new

      (1..3).each do |run_number|
        ForeachProfiler.collector = phase_collector
        elapsed = Benchmark.realtime { build_loader.load_type(VERSION, loader_type) }
        timings << elapsed
        puts format(' %-15s run%-2d: %.3fs', loader_type, run_number, elapsed)
      end

      puts format(' %-15s cold: %.3fs / warm avg: %.3fs', loader_type, timings.first, average(timings.drop(1)))
      phase_collector.to_a.each do |row|
        puts format(
          ' %-18s io+enc: %.3fs | split: %.3fs | build: %.3fs | lines: %d',
          row.loader,
          row.io_and_encoding,
          row.string_processing,
          row.object_building,
          row.lines
        )
      end
    ensure
      # Never leave the global collector installed after this type's runs.
      ForeachProfiler.collector = nil
    end
  end
  puts
end
160+
161+
# End-to-end timing of the `receiptisan search` CLI via fresh subprocesses
# (covers interpreter boot, master load, and the search itself).
def benchmark_search_command
  puts '[4][5] receiptisan search end-to-end'
  scenarios = [
    ['max-shinryou', %w[--type shinryou-koui --name 初診 --month 202406]],
    ['mid-iyakuhin', %w[--type iyakuhin --name アセト --month 202406]],
    ['small-kizai', %w[--type tokutei-kizai --name カテーテル --month 202406]],
  ]

  scenarios.each do |label, cli_args|
    durations = []
    (1..3).each do |run_number|
      durations << Benchmark.realtime do
        stdout, stderr, status = Open3.capture3(
          'bundle', 'exec', 'ruby', 'exe/receiptisan', 'search', *cli_args,
          chdir: repo_root
        )
        # `next` leaves the realtime block early on success; a failure
        # aborts the whole benchmark with the captured output.
        next if status.success?

        raise "search failed (#{label}): #{stderr}\n#{stdout}"
      end
      puts format(' %-12s run%-2d: %.3fs', label, run_number, durations.last)
    end

    puts format(' %-12s cold: %.3fs / warm avg: %.3fs', label, durations.first, average(durations.drop(1)))
  end
  puts
end
189+
190+
# Builds a fresh Loader with a silent logger so every benchmark run starts
# without loader-level state.
def build_loader
  resolver = MASTER::ResourceResolver.new
  MASTER::Loader.new(resolver, Logger.new(nil))
end
193+
194+
# Current resident set size of this process in KB, read from the Linux
# /proc filesystem.
#
# @return [Integer] RSS in KB, or 0 when unavailable (no VmRSS line, or a
#   platform without /proc such as macOS/Windows — previously this raised
#   Errno::ENOENT there and aborted the benchmark)
def rss_kb
  line = File.read('/proc/self/status').each_line.find { |l| l.start_with?('VmRSS:') }
  return 0 unless line

  # Line format: "VmRSS:\t  12345 kB" — second field is the value in KB.
  line.split[1].to_i
rescue SystemCallError
  # /proc/self/status does not exist on non-Linux platforms; report 0 so
  # the RSS columns degrade gracefully instead of crashing.
  0
end
200+
201+
# Arithmetic mean of values as a Float; 0.0 for an empty collection.
def average(values)
  count = values.length
  count.zero? ? 0.0 : values.sum(0.0) / count
end
206+
207+
# Absolute path (String) to the repository root — the parent of the
# directory containing this benchmark script.
def repo_root
  File.expand_path('..', __dir__)
end
210+
# rubocop:enable Metrics/ModuleLength
211+
end
212+
213+
MasterLoadingBenchmark.run if $PROGRAM_NAME == __FILE__

lib/receiptisan/model/receipt_computer/master/loader.rb

Lines changed: 82 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# frozen_string_literal: true
22

3+
require 'fileutils'
4+
require 'pathname'
5+
36
require_relative 'loader/loader_trait'
47
require_relative 'loader/shinryou_koui_loader'
58
require_relative 'loader/iyakuhin_loader'
@@ -31,8 +34,12 @@ def load(version)
3134
logger.info("preparing to load master version #{version.year}")
3235

3336
csv_paths = @resource_resolver.detect_csv_files(version)
37+
cache_path = detect_cache_path(csv_paths)
38+
cache = load_from_cache(cache_path, csv_paths)
39+
return cache if cache
3440

35-
load_from_version_and_csv(version, **csv_paths).tap do
41+
load_from_version_and_csv(version, **csv_paths).tap do | master |
42+
write_cache(cache_path, master)
3643
logger.info("loading master version #{version.year} completed")
3744
end
3845
end
@@ -71,7 +78,20 @@ def load_from_version_and_csv(
7178
# @return [Hash]
7279
def load_type(version, type)
7380
csv_paths = @resource_resolver.detect_csv_files(version)
81+
cache_path = detect_type_cache_path(csv_paths, type)
82+
cache = load_from_cache(cache_path, csv_paths)
83+
return cache if cache
84+
85+
load_type_from_csv_paths(version, type, csv_paths).tap do | loaded |
86+
write_cache(cache_path, loaded)
87+
end
88+
end
7489

90+
# @param version [Version]
91+
# @param type [Symbol]
92+
# @param csv_paths [Hash<Symbol, Array<Pathname>>]
93+
# @return [Hash]
94+
def load_type_from_csv_paths(version, type, csv_paths)
7595
case type
7696
when :shinryou_koui
7797
@shinryou_koui_loader.load(version, csv_paths[:shinryou_koui_csv_path])
@@ -92,6 +112,67 @@ def load_type(version, type)
92112

93113
private
94114

115+
# Location of the whole-master cache file: a `.cache` directory placed
# alongside the version's CSV files.
#
# @param csv_paths [Hash<Symbol, Array<Pathname>>]
# @return [Pathname]
def detect_cache_path(csv_paths)
  version_dir = csv_paths.values.flatten.first.parent
  version_dir.join('.cache', 'master.marshal')
end
121+
122+
# Location of a per-type cache file (e.g. `shinryou_koui.marshal`) in the
# `.cache` directory alongside the version's CSV files.
#
# @param csv_paths [Hash<Symbol, Array<Pathname>>]
# @param type [Symbol]
# @return [Pathname]
def detect_type_cache_path(csv_paths, type)
  version_dir = csv_paths.values.flatten.first.parent
  version_dir.join('.cache', "#{type}.marshal")
end
129+
130+
# @param cache_path [Pathname]
# @param csv_paths [Hash<Symbol, Array<Pathname>>]
# @return [Master, nil]
#
# Deserializes a previously written master cache. Returns nil when the
# cache is absent or stale (see #cache_available?) or when deserialization
# fails for any reason — callers then fall back to loading from CSV.
def load_from_cache(cache_path, csv_paths)
  return nil unless cache_available?(cache_path, csv_paths)

  logger.info("loading master cache: #{cache_path}")
  # Cache files are produced locally by #write_cache, never from untrusted
  # input, so Marshal is acceptable here.
  # rubocop:disable Security/MarshalLoad
  Marshal.load(cache_path.binread)
  # rubocop:enable Security/MarshalLoad
rescue StandardError => e
  # Best-effort caching: any read/deserialize problem is logged and the
  # caller silently rebuilds from the CSV sources.
  logger.warn("failed to load cache(#{cache_path}): #{e.class}: #{e.message}")
  nil
end
144+
145+
# @param cache_path [Pathname]
# @param master [Master]
# @return [void]
#
# Serializes the loaded master to cache_path. The dump is written to a
# temporary sibling file and renamed into place so a concurrent reader
# never observes a truncated cache (a plain binwrite could briefly expose
# a partial file whose fresh mtime passes #cache_available?). Failures are
# logged and swallowed because caching is strictly best-effort.
def write_cache(cache_path, master)
  FileUtils.mkdir_p(cache_path.dirname)
  temp_path = cache_path.sub_ext(".tmp#{Process.pid}")
  temp_path.binwrite(Marshal.dump(master))
  # Same-directory rename is atomic on POSIX filesystems.
  File.rename(temp_path, cache_path)
rescue StandardError => e
  logger.warn("failed to write cache(#{cache_path}): #{e.class}: #{e.message}")
end
154+
155+
# @param cache_path [Pathname]
# @param csv_paths [Hash<Symbol, Array<Pathname>>]
# @return [Boolean] true when the cache file exists and is at least as new
#   as every CSV/TXT source under the version directory
def cache_available?(cache_path, csv_paths)
  return false unless cache_path.exist?

  source_dir = csv_paths.values.flatten.first.parent
  newest_source_mtime = target_source_paths(source_dir).map(&:mtime).max

  # No sources found at all — trust the existing cache.
  newest_source_mtime.nil? || cache_path.mtime >= newest_source_mtime
end
168+
169+
# All master source files (CSV/TXT in either case) under target_dir,
# searched recursively so the utf8/ copies are also considered.
#
# @param target_dir [Pathname]
# @return [Array<Pathname>]
def target_source_paths(target_dir)
  glob_pattern = target_dir.join('**', '*.{csv,txt,CSV,TXT}').to_path
  Dir.glob(glob_pattern).map { |found| Pathname.new(found) }
end
175+
95176
attr_reader :logger
96177
end
97178
end

0 commit comments

Comments
 (0)