Skip to content

Commit 02a8a1f

Browse files
authored
Merge pull request #1 from mbeijen/claude/fix-encoding
Fix UTF-8 encoding in scraper; add DB repair script
2 parents ac8094b + dcebe5f commit 02a8a1f

File tree

2 files changed

+52
-4
lines changed

2 files changed

+52
-4
lines changed

lib/tasks/get_latest.rake

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
gameIds = Array.new
1616
for i in args.arg1.to_i..args.arg2.to_i
1717
seasonsUrl = 'http://j-archive.com/showseason.php?season='+i.to_s
18-
seasonList = Nokogiri::HTML(URI.open(seasonsUrl))
18+
seasonList = Nokogiri::HTML(URI.open(seasonsUrl).read, nil, 'UTF-8')
1919
linkList = seasonList.css('table td a')
2020
linkList.each do |ll|
2121
href = ll.attr('href');
@@ -27,7 +27,7 @@
2727

2828
gameIds.each do |gid|
2929
gameurl = 'http://www.j-archive.com/showgame.php?game_id='+gid.to_s
30-
game = Nokogiri::HTML(URI.open(gameurl))
30+
game = Nokogiri::HTML(URI.open(gameurl).read, nil, 'UTF-8')
3131

3232
## OK, we're going to do this twice, once for each round
3333
questions = game.css("#jeopardy_round .clue")
@@ -66,8 +66,7 @@
6666
answerDiv = game.css("#".concat(id))
6767

6868
#=========== Set Answer =============
69-
answermatch = /ponse">(.*)<\/e/.match(answerDiv.to_html)
70-
var_answer = answermatch.captures[0].to_s
69+
var_answer = answerDiv.css('em.correct_response').first&.text().to_s
7170
var_question = q.css('.clue_text').first.text()
7271
index = q.xpath('count(preceding-sibling::*)').to_i
7372
var_category = categoryArr[index]

script/fix_encoding.rb

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Repair mojibake in clues imported before the Nokogiri UTF-8 fix.
2+
#
3+
# Root cause: Nokogiri parsed j-archive.com (UTF-8) as Latin-1, so each
4+
# multi-byte UTF-8 sequence was stored as individual Latin-1 codepoints.
5+
# Questions have plain mojibake ("ElysÃ©e"). Answers additionally have
6+
# HTML-entity-encoded mojibake ("ap&Atilde;&copy;ritif") because the
7+
# scraper used .to_html before extracting the text.
8+
#
9+
# Fix: for questions, re-encode as ISO-8859-1 bytes then force UTF-8.
10+
# for answers, HTML-unescape first, then same re-encoding.
11+
#
12+
# Run with:
13+
# cd /srv/jservice && rails runner script/fix_encoding.rb
14+
#
15+
require 'cgi'
16+
17+
# Undo "UTF-8 bytes read as Latin-1" mojibake in +s+.
#
# Every character of +s+ is mapped back to the single ISO-8859-1 byte it
# stands for, and the resulting byte sequence is reinterpreted as UTF-8.
#
# Returns the repaired string, or +s+ itself (unchanged) when:
# * +s+ is nil;
# * +s+ contains a character with no ISO-8859-1 equivalent, or is otherwise
#   not convertible, so it cannot be Latin-1 mojibake;
# * the recovered bytes are not valid UTF-8 (e.g. an already-correct "café",
#   whose "é" would turn into a lone 0xE9 byte).
#
# The result is therefore always either the input or a validly encoded
# UTF-8 string; callers never receive a string with broken encoding.
def fix_mojibake(s)
  return s if s.nil?

  repaired = s.encode('ISO-8859-1').force_encoding('UTF-8')
  repaired.valid_encoding? ? repaired : s
rescue EncodingError
  # Covers UndefinedConversionError and InvalidByteSequenceError alike.
  s
end
22+
23+
# Counters reported in the summary line at the end of the run.
fixed_q = 0
fixed_a = 0
errors = 0

# Walk every clue, compute repaired question/answer text, and write back only
# the columns that actually changed. A failure on one clue is logged and
# counted, and the run continues with the next clue.
Clue.find_each do |clue|
  updates = {}

  # Questions carry plain mojibake. A missing question is skipped rather than
  # treated as an error, so the answer of the same clue can still be repaired.
  question = clue.question
  if question
    q_fixed = fix_mojibake(question)
    updates[:question] = q_fixed if q_fixed != question && q_fixed.valid_encoding?
  end

  # Answers were stored via .to_html, so HTML entities must be decoded before
  # the mojibake repair. A missing answer is skipped so it cannot discard a
  # question fix already queued above.
  answer = clue.answer
  if answer
    a_fixed = fix_mojibake(CGI.unescapeHTML(answer))
    updates[:answer] = a_fixed if a_fixed != answer && a_fixed.valid_encoding?
  end

  next if updates.empty?

  clue.update_columns(updates)

  # Count only after the write succeeded, so the summary reflects what was
  # really stored.
  fixed_q += 1 if updates.key?(:question)
  fixed_a += 1 if updates.key?(:answer)
rescue => e
  errors += 1
  warn "Clue #{clue.id}: #{e}"
end

puts "Done. Fixed #{fixed_q} questions and #{fixed_a} answers (#{errors} errors)."

0 commit comments

Comments
 (0)