File tree Expand file tree Collapse file tree 2 files changed +52
-4
lines changed
Expand file tree Collapse file tree 2 files changed +52
-4
lines changed Original file line number Diff line number Diff line change 1515 gameIds = Array . new
1616 for i in args . arg1 . to_i ..args . arg2 . to_i
1717 seasonsUrl = 'http://j-archive.com/showseason.php?season=' +i . to_s
18- seasonList = Nokogiri ::HTML ( URI . open ( seasonsUrl ) )
18+ seasonList = Nokogiri ::HTML ( URI . open ( seasonsUrl ) . read , nil , 'UTF-8' )
1919 linkList = seasonList . css ( 'table td a' )
2020 linkList . each do |ll |
2121 href = ll . attr ( 'href' ) ;
2727
2828 gameIds . each do |gid |
2929 gameurl = 'http://www.j-archive.com/showgame.php?game_id=' +gid . to_s
30- game = Nokogiri ::HTML ( URI . open ( gameurl ) )
30+ game = Nokogiri ::HTML ( URI . open ( gameurl ) . read , nil , 'UTF-8' )
3131
3232 ## OK, we're going to do this twice, once for each round
3333 questions = game . css ( "#jeopardy_round .clue" )
6666 answerDiv = game . css ( "#" . concat ( id ) )
6767
6868 #=========== Set Answer =============
69- answermatch = /ponse">(.*)<\/ e/ . match ( answerDiv . to_html )
70- var_answer = answermatch . captures [ 0 ] . to_s
69+ var_answer = answerDiv . css ( 'em.correct_response' ) . first &.text ( ) . to_s
7170 var_question = q . css ( '.clue_text' ) . first . text ( )
7271 index = q . xpath ( 'count(preceding-sibling::*)' ) . to_i
7372 var_category = categoryArr [ index ]
Original file line number Diff line number Diff line change 1+ # Repair mojibake in clues imported before the Nokogiri UTF-8 fix.
2+ #
3+ # Root cause: Nokogiri parsed j-archive.com (UTF-8) as Latin-1, so each
4+ # multi-byte UTF-8 sequence was stored as individual Latin-1 codepoints.
5+ # Questions have plain mojibake ("ElysÃ©e"). Answers additionally have
6+ # HTML-entity-encoded mojibake ("ap&Atilde;&copy;ritif") because the
7+ # scraper used .to_html before extracting the text.
8+ #
9+ # Fix: for questions, re-encode as ISO-8859-1 bytes then force UTF-8.
10+ # for answers, HTML-unescape first, then same re-encoding.
11+ #
12+ # Run with:
13+ # cd /srv/jservice && rails runner script/fix_encoding.rb
14+ #
15+ require 'cgi'
16+
# Undo "UTF-8 bytes read as Latin-1" mojibake in a string.
#
# The characters of +s+ are mapped back to single ISO-8859-1 bytes and
# those bytes are then reinterpreted as UTF-8.
#
# @param s [String, nil] text that may contain mojibake
# @return [String, nil] the repaired text, or +s+ itself when no safe
#   repair exists: +s+ is nil, it contains a character with no
#   ISO-8859-1 byte, or the reinterpreted bytes are not valid UTF-8
#   (which is what happens to text that was already correct, e.g. "café")
def fix_mojibake(s)
  return s if s.nil?

  repaired = s.encode('ISO-8859-1').force_encoding('UTF-8')
  # Never hand back a broken string: if the bytes do not form valid UTF-8,
  # the input was not mojibake and must be left alone.
  repaired.valid_encoding? ? repaired : s
rescue EncodingError
  # Covers Encoding::UndefinedConversionError and the other conversion
  # failures, all of which are EncodingError subclasses.
  s
end
22+
# Walk every Clue and repair mojibake in its question and answer columns.
#
# The two columns are handled independently, so a nil value in one of them
# does not prevent the other from being repaired. The counters are only
# incremented after the write has succeeded, so a clue whose update raises
# is reported as an error and not also as a fix.
fixed_q = 0
fixed_a = 0
errors = 0

Clue.find_each do |clue|
  updates = {}

  # Questions hold plain mojibake: re-encode directly.
  question = clue.question
  unless question.nil?
    q_fixed = fix_mojibake(question)
    updates[:question] = q_fixed if q_fixed != question && q_fixed.valid_encoding?
  end

  # Answers were stored HTML-escaped, so unescape before re-encoding.
  answer = clue.answer
  unless answer.nil?
    a_fixed = fix_mojibake(CGI.unescapeHTML(answer))
    updates[:answer] = a_fixed if a_fixed != answer && a_fixed.valid_encoding?
  end

  next if updates.empty?

  clue.update_columns(updates)
  fixed_q += 1 if updates.key?(:question)
  fixed_a += 1 if updates.key?(:answer)
rescue => e
  # Best effort: report the failing clue and carry on with the rest.
  errors += 1
  warn "Clue #{clue.id} : #{e} "
end

puts "Done. Fixed #{fixed_q} questions and #{fixed_a} answers (#{errors} errors)."
You can’t perform that action at this time.
0 commit comments