|
| 1 | +#!/usr/bin/ruby |
| 2 | + |
| 3 | +require "open-uri" |
| 4 | +require 'uri' |
| 5 | +require 'yaml' |
| 6 | + |
# Lookup table from ProGit v1 sections to ProGit v2 "cs_numbers"
# (i.e. `<chapter>.<section>` of the _ProGit v2_ book).
#
# It is indexed as `$v1_to_v2[v1_chapter][v1_section]`. Both v1 chapters and
# v1 sections are numbered from 1, so index 0 holds a `nil` placeholder at
# both levels; e.g. `$v1_to_v2[4][9]` is the v2 target of v1 section 4.9.
#
# Each row below lists the v2 targets of one v1 chapter, in section order;
# the `nil` padding is added afterwards.
$v1_to_v2 = [
  %w[1.1 1.2 1.3 1.5 1.6 1.7 1.8],
  %w[2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8],
  %w[3.1 3.2 3.3 3.4 3.5 3.6 3.7],
  %w[4.1 4.2 4.3 4.4 4.9 4.9 4.9 4.9 4.5 4.9 4.10],
  %w[5.1 5.2 5.3 5.4],
  %w[7.1 7.2 7.3 7.6 7.10 7.11 7.1 7.15],
  %w[8.1 8.2 8.3 8.4 8.5],
  %w[9.1 9.2 9.3],
  %w[10.1 10.2 10.3 10.4 10.5 10.6 10.7 10.9]
].map { |v2_targets| [nil, *v2_targets] }.unshift(nil)

# Per-language result: language code => { v2 cs_number => [v1 page slugs] }.
$mapping = {}
# Language codes of the translations discovered on the English TOC page.
$translations = []
| 30 | + |
# Builds the ProGit v1 -> v2 mapping data for one translation.
#
# Loads the archived v1 table-of-contents page for +language+ and then:
#
# * for "en" only, appends every translation code linked after the
#   "This book is translated into" marker to `$translations`;
# * sets `$mapping[language]` to a hash from ProGit v2 "cs_number" to the
#   list of decoded v1 page slugs found under that chapter/section.
#
# @param language [String] language code as used in the v1 book URLs
# @return [Array<String>] the lines of the TOC page that were parsed
def retrieve_mapping(language)
  puts "Retrieving TOC for #{language}"
  lines = load_book_v1_toc(language).split("\n")

  $translations.concat(parse_book_v1_translations(lines)) if language == "en"
  $mapping[language] = parse_book_v1_sections(lines)
  lines
end

# Returns the HTML of the archived v1 TOC page for +language+, reading it
# from `cached.book-toc.<language>.html` in the current directory when that
# file exists and otherwise downloading it and writing that cache file.
def load_book_v1_toc(language)
  cached = "cached.book-toc.#{language}.html"
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  return File.read(cached) if File.exist?(cached)

  html = URI.parse("https://web.archive.org/web/20140109005424/http://git-scm.com/book/#{language}/").read
  File.write(cached, html)
  html
end

# Collects the last path component of every link that appears between a
# line containing "This book is translated into" and the next line
# containing "<hr" (the marker line itself is not scanned for links).
def parse_book_v1_translations(lines)
  inside = false
  lines.each_with_object([]) do |line, codes|
    if !inside
      inside = line.include?("This book is translated into")
    elsif (link = line.match(%r{<a href="[^"]*/([^"]+)">}))
      codes << link[1]
    elsif line.include?("<hr")
      inside = false
    end
  end
end

# Scans the lines between `<ol class="book-toc">` and the next `</div>` and
# returns a hash from v2 cs_number to the decoded v1 slugs linked under it.
#
# A chapter heading (`<h2>N.`) selects the v2 target of that chapter's first
# section, a line holding only `N.M` selects the target of that section, and
# "Index of Commands" selects "A3.1"; every book link is filed under the
# most recently selected target. Looks targets up in `$v1_to_v2`, so a
# chapter number outside that table raises NoMethodError.
def parse_book_v1_sections(lines)
  sections = {}
  inside = false
  cs_number = ""
  lines.each do |line|
    unless inside
      inside = line.include?('<ol class="book-toc">')
      next
    end
    if line.include?("</div>")
      inside = false
      next
    end

    if (chapter = line.match(/<h2>(\d+)\./))
      # map v1 to v2: a chapter link points at the chapter's first section
      cs_number = $v1_to_v2[chapter[1].to_i][1]
    elsif (section = line.match(/^\s*(\d+\.\d+)\s*$/))
      chapter_v1, section_v1 = section[1].split(".")
      # map v1 to v2
      cs_number = $v1_to_v2[chapter_v1.to_i][section_v1.to_i]
    elsif line.include?(">Index of Commands<")
      cs_number = "A3.1"
    end

    # The heading and its link may share a line, so always look for a link.
    link = line.match(%r{<a href="[^"]*://git-scm\.com/book/(?:[^"/]+/)?([^"/]+)"})
    (sections[cs_number] ||= []) << URI.decode_www_form_component(link[1], Encoding::UTF_8) if link
  end
  sections
end
| 87 | + |
# The English TOC is processed first because it is the page that lists the
# available translations; every translation found there is processed next.
retrieve_mapping("en")
$translations.each do |language|
  retrieve_mapping(language)
end

# Persist the combined per-language mapping as YAML.
File.write("data/book_v1.yml", YAML.dump($mapping))
0 commit comments