Skip to content

Commit 02d4c61

Browse files
committed
Download and parse HTML
1 parent 790bd86 commit 02d4c61

File tree

5 files changed

+4963
-39
lines changed

5 files changed

+4963
-39
lines changed

Rakefile

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,3 @@ require_relative "./lib/us_news_rankings"
66
RSpec::Core::RakeTask.new(:spec)
77

88
task :default => :spec
9-
10-
task :extract_tables do
11-
html_files = Dir.glob("web/rankings/education/**/*page*.html") # Dir.glob("web/rankings/**/*.html")
12-
puts "EXTRACTING RANKINGS TABLES FROM #{html_files.count} HTML PAGES"
13-
14-
html_files.each do |html_filepath|
15-
puts html_filepath
16-
17-
begin
18-
page = UsNewsRankings::Page.new(html_filepath)
19-
page.parse
20-
rescue UsNewsRankings::Page::InvalidTableError
21-
puts "...skipping"
22-
end
23-
end
24-
end

lib/us_news_rankings/category.rb

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,36 @@ def initialize(year)
66
@year = year
77
end
88

9+
# Placeholder that always raises; the message asks child classes to provide
# their own #source_urls. #pages iterates the returned collection and builds
# one page per element.
def source_urls
10+
raise "Oh, please implement #source_urls on the child class."
11+
end
12+
13+
# Placeholder that always raises; the message asks child classes to provide
# their own #html_dir.
def html_dir
14+
raise "Oh, please implement #html_dir on the child class."
15+
end
16+
17+
# Builds one page object per source URL.
#
# Pages are numbered from 1 in the order the URLs appear in #source_urls.
# The values are passed as real keyword arguments rather than a positional
# Hash: Page#initialize declares required keywords, and Ruby 3+ no longer
# converts a trailing Hash into keywords.
#
# @return [Array<UsNewsRankings::Page>]
def pages
  source_urls.each_with_index.map do |url, index|
    UsNewsRankings::Page.new(category: self, url: url, number: index + 1)
  end
end
24+
25+
# Rankings from every page, ordered by ascending numeric rank.
#
# The sorted list is stored in @rankings on first use, so repeated calls do
# not re-run #extract_rankings (which walks every page's table rows).
#
# @example [{school_name: "abc", rank: 1}, {school_name: "def", rank: 2}, {school_name: "xyz", rank: 3}]
# @return [Array<Hash>]
def rankings
  @rankings ||= extract_rankings.sort_by { |ranking| ranking[:rank].to_i }
end
29+
30+
# Collects a ranking hash for every ranked table row across all pages.
#
# Each row from Page#table_rows is wrapped in #ranking_class; rows whose
# ranking does not report #ranked? are dropped.
#
# @return [Array<Hash>] unsorted, in page order then row order
def extract_rankings
  pages.flat_map do |page|
    page.table_rows
        .map { |row| ranking_class.new(row) }
        .select(&:ranked?)
        .map(&:to_h)
  end
end

# Class used to wrap a single table row.
#
# Defaults to the law-clinical ranking class this method previously
# hard-coded; other categories can override it to plug in their own.
#
# @return [Class]
def ranking_class
  UsNewsRankings::Education::GraduateSchools::LawClinical::Ranking
end
1240
end
1341
end

lib/us_news_rankings/education/graduate_schools/law_clinical/category.rb

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,24 +16,8 @@ def source_urls
1616
SOURCE_URLS[year]
1717
end
1818

19-
def pages
20-
source_urls.map{|url| UsNewsRankings::Page.new(url) }
21-
end
22-
23-
# @example [{school_name: "abc", rank: 1}, {school_name: "def", rank: 2}, {school_name: "xyz", rank: 3}]
24-
def rankings
25-
@rankings || extract_rankings.sort_by{|ranking| ranking[:rank].to_i }
26-
end
27-
28-
def extract_rankings
29-
extracted_rankings = []
30-
pages.each do |page|
31-
page.table_rows.each do |row|
32-
ranking = UsNewsRankings::Education::GraduateSchools::LawClinical::Ranking.new(row)
33-
extracted_rankings << ranking.to_h if ranking.ranked?
34-
end
35-
end
36-
return extracted_rankings
19+
# Relative directory for this category's HTML files; Page#html_dir appends
# the category year to it.
def html_dir
20+
"./web/education/graduate_schools/law_clinical"
3721
end
3822
end
3923
end

lib/us_news_rankings/page.rb

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,42 @@
44

55
module UsNewsRankings
66
class Page
7-
attr_reader :url
7+
attr_reader :category, :url, :number
88

99
# @param url [String] the page's source url
10-
def initialize(url)
10+
# @param category [#html_dir, #year] the category this page belongs to; both methods are used to build the page's local path
# @param number [Integer] page number used in the local file name ("page<number>.html")
def initialize(category:, url:, number:)
11+
@category = category
1112
@url = url
13+
@number = number
14+
end
15+
16+
# Directory for this page's HTML file: the category's html_dir joined with
# the category's year (as a string).
def html_dir
17+
File.join(category.html_dir, category.year.to_s)
18+
end
19+
20+
# Path of this page's HTML file: "page<number>.html" inside #html_dir.
def html_filepath
21+
File.join(html_dir, "page#{number}.html")
22+
end
23+
24+
# Fetches the page from its URL and saves it as XHTML at #html_filepath.
#
# The markup is fetched and serialized BEFORE the file is touched. If the
# request raises, no empty file is left behind for #document to mistake for
# a valid cached copy. The whole document is saved, not just #table.
#
# @return [Integer] number of bytes written
def download_document
  markup = url_source.to_xhtml(indent: 2)
  FileUtils.mkdir_p(html_dir)
  File.write(html_filepath, markup)
end
30+
31+
# Fetches and parses the page from its source URL, once per instance.
#
# Uses URI.open (from open-uri, which the previous bare `open(url)` already
# depended on) instead of Kernel#open: Kernel#open no longer handles URLs on
# Ruby 3+, and it would execute a command for strings starting with "|".
# The block form closes the response stream after parsing. The log line is
# printed only when a request is actually made, not on memoized calls.
#
# @return [Nokogiri::HTML::Document]
def url_source
  @url_source ||= begin
    puts "REQUESTING URL SOURCE FROM #{url}"
    URI.open(url) { |response| Nokogiri::HTML(response) }
  end
end
1335

1436
# Parsed HTML for this page, read from the local copy at #html_filepath.
#
# If the local file does not exist yet it is downloaded first. The file is
# opened in block form so the handle is closed as soon as parsing finishes.
#
# @return [Nokogiri::HTML::Document]
def document
  download_document unless File.exist?(html_filepath)
  @document ||= File.open(html_filepath) { |file| Nokogiri::HTML(file) }
end
1841

1942
# First table found in #document, trying the selectors "table.ranking-data",
# "table.searchresult" and "table.flex-table" in that order. The match is
# kept in @table once one is found.
def table
20-
#raise "Oh, please implement #table in the child class"
2143
@table ||= (document.at_css("table.ranking-data") || document.at_css("table.searchresult") || document.at_css("table.flex-table"))
2244
end
2345

0 commit comments

Comments
 (0)