Skip to content

Commit d676c18

Browse files
authored
Merge pull request #56 from dkinzer/allow-harvest-with-until-param
Allow for passing until param.
2 parents 8410bed + af6fa62 commit d676c18

File tree

8 files changed

+94
-30
lines changed

8 files changed

+94
-30
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
/pkg/
22
/doc/
33
/coverage*
4+
/tmp/
45
/.yardoc/
5-
6+
.DS_Store
67
# Exclude Gemfile.lock (best practice for gems)
78
Gemfile.lock
89

Rakefile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,13 @@ namespace :test do
3434
t.warning = false
3535
end
3636

37+
Rake::TestTask.new('harvester') do |t|
38+
t.libs << ['lib', 'test/harvester']
39+
t.pattern = 'test/harvester/tc_*.rb'
40+
#t.verbose = true
41+
t.warning = false
42+
end
43+
3744
Rake::TestTask.new('provider') do |t|
3845
t.libs << ['lib', 'test/provider']
3946
t.pattern = 'test/provider/tc_*.rb'

bin/oai

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55

66
require 'optparse'
77

8-
DIRECTORY_LAYOUT = "%Y/%m".freeze
9-
108
require 'oai/harvester'
119

1210
include OAI::Harvester

lib/oai/harvester.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
require 'fileutils'
88
require 'ostruct'
99
require 'readline'
10-
require 'chronic'
1110
require 'socket'
1211

1312
require 'oai/client'

lib/oai/harvester/harvest.rb

Lines changed: 34 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,16 @@
33

44
module OAI
55
module Harvester
6-
76
class Harvest
7+
DIRECTORY_LAYOUT = "%Y/%m".freeze
88

9-
def initialize(config = nil, directory = nil, date = nil)
9+
def initialize(config = nil, directory = nil, date = nil, to = nil)
1010
@config = config || Config.load
1111
@directory = directory || @config.storage
1212
@from = date
1313
@from.freeze
14+
@until = to
15+
@until.freeze
1416
@parser = defined?(XML::Document) ? 'libxml' : 'rexml'
1517
end
1618

@@ -30,7 +32,11 @@ def start(sites = nil, interactive = false)
3032

3133
def harvest(site)
3234
opts = build_options_hash(@config.sites[site])
33-
harvest_time = Time.now.utc
35+
if @until
36+
harvest_time = @until.to_time.utc
37+
else
38+
harvest_time = Time.now.utc
39+
end
3440

3541
if "YYYY-MM-DD" == granularity(opts[:url])
3642
opts[:until] = harvest_time.strftime("%Y-%m-%d")
@@ -43,22 +49,27 @@ def harvest(site)
4349
# Allow a from date to be passed in
4450
opts[:from] = earliest(opts[:url]) unless opts[:from]
4551
opts.delete(:set) if 'all' == opts[:set]
46-
4752
begin
4853
# Connect, and download
4954
file, records = call(opts.delete(:url), opts)
5055

51-
# Move document to storage directory
52-
dir = File.join(@directory, date_based_directory(harvest_time))
53-
FileUtils.mkdir_p dir
54-
FileUtils.mv(file.path,
55-
File.join(dir, "#{site}-#{filename(Time.parse(opts[:from]),
56-
harvest_time)}.xml.gz"))
56+
# Move document to storage directory if configured
57+
if @directory
58+
directory_layout = @config.layouts[site] if @config.layouts
59+
dir = File.join(@directory, date_based_directory(harvest_time, directory_layout))
60+
FileUtils.mkdir_p dir
61+
FileUtils.mv(file.path,
62+
File.join(dir, "#{site}-#{filename(Time.parse(opts[:from]),
63+
harvest_time)}.xml.gz"))
64+
else
65+
puts "no configured destination for temp file" if @interactive
66+
end
5767
@config.sites[site]['last'] = harvest_time
58-
rescue
59-
raise $! unless $!.respond_to?(:code)
60-
raise $! if not @interactive || "noRecordsMatch" != $!.code
61-
puts "No new records available"
68+
rescue OAI::NoMatchException
69+
puts "No new records available" if @interactive
70+
rescue OAI::Exception => ex
71+
raise ex if not @interactive
72+
puts ex.message
6273
end
6374
end
6475

@@ -69,15 +80,15 @@ def call(url, opts)
6980
records = 0;
7081
client = OAI::Client.new(url, :parser => @parser)
7182
provider_config = client.identify
72-
83+
7384
file = Tempfile.new('oai_data')
7485
gz = Zlib::GzipWriter.new(file)
7586
gz << "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"
7687
gz << "<records>"
7788
begin
7889
response = client.list_records(options)
79-
get_records(response.doc).each do |rec|
80-
gz << rec
90+
response.each do |rec|
91+
gz << rec._source
8192
records += 1
8293
end
8394
puts "#{records} records retrieved" if @interactive
@@ -89,8 +100,8 @@ def call(url, opts)
89100
puts "\nresumption token recieved, continuing" if @interactive
90101
response = client.list_records(:resumption_token =>
91102
response.resumption_token)
92-
get_records(response.doc).each do |rec|
93-
gz << rec
103+
response.each do |rec|
104+
gz << rec._source
94105
records += 1
95106
end
96107
puts "#{records} records retrieved" if @interactive
@@ -118,8 +129,9 @@ def build_options_hash(site)
118129
options
119130
end
120131

121-
def date_based_directory(time)
122-
"#{time.strftime(DIRECTORY_LAYOUT)}"
132+
def date_based_directory(time, directory_layout = nil)
133+
directory_layout ||= Harvest::DIRECTORY_LAYOUT
134+
"#{time.strftime(directory_layout)}"
123135
end
124136

125137
def filename(from_time, until_time)
@@ -147,4 +159,4 @@ def earliest(url)
147159
end
148160

149161
end
150-
end
162+
end

lib/oai/harvester/logging.rb

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,9 @@ class Harvest
1111
def initialize(*args)
1212
orig_init(*args)
1313
@summary = []
14-
@logger = Logger.new(File.join(@config.logfile, "harvester.log"),
15-
shift_age = 'weekly') if @config.logfile
14+
@logger = @config.logfile ? Logger.new(File.join(@config.logfile, "harvester.log"), 'weekly') : Logger.new(STDOUT)
1615
@logger.datetime_format = "%Y-%m-%d %H:%M"
17-
16+
1817
# Turn off logging if no logging directory is specified.
1918
@logger.level = Logger::FATAL unless @config.logfile
2019
end
@@ -25,7 +24,7 @@ def start(sites = nil, interactive = false)
2524
orig_start(sites)
2625
begin
2726
OAI::Harvester::
28-
Mailer.send(@config.mail_server, @config.email, @summary)
27+
Mailer.send(@config.mail_server, @config.email, @summary) if @config.email
2928
rescue
3029
@logger.error { "Error sending out summary email: #{$!}"}
3130
end

test/harvester/tc_harvest.rb

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
require 'test_helper_harvester'
2+
3+
class HarvestTest < Test::Unit::TestCase
4+
ONE_HOUR = 3600
5+
EARLIEST_FIXTURE = "1998-05-02T04:00:00Z"
6+
LATEST_FIXTURE = "2005-12-25T05:00:00Z"
7+
def test_harvest
8+
until_value = Time.now.utc - ONE_HOUR
9+
config = OpenStruct.new(sites: { 'test' => { 'url' => 'http://localhost:3333/oai' }})
10+
OAI::Harvester::Harvest.new(config).start
11+
last = config.sites.dig('test', 'last')
12+
assert_kind_of Time, last
13+
assert last >= (until_value + ONE_HOUR), "#{last} < #{(until_value + ONE_HOUR)}"
14+
end
15+
16+
def test_harvest_from_last
17+
from_value = Time.parse(LATEST_FIXTURE).utc
18+
now = Time.now.utc
19+
config = OpenStruct.new(sites: { 'test' => { 'url' => 'http://localhost:3333/oai' }})
20+
OAI::Harvester::Harvest.new(config, nil, from_value).start
21+
last = config.sites.dig('test', 'last')
22+
assert last >= now, "#{last} < #{now}"
23+
end
24+
25+
def test_harvest_after_last
26+
from_value = Time.parse(LATEST_FIXTURE).utc + 1
27+
config = OpenStruct.new(sites: { 'test' => { 'url' => 'http://localhost:3333/oai' }})
28+
OAI::Harvester::Harvest.new(config, nil, from_value).start
29+
last = config.sites.dig('test', 'last')
30+
assert_kind_of NilClass, last
31+
end
32+
33+
def test_harvest_with_until
34+
until_value = Time.parse(EARLIEST_FIXTURE).utc + ONE_HOUR
35+
config = OpenStruct.new(sites: { 'test' => { 'url' => 'http://localhost:3333/oai' }})
36+
OAI::Harvester::Harvest.new(config, nil, nil, until_value).start
37+
last = config.sites.dig('test', 'last')
38+
assert_kind_of Time, last
39+
assert_equal last, until_value
40+
end
41+
end
42+

test/harvester/test_helper_harvester.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
require 'oai'
2+
require 'oai/harvester'
3+
require 'test/unit'
4+
5+
require File.dirname(__FILE__) + '/../client/helpers/provider'
6+
require File.dirname(__FILE__) + '/../client/helpers/test_wrapper'

0 commit comments

Comments
 (0)