# traject_sample/index.rb (traject/traject_sample, master branch)
# Traject Example, 2013.09.24
# Bill Dueber
#
# A more-or-less real life example of a configuration file for indexing marc
# records.
#
# A different take on most of this stuff (and hence worth taking your
# time to look at) is packaged with traject at
# https://github.com/traject-project/traject/blob/master/test/test_support/demo_config.rb
#
# Finally, the full configuration used for HathiTrust and the University of Michigan, including some
# Plain Old Ruby Objects needed to compute a variety of HT-specific rights
# information, is at http://github.com/billdueber/ht_traject
# That will be a bit more confusing, but also a more realistic take on how we
# deal with some legacy issues and so on.


######################################################
######## Set the load path and load up macros ########
######################################################


# I like to keep my local files under 'lib'. Adding this will also
# allow Traject::TranslationMap to find files in
# './lib/translation_maps/'

$:.unshift  "#{File.dirname(__FILE__)}/lib"


# Pull in the standard marc21 semantics, to get stuff like
# 'marc_sortable_title'. 'marc_publication_date', etc.
require 'traject/macros/marc21_semantics'
extend  Traject::Macros::Marc21Semantics

# Ditto with the opinionated format classifier;
# this gives you the 'marc_formats' macro
require 'traject/macros/marc_format_classifier'
extend Traject::Macros::MarcFormats


# Provide settings for readers, writers, etc.
settings do

  # Reader: the stock Ruby MARC reader, used here for binary MARC.
  # Alternatives (the docs have more detail on each):
  #  * "Traject::Marc4JReader" (java's MARC4J -- can be very fast with MARC-XML)
  #  * "Traject::AlephSequential" (see https://github.com/traject-project/traject_alephsequential_reader)
  #  * "Traject::NDJReader" (newline-delimited JSON, in the marc-in-json format)
  provide("reader_class_name", "Traject::MarcReader")

  # For MARC-XML input, uncomment the line below. The setting is
  # recognized by both Traject::MarcReader and Traject::Marc4JReader.
  #
  #   provide "marc_source.type", "xml"

  # Writer: deliberately not set here. The default is the Solr-JSON writer,
  # which writes to a suitably-configured Solr installation. To use a
  # different one, either include (via '-c') one of the writer configuration
  # files in writer/ (say, writer/json.rb), or pass the writer's class name
  # with the --writer command-line option (e.g., Traject::DebugWriter).

  # Progress logging: emit a line with timings every 1k records.
  # Adjust to taste.
  provide("log.batch_progress", 1000)

end


# It's a good idea to output exactly what you're running, so you know
# you're using the versions of java and jruby that you think you are.
#
# Highlights that you have access to a logger within this (and any)
# configuration file. Don't be afraid to use it.

logger.info(RUBY_DESCRIPTION) # record which ruby is actually running


#############################################
#############################################
######### The indexing rules ################
#############################################
#############################################
#
# A traject configuration file -- like this one -- is just
# a bunch of ruby code. Because of this, you can require files,
# set variables, compute logic based on environment variables,
# etc. -- whatever you need.
#
# Available to the configuration files are four methods:
# * logger, which you've already seen, holds a logger
# * settings, also seen above, which allows you to pass settings to
#   the traject process. Use of settings is not limited to a single
#   instance -- you can use it in every configuration file and override
#   stuff on the command line as well.
# * each_record(blk_or_lambda). each_record applies the code in the block or
#   lambda to every record, but doesn't send anything to the writer.
#   The block (or lambda) should take two arguments: the record, and
#   a Traject::Indexer::Context object (see below)
# * to_field 'field_name' block (or to_field("field_name", lambda)). The
#   block or lambda takes three arguments:
#   - the record
#   - an "accumulator", a ruby array-like into which you stuff values that
#     should be associated with the field_name. Generally you'll be putting
#     things into the accumulator, but you can also delete stuff from it,
#     transform it with #map! and its ilk, and basically do anything you want.
#     At the moment your block/proc exits, though, every non-nil value (if any)
#     currently in the accumulator will be stored in the context
#     as context.output_hash[field_name]
#   - A context (Traject::Indexer::Context) object. It provides a few conveniences:
#     - context.clipboard : a hash into which you can put intermediate values
#     - context.output_hash : the hash of field/value pairs that will eventually
#       be sent to the configured writer. You can mess with this to achieve
#       arbitrary side-effect, although you don't want to make your code
#       too opaque.
#     - the method #skip! , which basically says, "yeah, let's stop processing
#       this one and never send anything to the writer."
#
# It's entirely possible to never use the context (which is always optional
# to pass into the proc object).


################################
###### Setup ###################
################################

# each_record performs an action on every record/context pair that comes through,
# but doesn't set a field value or send anything to the writer.

# Here, I'm using it to set up a place on the clipboard where I can
# stick stuff if I need to, knowing that it's not going to interfere
# with anything done by a macro or anything.
#


# Give every record a fresh, private scratch hash on the clipboard.
each_record { |_rec, context| context.clipboard[:mysuff] = {} }


################################
###### CORE FIELDS #############
################################

# 'to_field' takes a name and either a block or a lambda, as described above.
# A macro (in this case, 'extract_marc') is simply some code that returns an
# appropriate lambda that does the work that you want.
#
# You're going to want to look at the docs for extract_marc for the full
# syntax.

# Note that we only want one id, so we'll take the first one
to_field 'id', extract_marc('001', first: true)


# Wait, what was that? extract_marc is a ruby method that returns
# a lambda expression (e.g., an anonymous function). We build up
# the lambda we want based on the arguments (001, just the first one) and
# it gets run every time we're trying to construct the 'id' field to
# send to solr.


# Many of us want to store the actual, original marc record
# in solr somewhere, in some format.

# Save binary marc, if that's your thing
# to_field 'fullrecord', serialized_marc(:format=>'binary')

# Or JSON
# to_field 'fullrecord', serialized_marc(:format=>'json')

# Again, note that whatever happens, all that matters is that the value
# you want gets added to the accumulator.

to_field 'fullrecord', serialized_marc(format: 'json')


# Another handy macro: collect the values of every field
# from 100 through 999.
to_field 'allfields', extract_all_marc_values(from: '100', to: '999')


################################
###### BAILING OUT #############
################################

# At any point during the indexing process, you can call
# context.skip!(log_message) to bail out of this record.
#
# Let's pretend we've got records in our file with temporary IDs
# that start with 'TEMP' (for, say, in-process records). We can
# skip out on them at any time. Indexing immediately stops for that
# record (i.e., no more indexing steps will run) and it never gets
# sent to the writer.
#
# This also shows how we can dip into the context.output_hash to
# get at stuff that has already been indexed, instead of re-doing the
# work.

# Skip in-process records: stop indexing (and never write) any record whose
# already-indexed id starts with 'TEMP'.
#
# Values stored in context.output_hash are arrays (whatever was left in the
# accumulator for that field), so take the first element before testing it.
# Matching the array itself against a regexp never succeeds, which would
# mean the skip silently never happens.
each_record do |rec, context|
  id = Array(context.output_hash['id']).first
  context.skip!("Skipped temp record #{id}") if id.to_s.start_with?('TEMP')
end


################################
######## IDENTIFIERS ###########
################################


# Get the OCLC numbers (as defined in traject/macros/marc21_semantics.rb)
# I want to let people find them in the 035z, too, but you may not.
to_field('oclc', oclcnum('035a:035z'))

# You can do the same sort of thing "by hand", like this.
# Find 035a that start with the string 'sdr'
# This is just a regular ruby assignment to a regular variable
# \A anchors at the very start of the value; ^ would also match after any
# embedded newline, which is not what "starts with" means.
sdr_pattern = /\Asdr-/

# Now we do our logic. The block takes a record and an "accumulator", and
# whatever is in the accumulator at the end of the block is what gets assigned
# to the field name in the eventual output hash.
to_field 'sdrnum' do |record, acc|
  oh35a_spec = Traject::MarcExtractor.cached('035a') # use #cached, not #new
  acc.concat oh35a_spec.extract(record).grep(sdr_pattern) # only keep the ones that match the pattern
end


# Get both 10- and 13-character ISBNs
# StdNum::ISBN.allNormalizedValues(v) returns both forms of the ISBN
# passed, or `nil` if the thing passed didn't validate as an ISBN.
# In the latter case, just put in whatever was passed, because maybe
# it's useful to someone.
#
# Again, note that this is just Ruby inside the to_field call.

require 'library_stdnums'

# Index every 020a/020z value as an ISBN, in both 10- and 13-character form
# where possible; values that can't be normalized are kept verbatim.
to_field 'isbn' do |record, acc|
  isbn_spec = Traject::MarcExtractor.cached('020az', :separator=>nil)

  vals = []
  isbn_spec.extract(record).each do |v|
    # allNormalizedValues can hand back nil for something that does not
    # validate as an ISBN; Array() turns that into an empty list so the
    # emptiness check below is always safe. compact drops any nil entry.
    std = Array(StdNum::ISBN.allNormalizedValues(v)).compact
    if std.empty?
      vals << v # not a recognizable ISBN -- keep what we were given
    else
      vals.concat std
    end
  end
  vals.uniq! # If it already has both a 10 and a 13, each will have generated the other
  acc.concat vals
end

# ISSNs are easier.
to_field 'issn', extract_marc('022a:022l:022m:022y:022z:247x')

# extract_marc also accepts an array of field specifications (each element
# may itself still be colon-delimited), which keeps long lists readable.

to_field 'isn_related', extract_marc([
  '400x:410x:411x:440x:490x',
  '500x:510x:534xz:556z:581z',
  '700x:710x:711x:730x',
  '760x:762x:765xz:767xz',
  '770xz:772x:773xz:774xz:775xz:776xz:777x',
  '780xz:785xz:786xz:787xz'
])

to_field 'sudoc',  extract_marc('086az')
to_field 'lccn',   extract_marc('010a')
to_field 'rptnum', extract_marc('088a')

################################
######### AUTHOR FIELDS ########
################################

to_field 'mainauthor', extract_marc('100abcd:110abcd:111abc')
to_field 'author',     extract_marc('100abcd:110abcd:111abc:700abcd:710abcd:711abc')
to_field 'author2',    extract_marc('110ab:111ab:700abcd:710ab:711ab')


# A sort field can hold only one value, hence first: true
to_field 'authorSort', extract_marc('100abcd:110abcd:111abc:110ab:700abcd:710ab:711ab', first: true)

# Additional author values, for searching and for bumping relevancy.
# (Adjacent string literals are joined into one spec string.)
to_field 'author_top', extract_marc(
  '100abcdefgjklnpqtu0:110abcdefgklnptu04:111acdefgjklnpqtu04:' \
  '700abcdejqux034:710abcdeux034:711acdegjnqux034:' \
  '720a:765a:767a:770a:772a:774a:775a:776a:777a:' \
  '780a:785a:786a:787a:245c'
)
to_field 'author_rest', extract_marc('505r')


################################
########## TITLES ##############
################################

# For titles, we mostly want with and without non-filing characters
# so we can boost exact phrase searching.
to_field 'title',    extract_marc_filing_version('245abdefghknp', include_original: true)
to_field 'title_a',  extract_marc_filing_version('245a', include_original: true)
to_field 'title_ab', extract_marc_filing_version('245ab', include_original: true)
to_field 'title_c',  extract_marc('245c')

# For vernacular title (which I want separate for a variety of
# reasons), I want to  make sure I specify :only alternate_scripts
#
# The :alternate_script argument tells it to look for alternate scripts in the 880
# fields, so you don't have to write up all that logic yourself.

to_field 'vtitle', extract_marc('245abdefghknp', alternate_script: :only, trim_punctuation: true)

# Sortable title, courtesy of the provided marc_sortable_title macro
to_field 'titleSort', marc_sortable_title


# We have lots of "title" fields in order to give different relevancy weights to them.
to_field 'title_top', extract_marc(
  '240adfghklmnoprs0:245abfghknps:247abfghknps:' \
  '111acdefgjklnpqtu04:130adfghklmnoprst0'
)
to_field 'title_rest', extract_marc([
  '210ab:222ab:242abhnpy:243adfghklmnoprs:246abdenp:247abdenp',
  '700fghjklmnoprstx03:710fghklmnoprstx03:711acdefghjklnpqstux034',
  '730adfghklmnoprstx03:740ahnp:765st:767st',
  '770st:772st:773st:775st:776st:777st:780st:785st:786st:787st',
  '830adfghklmnoprstv',
  '440anpvx:490avx:505t'
])

to_field 'series',  extract_marc('440ap:800abcdfpqt:830ap')
to_field 'series2', extract_marc('490a')

####################################
#### Callnumber / LCSH #############
####################################

to_field 'callnumber',    extract_marc('050ab:090ab')
to_field 'broad_subject', marc_lcc_to_broad_category

###############################
##### Location ################
###############################

# Geo faceting comes from a macro that ships with traject
to_field 'geo', marc_geo_facet

# Naive country of publication stuff. Included here because it shows:
#  * how to use a translation map
#  * that we can rely on indexing steps happening in order
#  * how to dig into the context object if you need to

# First, I'll just get the country codes out of the 008, which doesn't
# necessarily catch everywhere the country might be indicated
# Pull the raw codes out of the 008, then tidy up the accumulator in place.
to_field 'lousy_country_code', extract_marc('008[15-17]:008[17]') do |rec, acc, context|
  # Drop nils first: calling #strip on a nil would raise before a later
  # compact! ever got the chance to remove it.
  acc.compact!
  # remove surrounding spaces and ditch the empties
  acc.map!(&:strip)
  acc.reject!(&:empty?)
end

# Now, a first shot at getting the country names would be to do this:
to_field('easy_lousy_country_name',
         extract_marc('008[15-17]:008[17]', trim_punctuation: true, translation_map: 'sample/country_map'))


# ...but that doesn't work the way you'd hope because it
# trims the punctuation *after* applying the translation
# map. That means a country code like 'io ' won't map
# to Indonesia using our map.
#
# So, we do it by hand


# Now, use those values to compute something else with them.
to_field 'lousy_country_name' do |_rec, acc, context|
  # Fetch the code -> name translation map.
  #
  # './lib' was put on the ruby load path near the top of this file, so
  # Traject::TranslationMap will also look inside 'lib/translation_maps'
  # for translation files.
  #
  # The map lives in a 'sample' subdirectory of lib/translation_maps purely
  # as a namespace. Nothing requires that, but it avoids collisions: it's
  # easy to imagine many traject macros wanting a translation file called
  # 'format' or the like.
  #
  # The map could be built once, outside the to_field call. Finding and
  # loading the file from disk is cached, though, so building it in here
  # is cheap, and keeping it inside the block reads better.
  country_tmap = Traject::TranslationMap.new('sample/country_map')

  # Reuse whatever the earlier 'lousy_country_code' step left in the
  # output hash (possibly nothing at all) and translate each code,
  # keeping only the codes the map knows about.
  codes = context.output_hash['lousy_country_code'] || []
  codes.each do |code|
    name = country_tmap[code]
    acc << name if name
  end
end


################################
########### MISC ###############
################################

# Once again, take advantage of traject-supplied macros
to_field 'pubdate', marc_publication_date
to_field 'format',  marc_formats


# Publisher: be sure to include the RDA-style 264 with
# second indicator = 1
to_field 'publisher', extract_marc('260b:264|*1|b:533c')
to_field 'edition',   extract_marc('250a')

to_field 'era', marc_era_facet

# Control-field and variable-field specs mix freely in one spec string
to_field 'language', marc_languages('008[35-37]:041a:041d:041e:041j')

# Various librarians like to have the actual 008 language code around
to_field 'language008', extract_marc('008[35-37]') do |_rec, acc|
  # Keep only values with at least one non-space character
  acc.select! { |code| code =~ /\S/ }
end


# A clearly misguided attempt to find editors. Included here
# because it's still a pretty good example of a complex routine.
#
# Note how we have to use things that actually affect the
# accumulator itself (#reject!, #map!, #compact!, #replace, etc.) since we
# can't assign to the accumulator. We need to actually
# affect the array to which it points in situ.
#
# tldr: never do acc = acc.map{|x| ...}. It won't work.

to_field 'editor', extract_marc('245c') do |_record, accumulator, _context|
  editor_pattern = /edited by (.+?)(;|\Z)/i
  name_separator = /(?:\sand\s|\s&\s)/

  editors = []
  accumulator.each do |statement|
    # Values that don't mention editors simply fail to match and are dropped
    found = editor_pattern.match(statement)
    next unless found

    # Pull out the editors (well, some of them, anyway), splitting the
    # captured text on 'and' or '&'
    editors.concat(found[1].split(name_separator))
  end

  # Swap the accumulator's contents in place; merely reassigning the
  # block-local variable would have no effect on the real accumulator.
  accumulator.replace(editors)
end