Skip to content

Commit ca90a33

Browse files
authored
Merge pull request #1781 from next-l/tika-container
Tikaのコンテナを追加
2 parents 7d327a2 + b95ecb4 commit ca90a33

File tree

8 files changed: 37 additions and 17 deletions.

.env.template

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,8 @@ REDIS_URL=redis://redis/enju-leaf-${RAILS_ENV}
1111

1212
SOLR_URL=http://solr:8983/solr/enju_leaf_${RAILS_ENV}
1313

14+
TIKA_URL=http://tika:9998
15+
1416
MINIO_ROOT_USER=enju
1517
MINIO_ROOT_PASSWORD=password
1618

@@ -21,11 +23,12 @@ ENJU_LEAF_TIME_ZONE=Asia/Tokyo
2123
ENJU_LEAF_STORAGE_BUCKET=enju-leaf
2224
ENJU_LEAF_STORAGE_ENDPOINT=http://minio:9000
2325
ENJU_LEAF_ACTION_MAILER_DELIVERY_METHOD=test
26+
ENJU_LEAF_EXTRACT_TEXT=
2427
# ENJU_LEAF_RESOURCESYNC_BASE_URL=http://localhost:8080
2528

2629
CANTALOUPE_BASE_URI=http://localhost:8182
2730

28-
no_proxy=localhost,webpacker,minio,solr
31+
no_proxy=localhost,webpacker,minio,solr,tika
2932
NO_PROXY=${no_proxy}
3033

3134
UID=1000

.env.test

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,11 @@ DATABASE_URL=postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}/e
55

66
SOLR_URL=http://localhost:8983/solr/enju_leaf_test
77

8+
TIKA_URL=http://localhost:9998
9+
810
MINIO_ROOT_USER=enju
911
MINIO_ROOT_PASSWORD=password
1012

1113
ENJU_LEAF_DEFAULT_LOCALE=en
1214
ENJU_LEAF_TIME_ZONE=UTC
15+
ENJU_LEAF_EXTRACT_TEXT=true

.github/workflows/rubyonrails.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ jobs:
3434
bundler-cache: true
3535
# Add or replace test runners here
3636
- name: Start containers
37-
run: cp .env.template .env && docker-compose up -d solr
37+
run: cp .env.template .env && docker-compose up -d solr tika
3838
- name: Setup Code Climate test-reporter
3939
run: |
4040
curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter

app/models/manifestation.rb

Lines changed: 11 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -235,7 +235,7 @@ class Manifestation < ApplicationRecord
235235
end
236236
after_create :clear_cached_numdocs
237237
after_destroy :index_series_statement
238-
after_save :index_series_statement, :extract_text!
238+
after_save :index_series_statement
239239
after_touch do |manifestation|
240240
manifestation.index
241241
manifestation.index_series_statement
@@ -367,24 +367,21 @@ def self.pickup(keyword = nil, current_user = nil)
367367
end
368368

369369
def extract_text
370-
return nil unless attachment.attached?
371-
return nil unless ENV['ENJU_EXTRACT_TEXT'] == 'true'
370+
return unless attachment.attached?
371+
return unless ENV['ENJU_LEAF_EXTRACT_TEXT'] == 'true'
372372

373-
client = Faraday.new(url: ENV['SOLR_URL'] || Sunspot.config.solr.url) do |conn|
374-
conn.request :multipart
373+
client = Faraday.new(url: ENV['TIKA_URL'] || 'http://tika:9998') do |conn|
375374
conn.adapter :net_http
376375
end
377-
response = client.post('update/extract?extractOnly=true&wt=json&extractFormat=text') do |req|
378-
req.headers['Content-type'] = 'text/html'
379-
req.body = attachment.download
376+
377+
response = client.put('/tika/text') do |req|
378+
req.headers['Content-Type'] = attachment.content_type
379+
req.headers['Content-Length'] = attachment.byte_size.to_s
380+
req.body = Faraday::UploadIO.new(StringIO.new(attachment.download), attachment.content_type)
380381
end
381-
update_column(:fulltext, JSON.parse(response.body)[""])
382-
end
383382

384-
def extract_text!
385-
extract_text
386-
index
387-
Sunspot.commit
383+
payload = JSON.parse(response.body)['X-TIKA:content'].strip.tr("\t", " ").gsub(/\r?\n/, "")
384+
payload
388385
end
389386

390387
def created(agent)

docker-compose.yml

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,8 @@ networks:
1212
x-app: &app
1313
env_file:
1414
- .env
15+
environment:
16+
- no_proxy: localhost,web,workers,tika
1517
build:
1618
context: .
1719
args:
@@ -168,3 +170,11 @@ services:
168170
interval: 30s
169171
timeout: 20s
170172
retries: 3
173+
174+
tika:
175+
image: apache/tika:2.6.0.1
176+
ports:
177+
- 127.0.0.1:9998:9998
178+
restart: always
179+
networks:
180+
internal:

spec/controllers/manifestations_controller_spec.rb

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -238,6 +238,7 @@ def valid_attributes
238238
manifestation = FactoryBot.create(:manifestation, description: "foo")
239239
periodical.derived_manifestations << manifestation
240240
periodical.save!
241+
manifestation.save!
241242
get :index, params: { query: "foo" }
242243
manifestations = assigns(:manifestations)
243244
expect(manifestations).not_to be_blank

spec/models/loc_search_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
expect(manifestation.original_title).to eq "Everything is miscellaneous : the power of the new digital disorder"
1111
expect(manifestation.manifestation_content_type.name).to eq "text"
1212
expect(manifestation.carrier_type.name).to eq "volume"
13-
expect(manifestation.publishers.size).to eq 1
13+
expect(manifestation.publishers.count).to eq 1
1414
expect(manifestation.publishers.first.full_name).to eq "Times Books"
1515
expect(manifestation.publication_place).to eq "New York"
1616
expect(manifestation.creators.size).to eq 1

spec/models/manifestation_spec.rb

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -273,6 +273,12 @@
273273
expect(manifestations(:manifestation_00101).reservable?).to be_falsy
274274
end
275275
end
276+
277+
it 'should extract fulltext' do
278+
manifestation = FactoryBot.create(:manifestation)
279+
manifestation.attachment.attach(io: File.open("spec/fixtures/files/resource_import_file_sample1.tsv"), filename: 'sample.txt')
280+
expect(manifestation.extract_text).to match(/資料ID/)
281+
end
276282
end
277283

278284
# == Schema Information

0 commit comments

Comments
 (0)