Commit b7f6321

Merge pull request #452 from NASA-IMPACT/450-code-tune-up

Revert changes from health check and document classifier

2 parents: 5bbcbdc + cfa988b

File tree

9 files changed: +41, -278 lines changed


.gitignore

Lines changed: 3 additions & 5 deletions

@@ -286,13 +286,11 @@ sde_indexing_helper/media/
 **/.ipynb_checkpoints/
 **/*.xlsx

-
-# config details for the api access
+# Config details for the api access
 config_generation/config.py

-
-#model's inference files
+# Model's inference files
 Document_Classifier_inference/model.pt

-#Database Backup
+# Database backup
 backup.json
Lines changed: 9 additions & 9 deletions

@@ -1,18 +1,18 @@
 # Automated Document Tagging

-
 # Project Description:
-This purpose of this tag the content of a given url onto one of the five classes "Image","Documentation","Software and Tools",
-"Mission and Instruments", and "Data".

-#Datasets:
-Reference link for datasets: https://docs.google.com/spreadsheets/d/1rK7hvb_HRd-sqL3jrSYll5BiDvwnzQY2qVWDmpg6Bbk/edit#gid=1560325588
+This purpose of this is to tag the content of a given url onto one of the six classes "Image","Documentation","Software and Tools",
+"Mission and Instruments", "Training and Education", and "Data".

-# to run the repository:
-* location for saved model in drive: https://drive.google.com/drive/u/1/folders/1jkJSpN3ZuXhZIis4dSc-v0LkSV3pMrcs
-* saved weight_name: model.pt
-* prediction sample:python3 main.py predicts --config_file config.json --url "url_link"
+# Datasets:
+
+Reference link for datasets: https://docs.google.com/spreadsheets/d/1rK7hvb_HRd-sqL3jrSYll5BiDvwnzQY2qVWDmpg6Bbk/edit#gid=1560325588

+# To run the inference pipeline:

+- location for saved model in drive: https://drive.google.com/drive/u/1/folders/1jkJSpN3ZuXhZIis4dSc-v0LkSV3pMrcs
+- saved weight_name: model.pt
+- prediction sample: `python3 main.py predicts --config_file config.json --url "url_link"`

 For more details: contact [email protected]
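
The prediction command above drives the same classifier that the reverted Django view (in sde_collections/views.py below) invoked programmatically. A minimal sketch of that programmatic call, assuming only the signature visible in the reverted code (a config path plus a list of URLs, returning a {url: document_type} mapping and the list of URLs that served PDF responses); the example URL is hypothetical:

from Document_Classifier_inference.main import batch_predicts

# Hypothetical URLs whose document type should be inferred.
to_infer_url_list = ["https://example.nasa.gov/mission/overview.html"]

# Per the reverted view: returns a {url: document_type} dict plus the
# subset of URLs that resolved to PDF responses.
prediction, pdf_lists = batch_predicts(
    "Document_Classifier_inference/config.json", to_infer_url_list
)

for url in to_infer_url_list:
    print(url, prediction.get(url), url in pdf_lists)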

sde_collections/serializers.py

Lines changed: 0 additions & 4 deletions

@@ -40,8 +40,6 @@ class CandidateURLSerializer(serializers.ModelSerializer):
     generated_title_id = serializers.SerializerMethodField(read_only=True)
     match_pattern_type = serializers.SerializerMethodField(read_only=True)
     candidate_urls_count = serializers.SerializerMethodField(read_only=True)
-    inferenced_by = serializers.CharField(read_only=True)
-    is_pdf = serializers.BooleanField(required=False)

     def get_candidate_urls_count(self, obj):
         titlepattern = obj.titlepattern_urls.last()
@@ -69,8 +67,6 @@ class Meta:
             "document_type",
             "document_type_display",
             "visited",
-            "inferenced_by",
-            "is_pdf",
         )
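
A note on the surviving fields: a DRF SerializerMethodField is always read-only and resolves by calling get_<field_name>(self, obj) on the serializer, which is why get_candidate_urls_count pairs with candidate_urls_count above. A minimal sketch of the pattern; the serializer name and everything past the .last() lookup are assumptions for illustration, since the diff only shows the first line of the method:

from rest_framework import serializers

class CandidateURLCountSerializer(serializers.Serializer):  # hypothetical name
    # DRF resolves this field by calling get_candidate_urls_count(self, obj).
    candidate_urls_count = serializers.SerializerMethodField()

    def get_candidate_urls_count(self, obj):
        # First line mirrors the diff; the count() fallback is assumed.
        titlepattern = obj.titlepattern_urls.last()
        return titlepattern.candidate_urls.count() if titlepattern else 0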

sde_collections/urls.py

Lines changed: 0 additions & 7 deletions

@@ -13,7 +13,6 @@
     PushToGithubView,
     RequiredUrlsDeleteView,
     TitlePatternViewSet,
-    HealthCheckView,
 )

 router = routers.DefaultRouter()
@@ -33,11 +32,6 @@
         PushToGithubView.as_view(),
         name="push-to-github",
     ),
-    path(
-        "api/health-check/<int:pk>",
-        view=HealthCheckView.as_view(),
-        name="health-check"
-    ),
     path(
         "delete-required-url/<int:pk>",
         view=RequiredUrlsDeleteView.as_view(),
@@ -54,5 +48,4 @@
     # Update an existing CandidateURL instance: /candidate-urls/{id}/
     # Delete an existing CandidateURL instance: /candidate-urls/{id}/
     path("api/", include(router.urls)),
-    path("api/model_inference", views.model_inference, name="model_inference"),
 ]
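
The route comments kept in the diff ("/candidate-urls/" and "/candidate-urls/{id}/") come from the DRF router included at api/. A minimal sketch of that wiring, assuming a CandidateURLViewSet registration that the excerpt implies but does not show (the viewset import and basename are assumptions):

from django.urls import include, path
from rest_framework import routers

from .views import CandidateURLViewSet  # assumed; not shown in the excerpt

router = routers.DefaultRouter()
# DefaultRouter generates list/create at /candidate-urls/ and
# retrieve/update/partial_update/destroy at /candidate-urls/{id}/.
router.register(r"candidate-urls", CandidateURLViewSet, basename="candidate-url")

urlpatterns = [
    path("api/", include(router.urls)),
]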

sde_collections/views.py

Lines changed: 14 additions & 103 deletions

@@ -1,31 +1,22 @@
-import csv
 import re
-from io import StringIO

 from django.contrib.auth import get_user_model
 from django.contrib.auth.mixins import LoginRequiredMixin
 from django.db import models
-from django.http import HttpResponse
 from django.shortcuts import redirect
 from django.urls import reverse
 from django.utils import timezone
-from django.views import View
 from django.views.generic.detail import DetailView
 from django.views.generic.edit import DeleteView
 from django.views.generic.list import ListView
 from rest_framework import generics, status, viewsets
 from rest_framework.response import Response
 from rest_framework.views import APIView

-from Document_Classifier_inference.main import batch_predicts
-
 from .forms import CollectionGithubIssueForm, RequiredUrlForm
 from .models.candidate_url import CandidateURL
 from .models.collection import Collection, RequiredUrls
-from .models.collection_choice_fields import (
-    CurationStatusChoices,
-    WorkflowStatusChoices,
-)
+from .models.collection_choice_fields import CurationStatusChoices, WorkflowStatusChoices
 from .models.pattern import DocumentTypePattern, ExcludePattern, TitlePattern
 from .serializers import (
     CandidateURLBulkCreateSerializer,
@@ -36,46 +27,10 @@
     TitlePatternSerializer,
 )
 from .tasks import push_to_github_task
-from .utils.health_check import health_check

 User = get_user_model()


-def model_inference(request):
-    if request.method == "POST":
-        collection_id = request.POST.get("collection_id")
-        candidate_urls = CandidateURL.objects.filter(
-            collection_id=Collection.objects.get(pk=collection_id),
-        ).exclude(document_type__in=[1, 2, 3, 4, 5, 6])
-        # These list of urls are to be inferred
-        to_infer_url_list = [candidate_url.url for candidate_url in candidate_urls]
-        if to_infer_url_list:
-            collection_id = candidate_urls[0].collection_id
-            prediction, pdf_lists = batch_predicts(
-                "Document_Classifier_inference/config.json", to_infer_url_list
-            )
-            # Update document_type for corresponding URLs
-            for candidate_url in candidate_urls:
-                new_document_type = prediction.get(candidate_url.url)
-                if new_document_type is not None:
-                    candidate_url.document_type = new_document_type
-                    candidate_url.inferenced_by = "model"
-                    candidate_url.save()  # Updating the changes in candidateurl table
-                    # Create a new DocumentTypePattern entry for each URL and its document_type
-                    DocumentTypePattern.objects.create(
-                        collection_id=candidate_url.collection_id,
-                        match_pattern=candidate_url.url.replace("https://", ""),
-                        match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL,
-                        document_type=new_document_type,
-                    )  # Adding the new record in documenttypepattern table
-                    if (
-                        candidate_url.url in pdf_lists
-                    ):  # flagging created for url with pdf response
-                        candidate_url.is_pdf = True
-                        candidate_url.save()
-    return HttpResponse(status=204)
-
-
 class CollectionListView(LoginRequiredMixin, ListView):
     """
     Display a list of collections in the system
@@ -139,8 +94,7 @@ def post(self, request, *args, **kwargs):
         else:
             if "claim_button" in request.POST:
                 user = self.request.user
-                collection.curation_status = CurationStatusChoices.BEING_CURATED
-                collection.workflow_status = WorkflowStatusChoices.CURATION_IN_PROGRESS
+                collection.curation_status = WorkflowStatusChoices.CURATION_IN_PROGRESS
                 collection.curated_by = user
                 collection.curation_started = timezone.now()
                 collection.save()
@@ -310,28 +264,20 @@ def get_queryset(self):

     def create(self, request, *args, **kwargs):
         document_type = request.POST.get("document_type")
-        inferencer = request.POST.get("inferencer")
-        collection_id = request.POST.get("collection")
-        match_pattern = request.POST.get("match_pattern")
-        candidate_url = CandidateURL.objects.get(
-            collection_id=Collection.objects.get(id=collection_id),
-            url="https://" + match_pattern,
-        )
         if not int(document_type) == 0:  # 0=none
-            candidate_url.inferenced_by = inferencer
-            candidate_url.save()
             return super().create(request, *args, **kwargs)
-        try:
-            candidate_url.inferenced_by = ""
-            candidate_url.save()
-            DocumentTypePattern.objects.get(
-                collection_id=Collection.objects.get(id=collection_id),
-                match_pattern=match_pattern,
-                match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL,
-            ).delete()
-            return Response(status=status.HTTP_204_NO_CONTENT)
-        except DocumentTypePattern.DoesNotExist:
-            return Response(status=status.HTTP_204_NO_CONTENT)
+        else:
+            collection_id = request.POST.get("collection")
+            match_pattern = request.POST.get("match_pattern")
+            try:
+                DocumentTypePattern.objects.get(
+                    collection_id=Collection.objects.get(id=collection_id),
+                    match_pattern=match_pattern,
+                    match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL,
+                ).delete()
+                return Response(status=status.HTTP_200_OK)
+            except DocumentTypePattern.DoesNotExist:
+                return Response(status=status.HTTP_204_NO_CONTENT)


 class CollectionViewSet(viewsets.ModelViewSet):
@@ -353,38 +299,3 @@ def post(self, request):
             {"Success": "Started pushing collections to github"},
             status=status.HTTP_200_OK,
         )
-
-
-class HealthCheckView(View):
-    """
-    This view checks whether the rules in indexer db has been correctly reflected
-    in our prod/test sinequa instances or not and at the end generates a report.
-    """
-
-    def get(self, *args, **kwargs):
-        collection = Collection.objects.get(pk=kwargs.get("pk"))
-        sync_check_report = health_check(collection, server_name="production")
-        field_names = [
-            "id",
-            "collection_name",
-            "config_folder",
-            "curation_status",
-            "workflow_status",
-            "pattern_name",
-            "pattern",
-            "scraped_title",
-            "non_compliant_url",
-        ]
-
-        # download the report in CSV format
-        csv_data = StringIO()
-        writer = csv.DictWriter(csv_data, fieldnames=field_names)
-        writer.writeheader()
-        for item in sync_check_report:
-            writer.writerow(item)
-
-        http_response = HttpResponse(content_type="text/csv")
-        http_response["Content-Disposition"] = 'attachment; filename="report.csv"'
-        http_response.write(csv_data.getvalue())
-
-        return http_response
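
The removed HealthCheckView is gone, but the CSV-attachment idiom it used (build the report in a StringIO buffer, then return it as text/csv with a Content-Disposition header) is standard Django. A minimal self-contained sketch of just that mechanics, mirroring the removed code; the rows and field_names arguments stand in for the health_check() report and its columns:

import csv
from io import StringIO

from django.http import HttpResponse


def csv_report_response(rows, field_names):
    # Write the report into an in-memory buffer, as the removed
    # HealthCheckView.get() did with the sync check report.
    csv_data = StringIO()
    writer = csv.DictWriter(csv_data, fieldnames=field_names)
    writer.writeheader()
    for item in rows:
        writer.writerow(item)

    # Serve the buffer as a downloadable attachment named report.csv.
    http_response = HttpResponse(content_type="text/csv")
    http_response["Content-Disposition"] = 'attachment; filename="report.csv"'
    http_response.write(csv_data.getvalue())
    return http_response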
