-import csv
 import re
-from io import StringIO

 from django.contrib.auth import get_user_model
 from django.contrib.auth.mixins import LoginRequiredMixin
 from django.db import models
-from django.http import HttpResponse
 from django.shortcuts import redirect
 from django.urls import reverse
 from django.utils import timezone
-from django.views import View
 from django.views.generic.detail import DetailView
 from django.views.generic.edit import DeleteView
 from django.views.generic.list import ListView
 from rest_framework import generics, status, viewsets
 from rest_framework.response import Response
 from rest_framework.views import APIView

-from Document_Classifier_inference.main import batch_predicts
-
 from .forms import CollectionGithubIssueForm, RequiredUrlForm
 from .models.candidate_url import CandidateURL
 from .models.collection import Collection, RequiredUrls
-from .models.collection_choice_fields import (
-    CurationStatusChoices,
-    WorkflowStatusChoices,
-)
+from .models.collection_choice_fields import CurationStatusChoices, WorkflowStatusChoices
 from .models.pattern import DocumentTypePattern, ExcludePattern, TitlePattern
 from .serializers import (
     CandidateURLBulkCreateSerializer,
     ...
     TitlePatternSerializer,
 )
 from .tasks import push_to_github_task
-from .utils.health_check import health_check

 User = get_user_model()

-def model_inference(request):
-    if request.method == "POST":
-        collection_id = request.POST.get("collection_id")
-        candidate_urls = CandidateURL.objects.filter(
-            collection_id=Collection.objects.get(pk=collection_id),
-        ).exclude(document_type__in=[1, 2, 3, 4, 5, 6])
-        # The list of URLs to run inference on
-        to_infer_url_list = [candidate_url.url for candidate_url in candidate_urls]
-        if to_infer_url_list:
-            collection_id = candidate_urls[0].collection_id
-            prediction, pdf_lists = batch_predicts(
-                "Document_Classifier_inference/config.json", to_infer_url_list
-            )
-            # Update document_type for the corresponding URLs
-            for candidate_url in candidate_urls:
-                new_document_type = prediction.get(candidate_url.url)
-                if new_document_type is not None:
-                    candidate_url.document_type = new_document_type
-                    candidate_url.inferenced_by = "model"
-                    candidate_url.save()  # persist the change to the CandidateURL table
-                    # Create a new DocumentTypePattern entry for each URL and its document_type
-                    DocumentTypePattern.objects.create(
-                        collection_id=candidate_url.collection_id,
-                        match_pattern=candidate_url.url.replace("https://", ""),
-                        match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL,
-                        document_type=new_document_type,
-                    )  # add the new record to the DocumentTypePattern table
-                if (
-                    candidate_url.url in pdf_lists
-                ):  # flag URLs whose response was a PDF
-                    candidate_url.is_pdf = True
-                    candidate_url.save()
-    return HttpResponse(status=204)
-
-
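Side note on the deleted model_inference view above: it issued one save() per CandidateURL inside the loop. A minimal sketch of the same update batched with Django's bulk_update, for reference only; the field names come from the deleted code, everything else (the surrounding queryset and prediction values) is assumed, and the DocumentTypePattern creation is deliberately omitted:

    # Sketch only: batches the per-row saves from the deleted loop above.
    # Assumes the same candidate_urls queryset and the prediction/pdf_lists
    # values returned by batch_predicts().
    to_update = []
    for candidate_url in candidate_urls:
        new_document_type = prediction.get(candidate_url.url)
        if new_document_type is not None:
            candidate_url.document_type = new_document_type
            candidate_url.inferenced_by = "model"
        if candidate_url.url in pdf_lists:
            candidate_url.is_pdf = True
        to_update.append(candidate_url)

    # One UPDATE per batch instead of one query per row.
    CandidateURL.objects.bulk_update(
        to_update, ["document_type", "inferenced_by", "is_pdf"], batch_size=500
    )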
 class CollectionListView(LoginRequiredMixin, ListView):
     """
     Display a list of collections in the system
@@ -139,8 +94,7 @@ def post(self, request, *args, **kwargs):
         else:
             if "claim_button" in request.POST:
                 user = self.request.user
-                collection.curation_status = CurationStatusChoices.BEING_CURATED
-                collection.workflow_status = WorkflowStatusChoices.CURATION_IN_PROGRESS
+                collection.curation_status = WorkflowStatusChoices.CURATION_IN_PROGRESS
                 collection.curated_by = user
                 collection.curation_started = timezone.now()
                 collection.save()
@@ -310,28 +264,20 @@ def get_queryset(self):

     def create(self, request, *args, **kwargs):
         document_type = request.POST.get("document_type")
-        inferencer = request.POST.get("inferencer")
-        collection_id = request.POST.get("collection")
-        match_pattern = request.POST.get("match_pattern")
-        candidate_url = CandidateURL.objects.get(
-            collection_id=Collection.objects.get(id=collection_id),
-            url="https://" + match_pattern,
-        )
         if not int(document_type) == 0:  # 0 = none
-            candidate_url.inferenced_by = inferencer
-            candidate_url.save()
             return super().create(request, *args, **kwargs)
-        try:
-            candidate_url.inferenced_by = ""
-            candidate_url.save()
-            DocumentTypePattern.objects.get(
-                collection_id=Collection.objects.get(id=collection_id),
-                match_pattern=match_pattern,
-                match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL,
-            ).delete()
-            return Response(status=status.HTTP_204_NO_CONTENT)
-        except DocumentTypePattern.DoesNotExist:
-            return Response(status=status.HTTP_204_NO_CONTENT)
+        else:
+            collection_id = request.POST.get("collection")
+            match_pattern = request.POST.get("match_pattern")
+            try:
+                DocumentTypePattern.objects.get(
+                    collection_id=Collection.objects.get(id=collection_id),
+                    match_pattern=match_pattern,
+                    match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL,
+                ).delete()
+                return Response(status=status.HTTP_200_OK)
+            except DocumentTypePattern.DoesNotExist:
+                return Response(status=status.HTTP_204_NO_CONTENT)


 class CollectionViewSet(viewsets.ModelViewSet):
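For context on the rewritten create() above: a non-zero document_type still falls through to the serializer's normal create path, while document_type == 0 now deletes the matching individual-URL DocumentTypePattern instead, answering 200 on success and 204 if no such pattern existed. A minimal usage sketch with DRF's test client; the route and the enum value are assumptions, not taken from this diff:

    from rest_framework.test import APIClient

    client = APIClient()

    # Hypothetical route for the viewset that owns this create() method.
    url = "/api/document-type-patterns/"

    # document_type != 0: handled by the serializer's normal create path.
    client.post(url, {
        "collection": 1,
        "match_pattern": "example.gov/doc.html",
        "match_pattern_type": 1,  # assumed enum value for INDIVIDUAL_URL
        "document_type": 2,
    })

    # document_type == 0: deletes the matching INDIVIDUAL_URL pattern.
    response = client.post(url, {
        "collection": 1,
        "match_pattern": "example.gov/doc.html",
        "document_type": 0,
    })
    assert response.status_code in (200, 204)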
@@ -353,38 +299,3 @@ def post(self, request):
             {"Success": "Started pushing collections to github"},
             status=status.HTTP_200_OK,
         )
-
-
-class HealthCheckView(View):
-    """
-    This view checks whether the rules in the indexer DB have been correctly
-    reflected in our prod/test Sinequa instances and, at the end, generates a report.
-    """
-
-    def get(self, *args, **kwargs):
-        collection = Collection.objects.get(pk=kwargs.get("pk"))
-        sync_check_report = health_check(collection, server_name="production")
-        field_names = [
-            "id",
-            "collection_name",
-            "config_folder",
-            "curation_status",
-            "workflow_status",
-            "pattern_name",
-            "pattern",
-            "scraped_title",
-            "non_compliant_url",
-        ]
-
-        # Download the report in CSV format
-        csv_data = StringIO()
-        writer = csv.DictWriter(csv_data, fieldnames=field_names)
-        writer.writeheader()
-        for item in sync_check_report:
-            writer.writerow(item)
-
-        http_response = HttpResponse(content_type="text/csv")
-        http_response["Content-Disposition"] = 'attachment; filename="report.csv"'
-        http_response.write(csv_data.getvalue())
-
-        return http_response
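One more note on the deleted HealthCheckView: the StringIO buffer was not strictly necessary, since HttpResponse is itself file-like and csv.DictWriter can write to it directly. A minimal sketch under that assumption, reusing the field names from the deleted code; the helper name is hypothetical:

    import csv

    from django.http import HttpResponse

    def write_report_csv(sync_check_report, field_names):
        # Write the health-check report straight into the response,
        # skipping the intermediate StringIO buffer.
        response = HttpResponse(content_type="text/csv")
        response["Content-Disposition"] = 'attachment; filename="report.csv"'
        writer = csv.DictWriter(response, fieldnames=field_names)
        writer.writeheader()
        for item in sync_check_report:
            writer.writerow(item)
        return response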