```diff
 from .utils.github_helper import GitHubHandler


-@celery_app.task()
-def generate_candidate_urls_async(config_folder):
-    """Generate candidate urls using celery."""
-    process = CrawlerProcess(get_project_settings())
-    process.crawl(spider_factory(config_folder))
-    process.start()
-
-
-@celery_app.task()
-def import_candidate_urls_task(collection_ids=[], config_folder_names=[]):
-    s3 = boto3.client(
-        "s3",
-        region_name="us-east-1",
-        aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
-        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
-    )
-    TEMP_FOLDER_NAME = "temp"
-    os.makedirs(TEMP_FOLDER_NAME, exist_ok=True)
-
-    if collection_ids:
-        collections = Collection.objects.filter(id__in=collection_ids)
-    elif config_folder_names:
-        collections = Collection.objects.filter(config_folder__in=config_folder_names)
-    else:
-        collections = Collection.objects.all()
-
-    for collection in collections:
-        s3_file_path = f"scraped_urls/{collection.config_folder}.zip"
-        zip_file_name = f"{TEMP_FOLDER_NAME}/{collection.config_folder}.zip"
-        json_folder_name = f"{os.path.splitext(zip_file_name)[0]}"
-        urls_file = f"{json_folder_name}/urls.json"
-        try:
-            s3.download_file(
-                settings.AWS_STORAGE_BUCKET_NAME,
-                s3_file_path,
-                zip_file_name,
-            )
-            with zipfile.ZipFile(zip_file_name, "r") as zip_ref:
-                zip_ref.extractall(json_folder_name)
-        except botocore.exceptions.ClientError:
-            continue
-        collection.candidate_urls.all().delete()
-
-        print(f"Importing {collection.config_folder}")
-
-        data = json.load(open(urls_file))
-        augmented_data = [
-            {
-                "model": "sde_collections.candidateurl",
-                "fields": {
-                    "collection": collection.pk,
-                    "url": item["url"],
-                    "scraped_title": item["scraped_title"],
-                },
-            }
-            for item in data
-        ]
-
-        json.dump(augmented_data, open(urls_file, "w"))
-
-        collection.candidate_urls.all().delete()
-
-        subprocess.run(f'python manage.py loaddata "{urls_file}"', shell=True)
-        collection.apply_all_patterns()
-        collection.curation_status = 2  # ready to curate
-        collection.workflow_status = (
-            WorkflowStatusChoices.URLS_GENERATED
-        )  # ready to curate
-        collection.save()
-    shutil.rmtree(TEMP_FOLDER_NAME)
-
-
-@celery_app.task()
-def import_all_candidate_urls_task():
-    s3 = boto3.client(
-        "s3",
-        region_name="us-east-1",
-        aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
-        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
-    )
-    TEMP_FOLDER_NAME = "temp"
-    os.makedirs(TEMP_FOLDER_NAME, exist_ok=True)
-
-    s3_file_path = "scraped_urls_all/all_data.zip"
-    zip_file_name = f"{TEMP_FOLDER_NAME}/all_data.zip"
-    json_folder_name = f"{os.path.splitext(zip_file_name)[0]}"
-    urls_file = f"{json_folder_name}/urls.json"
-
-    print("Downloading zip file from S3")
-    try:
-        s3.download_file(
-            settings.AWS_STORAGE_BUCKET_NAME,
-            s3_file_path,
-            zip_file_name,
-        )
-        print("Unzipping")
-        with zipfile.ZipFile(zip_file_name, "r") as zip_ref:
-            zip_ref.extractall(json_folder_name)
-    except botocore.exceptions.ClientError:
-        print("error")
-        return
-
-    data = json.load(open(urls_file))
-    augmented_data = []
-    config_folder_to_pk_dict = dict(
-        Collection.objects.all().values_list(
-            "config_folder", "pk", flat=False, named=True
-        )
-    )
-    # ignore these because they are API collections and don't have URLs
-    ignore_collections = [
-        "/SMD/ASTRO_NAVO_HEASARC/",
-        "/SMD/CASEI_Campaign/",
-        "/SMD/CASEI_Deployment/",
-        "/SMD/CASEI_Instrument/",
-        "/SMD/CASEI_Platform/",
-        "/SMD/CMR_API/",
-        "/SMD/PDS_API_Legacy_All/",
-    ]
-
-    print("Creating django fixture")
-    for item in data:
-        collection_name = item["collection"]
-        if collection_name in ignore_collections:
-            continue
-        try:
-            item_dict = {
-                "model": "sde_collections.candidateurl",
-                "fields": {
-                    "collection": config_folder_to_pk_dict[
-                        item["collection"].split("/")[2]
-                    ],
-                    "url": item["url"],
-                    "scraped_title": item["scraped_title"],
-                },
-            }
-        except Collection.DoesNotExist:
-            continue
-        augmented_data.append(item_dict)
-
-    print("Dumping django fixture to file")
-    json.dump(augmented_data, open(urls_file, "w"))
-
-    print("Deleting existing candidate URLs")
-    CandidateURL.objects.all().delete()
-
-    print("Loading fixture; this may take a while")
-    subprocess.run(f'python manage.py loaddata "{urls_file}"', shell=True)
-
-    print("Applying existing patterns; this may take a while")
-    for collection in Collection.objects.all():
-        collection.apply_all_patterns()
-
-    print("Deleting temp files")
-    shutil.rmtree(TEMP_FOLDER_NAME)
-
-
 def _get_data_to_import(collection, server_name):
     # ignore these because they are API collections and don't have URLs
     ignore_collections = [
```
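
Two details in the removed `import_all_candidate_urls_task` are worth flagging in case this code is ever revived: the `except Collection.DoesNotExist` clause could never fire, since `config_folder_to_pk_dict[...]` is a plain dict lookup and raises `KeyError` for an unknown config folder, and `json.load(open(...))` / `json.dump(..., open(..., "w"))` never close their file handles. A minimal sketch of the fixture-building loop with both fixed, assuming the same `urls.json` item shape (`collection`, `url`, `scraped_title`) and the `urls_file`, `ignore_collections`, and `config_folder_to_pk_dict` values from the removed task:

```python
# Sketch only: the same fixture-building loop with the lookup guarded by
# KeyError and file handles managed by context managers.
with open(urls_file) as f:
    data = json.load(f)

augmented_data = []
for item in data:
    if item["collection"] in ignore_collections:
        continue
    try:
        collection_pk = config_folder_to_pk_dict[item["collection"].split("/")[2]]
    except KeyError:  # a dict raises KeyError, not Collection.DoesNotExist
        continue
    augmented_data.append(
        {
            "model": "sde_collections.candidateurl",
            "fields": {
                "collection": collection_pk,
                "url": item["url"],
                "scraped_title": item["scraped_title"],
            },
        }
    )

with open(urls_file, "w") as f:
    json.dump(augmented_data, f)
```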
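Separately, the removed tasks shelled out with `subprocess.run(..., shell=True)` to load the fixture, and `import_candidate_urls_task` used mutable default arguments (`collection_ids=[]`), which Python shares across calls. If a similar import path returns, Django's `call_command` loads a fixture in-process without spawning a shell; a sketch, reusing `urls_file` from above:

```python
from django.core.management import call_command

# In-process equivalent of subprocess.run(f'python manage.py loaddata "{urls_file}"',
# shell=True); avoids a shell and a second Django process re-importing settings.
call_command("loaddata", urls_file)
```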