
Commit ff69fa7

Remove unused functions for S3 imports etc
1 parent 87307d1 commit ff69fa7

File tree

2 files changed: +0 -187 lines changed


sde_collections/management/commands/load_urls_from_s3.py

Lines changed: 0 additions & 30 deletions
This file was deleted.

sde_collections/tasks.py

Lines changed: 0 additions & 157 deletions
@@ -22,163 +22,6 @@
 from .utils.github_helper import GitHubHandler


-@celery_app.task()
-def generate_candidate_urls_async(config_folder):
-    """Generate candidate urls using celery."""
-    process = CrawlerProcess(get_project_settings())
-    process.crawl(spider_factory(config_folder))
-    process.start()
-
-
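For context, tasks decorated with @celery_app.task() are normally queued from application code rather than called inline. A minimal sketch of how the removed task would have been dispatched before this commit, assuming a running Celery worker and broker (the config folder name is a placeholder):

# Hypothetical dispatch example; "example_config_folder" is a placeholder, not a real collection.
from sde_collections.tasks import generate_candidate_urls_async

result = generate_candidate_urls_async.delay("example_config_folder")  # enqueue on the Celery broker
print(result.id)  # AsyncResult id, useful for tracking the task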
-@celery_app.task()
-def import_candidate_urls_task(collection_ids=[], config_folder_names=[]):
-    s3 = boto3.client(
-        "s3",
-        region_name="us-east-1",
-        aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
-        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
-    )
-    TEMP_FOLDER_NAME = "temp"
-    os.makedirs(TEMP_FOLDER_NAME, exist_ok=True)
-
-    if collection_ids:
-        collections = Collection.objects.filter(id__in=collection_ids)
-    elif config_folder_names:
-        collections = Collection.objects.filter(config_folder__in=config_folder_names)
-    else:
-        collections = Collection.objects.all()
-
-    for collection in collections:
-        s3_file_path = f"scraped_urls/{collection.config_folder}.zip"
-        zip_file_name = f"{TEMP_FOLDER_NAME}/{collection.config_folder}.zip"
-        json_folder_name = f"{os.path.splitext(zip_file_name)[0]}"
-        urls_file = f"{json_folder_name}/urls.json"
-        try:
-            s3.download_file(
-                settings.AWS_STORAGE_BUCKET_NAME,
-                s3_file_path,
-                zip_file_name,
-            )
-            with zipfile.ZipFile(zip_file_name, "r") as zip_ref:
-                zip_ref.extractall(json_folder_name)
-        except botocore.exceptions.ClientError:
-            continue
-        collection.candidate_urls.all().delete()
-
-        print(f"Importing {collection.config_folder}")
-
-        data = json.load(open(urls_file))
-        augmented_data = [
-            {
-                "model": "sde_collections.candidateurl",
-                "fields": {
-                    "collection": collection.pk,
-                    "url": item["url"],
-                    "scraped_title": item["scraped_title"],
-                },
-            }
-            for item in data
-        ]
-
-        json.dump(augmented_data, open(urls_file, "w"))
-
-        collection.candidate_urls.all().delete()
-
-        subprocess.run(f'python manage.py loaddata "{urls_file}"', shell=True)
-        collection.apply_all_patterns()
-        collection.curation_status = 2  # ready to curate
-        collection.workflow_status = (
-            WorkflowStatusChoices.URLS_GENERATED
-        )  # ready to curate
-        collection.save()
-    shutil.rmtree(TEMP_FOLDER_NAME)
-
-
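Both removed import tasks worked the same way: build a Django fixture in memory, write it to urls.json, and shell out to manage.py loaddata. A minimal sketch of a single record in that shape (the path, primary key, URL, and title below are placeholders, not real data):

# Illustrative only: "example_folder", the pk, URL, and title are placeholders.
import json
import os

os.makedirs("temp/example_folder", exist_ok=True)  # real code used temp/<config_folder>/

fixture_record = {
    "model": "sde_collections.candidateurl",
    "fields": {
        "collection": 1,                         # Collection primary key
        "url": "https://example.nasa.gov/page",
        "scraped_title": "Example Page Title",
    },
}

with open("temp/example_folder/urls.json", "w") as f:
    json.dump([fixture_record], f)               # loaddata expects a list of records

# Then loaded with: python manage.py loaddata "temp/example_folder/urls.json"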
-@celery_app.task()
-def import_all_candidate_urls_task():
-    s3 = boto3.client(
-        "s3",
-        region_name="us-east-1",
-        aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
-        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
-    )
-    TEMP_FOLDER_NAME = "temp"
-    os.makedirs(TEMP_FOLDER_NAME, exist_ok=True)
-
-    s3_file_path = "scraped_urls_all/all_data.zip"
-    zip_file_name = f"{TEMP_FOLDER_NAME}/all_data.zip"
-    json_folder_name = f"{os.path.splitext(zip_file_name)[0]}"
-    urls_file = f"{json_folder_name}/urls.json"
-
-    print("Downloading zip file from S3")
-    try:
-        s3.download_file(
-            settings.AWS_STORAGE_BUCKET_NAME,
-            s3_file_path,
-            zip_file_name,
-        )
-        print("Unzipping")
-        with zipfile.ZipFile(zip_file_name, "r") as zip_ref:
-            zip_ref.extractall(json_folder_name)
-    except botocore.exceptions.ClientError:
-        print("error")
-        return
-
-    data = json.load(open(urls_file))
-    augmented_data = []
-    config_folder_to_pk_dict = dict(
-        Collection.objects.all().values_list(
-            "config_folder", "pk", flat=False, named=True
-        )
-    )
-    # ignore these because they are API collections and don't have URLs
-    ignore_collections = [
-        "/SMD/ASTRO_NAVO_HEASARC/",
-        "/SMD/CASEI_Campaign/",
-        "/SMD/CASEI_Deployment/",
-        "/SMD/CASEI_Instrument/",
-        "/SMD/CASEI_Platform/",
-        "/SMD/CMR_API/",
-        "/SMD/PDS_API_Legacy_All/",
-    ]
-
-    print("Creating django fixture")
-    for item in data:
-        collection_name = item["collection"]
-        if collection_name in ignore_collections:
-            continue
-        try:
-            item_dict = {
-                "model": "sde_collections.candidateurl",
-                "fields": {
-                    "collection": config_folder_to_pk_dict[
-                        item["collection"].split("/")[2]
-                    ],
-                    "url": item["url"],
-                    "scraped_title": item["scraped_title"],
-                },
-            }
-        except Collection.DoesNotExist:
-            continue
-        augmented_data.append(item_dict)
-
-    print("Dumping django fixture to file")
-    json.dump(augmented_data, open(urls_file, "w"))
-
-    print("Deleting existing candidate URLs")
-    CandidateURL.objects.all().delete()
-
-    print("Loading fixture; this may take a while")
-    subprocess.run(f'python manage.py loaddata "{urls_file}"', shell=True)
-
-    print("Applying existing patterns; this may take a while")
-    for collection in Collection.objects.all():
-        collection.apply_all_patterns()
-
-    print("Deleting temp files")
-    shutil.rmtree(TEMP_FOLDER_NAME)
-
-
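The removed bulk task resolved each URL's owning collection by turning a values_list queryset into a config_folder-to-pk lookup and indexing it with the third path segment. A small sketch of that pattern, assuming Collection is importable from the app's models (exact module path assumed):

# Sketch only; the import path for Collection is an assumption.
from sde_collections.models import Collection

config_folder_to_pk = dict(
    Collection.objects.values_list("config_folder", "pk")  # (config_folder, pk) pairs in one query
)

path = "/SMD/ASTRO_NAVO_HEASARC/"                 # example path taken from the scraped data above
config_folder = path.split("/")[2]                # -> "ASTRO_NAVO_HEASARC"
collection_pk = config_folder_to_pk.get(config_folder)  # None if the folder has no Collection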
 def _get_data_to_import(collection, server_name):
     # ignore these because they are API collections and don't have URLs
     ignore_collections = [
