|
1 | 1 | """Celery tasks invoked from the API endpoints.""" |
| 2 | +import json |
| 3 | +import logging |
| 4 | +import rdflib |
| 5 | +import redis |
| 6 | +import requests |
| 7 | +from atenvironment import environment |
| 8 | +from rdflib import URIRef |
| 9 | +from urllib.parse import urlparse |
| 10 | +from urllib.error import URLError |
| 11 | +from tsa.analyzer import Analyzer |
2 | 12 | from tsa.celery import celery |
| 13 | +from tsa.transformation import PipelineFactory |
| 14 | + |
@celery.task
@environment('ETL', 'VIRTUOSO')
def system_check(etl, virtuoso):
    """Verify that the LP-ETL and Virtuoso endpoints are reachable.

    :param etl: base URL of the LP-ETL instance (injected from env)
    :param virtuoso: base URL of the Virtuoso instance (injected from env)
    :raises requests.HTTPError: when either endpoint answers with an error status
    """
    log = logging.getLogger(__name__)
    log.info("System check started")

    log.info(f"Testing LP-ETL, URL: {etl!s}")
    # explicit timeout so a dead endpoint fails the check instead of
    # hanging the worker indefinitely (requests has no default timeout)
    requests.get(etl, timeout=30).raise_for_status()

    virtuoso_url = f"{virtuoso!s}/sparql"
    log.info(f"Testing virtuoso, URL: {virtuoso_url}")
    requests.get(virtuoso_url, timeout=30).raise_for_status()

    log.info("System check successful")
| 28 | + |
3 | 29 |
|
@celery.task
def hello():
    """Trivial liveness task: return a constant greeting string."""
    greeting = "Hello world!"
    return greeting
7 | 33 |
|
| 34 | + |
@celery.task
def analyze(iri, etl=True):
    """Analyze the distribution at ``iri``.

    :param iri: IRI of the RDF distribution to analyze
    :param etl: when True, run the LP-ETL chain (transform -> poll -> inspect)
                asynchronously; when False, parse the resource directly
    :return: the analysis result dict when ``etl`` is False, otherwise None
    """
    log = logging.getLogger(__name__)
    log.info(f"Analyzing {iri!s}")
    if etl:
        # fire-and-forget chord: transform creates the pipeline, poll waits
        # for it, inspect analyzes the result
        (transform.s(iri) | poll.s() | inspect.s()).apply_async()
    else:
        guess = rdflib.util.guess_format(iri)
        if guess is None:
            r = requests.head(iri)
            r.raise_for_status()
            guess = r.headers.get('content-type')
            if guess is not None:
                # strip MIME parameters such as "; charset=utf-8" —
                # rdflib does not accept them as part of a format name
                guess = guess.split(';')[0].strip()
        g = rdflib.ConjunctiveGraph()
        log.info(f"Guessing format to be {guess!s}")
        g.parse(iri, format=guess)
        a = Analyzer()
        index(g, iri)
        return a.analyze(g)
| 53 | + |
@environment('REDIS')
def index(g, source_iri, redis_cfg):
    """Index every triple of graph ``g`` in redis.

    Each subject, predicate, object and the source IRI becomes a redis set
    keyed by that term and holding the other terms of the triple; all keys
    expire after one hour.

    :param g: rdflib graph whose triples are indexed
    :param source_iri: IRI of the source the triples came from
    :param redis_cfg: redis connection URL (injected from env)
    """
    r = redis.StrictRedis.from_url(redis_cfg)
    pipe = r.pipeline()
    exp = 60 * 60  # 1 hour TTL
    # loop-invariant: convert the source IRI once, not per triple
    source_iri = str(source_iri)
    for (s, p, o) in g:
        s, p, o = str(s), str(p), str(o)

        pipe.sadd(s, source_iri, p, o)
        pipe.sadd(p, source_iri, s, o)
        pipe.sadd(o, source_iri, p, s)
        pipe.sadd(source_iri, s, p, o)

        pipe.expire(s, exp)
        pipe.expire(p, exp)
        pipe.expire(o, exp)
        pipe.expire(source_iri, exp)
    pipe.execute()
| 75 | + |
@celery.task
@environment('REDIS')
def analyze_upload(key, mime, etl, redis_cfg):
    """Analyze an uploaded file stored in redis under ``key``.

    Uploads of ~1 MB or more are deleted instead of analyzed.

    :param key: redis key holding the uploaded bytes
    :param mime: RDF serialization format of the upload
    :param etl: unused here; kept for interface compatibility
    :param redis_cfg: redis connection URL (injected from env)
    :return: analysis result dict, or None when the upload was too big
    """
    log = logging.getLogger(__name__)
    r = redis.StrictRedis.from_url(redis_cfg)
    if r.strlen(key) < 1024 * 1024:  # approx 1 MB
        g = rdflib.ConjunctiveGraph()
        g.parse(data=r.get(key), format=mime)
        a = Analyzer()
        # NOTE(review): the key is not deleted on success — confirm whether
        # the upload is meant to stay in redis after analysis
        return a.analyze(g)
    else:
        # Logger.warn is a deprecated alias of Logger.warning
        log.warning(f"Not analyzing an upload as it's too big: {key!s}")
        r.delete(key)
| 89 | + |
| 90 | + |
@celery.task
def inspect(iri):
    """Parse the RDF resource at ``iri`` and return its analysis.

    :param iri: IRI of the resource to parse (format guessed by rdflib)
    :return: result of ``Analyzer.analyze`` on the parsed graph
    """
    # the original created an unused logger here; removed
    g = rdflib.ConjunctiveGraph()
    g.parse(iri)
    return Analyzer().analyze(g)
| 98 | + |
| 99 | + |
@celery.task
@environment('ETL', 'VIRTUOSO', 'DBA_PASSWORD')
def transform(iri, etl, virtuoso, dbaPass):
    """Create an LP-ETL pipeline for ``iri`` and start its execution.

    :param iri: IRI of the distribution to run through the pipeline
    :param etl: base URL of the LP-ETL instance (injected from env)
    :param virtuoso: base URL of the Virtuoso instance (injected from env)
    :param dbaPass: Virtuoso dba password (injected from env)
    :return: IRI of the started execution, rebased onto the ETL base URL
    :raises requests.HTTPError: when pipeline creation or the execution
        trigger answers with an error status
    """
    log = logging.getLogger(__name__)
    # create pipeline and call to start executions
    # prepare JSON-LD pipeline

    log.info(f"Prepare pipeline for {iri!s}")
    pf = PipelineFactory()
    # only the hostname of the Virtuoso URL is used; port 1111 is
    # presumably the Virtuoso ISQL port — TODO confirm
    p = urlparse(virtuoso)
    pipeline = json.dumps(pf.createPipeline(iri, {'server': p.hostname, 'port': 1111, 'user': 'dba', 'password': dbaPass, 'iri': iri}))

    log.info(f"Pipeline:\n{pipeline!s}")

    # create the pipeline: multipart upload of the JSON-LD definition
    r = requests.post(f"{etl!s}/resources/pipelines", files={'pipeline': pipeline})
    r.raise_for_status()

    # the ETL answers with a TriG document describing the created pipeline
    g = rdflib.ConjunctiveGraph()
    g.parse(data=r.text, format="trig")

    # the subject typed as lp:Pipeline is the new pipeline's IRI
    pipeline = g.value(object=URIRef("http://linkedpipes.com/ontology/Pipeline"), predicate=rdflib.namespace.RDF.type)
    log.info(f"Pipeline IRI: {pipeline!s}")

    # POST /resources/executions starts the pipeline
    r = requests.post(f"{etl!s}/resources/executions?pipeline={pipeline}")
    r.raise_for_status()
    log.info(f"Execution trigger result:\n{r.json()!s}")
    # keep only the execution id and rebase it onto the configured ETL URL
    return f"{etl!s}/resources/executions/{r.json()['iri'].split('/')[-1]}"
| 129 | + |
| 130 | + |
@celery.task(bind=True, retry_backoff=True, max_retries=None, default_retry_delay=30, time_limit=60*60)
def poll(self, iri):
    """Poll an LP-ETL execution until it finishes or fails.

    Retries itself with backoff while the execution is still running.

    :param iri: IRI of the execution to poll
    :return: empty string placeholder once the execution finished
    :raises EtlJobFailed: when the execution ends in the failed state
    """
    def after_return(status, retval, task_id, args, kwargs, einfo):
        # An instance-attribute function is NOT bound, so this hook must not
        # take ``self`` — the original's extra parameter would have made
        # Celery's call fail with a TypeError.
        # NOTE(review): cleanup() requires a pipeline IRI argument that is
        # not supplied here — confirm the intended cleanup invocation.
        cleanup.apply_async()
    self.after_return = after_return

    log = logging.getLogger(__name__)
    log.info(f"Polling {iri!s}")

    r = requests.get(iri + "/overview")
    content = r.text
    log.info(content)
    r.raise_for_status()

    j = json.loads(content)
    status_iri = j['status']['@id']
    if status_iri == "http://etl.linkedpipes.com/resources/status/failed":
        log.error("Execution failed")

        try:
            r = requests.get(iri + "/logs")
            r.raise_for_status()
            log.error("ETL log:\n" + r.text)
        except requests.exceptions.HTTPError as e:
            # the original referenced a bare ``HTTPError`` name that was
            # never imported; use the exception requests actually raises
            raise EtlJobFailed(r) from e

        raise EtlJobFailed(r)
    elif status_iri != "http://etl.linkedpipes.com/resources/status/finished":
        log.info("Execution is not finished yet")
        self.retry()
    else:
        # the original logged an undefined name ``g`` here (NameError);
        # log the overview document we already hold instead
        log.info(f"Final overview:\n{content!s}")
        result = ""
        return result
| 165 | + |
| 166 | + |
@celery.task
@environment('ETL')
def cleanup(iri, etl):
    """Delete the LP-ETL pipeline identified by ``iri``.

    :param iri: IRI of the pipeline to delete
    :param etl: base URL of the LP-ETL instance (injected from env)
    :raises requests.HTTPError: when the ETL answers with an error status
    """
    log = logging.getLogger(__name__)
    log.info(f"Deleting {iri!s}")

    # use the same /resources/pipelines endpoint the pipeline was created
    # on in transform(); the original hit /pipelines, which is inconsistent
    r = requests.delete(f"{etl!s}/resources/pipelines?iri={iri!s}")
    r.raise_for_status()

    log.info(f"Pipeline {iri!s} deleted")
| 177 | + |
| 178 | + |
class EtlError(Exception):
    """Base class for errors raised by the LP-ETL integration."""
| 181 | + |
| 182 | + |
class EtlJobFailed(EtlError):
    """Raised when an LP-ETL execution ends in the failed state."""
0 commit comments