diff --git a/.gitignore b/.gitignore
index 73fdf9b..3f58470 100644
--- a/.gitignore
+++ b/.gitignore
@@ -165,6 +165,6 @@ cython_debug/
 .idea/
 repository_data.private.json
-*.csv
+# *.csv
 google-creds.json
\ No newline at end of file
diff --git a/config.json b/config.json
index 418f74d..82ed6b6 100644
--- a/config.json
+++ b/config.json
@@ -9,7 +9,7 @@
         "nginx": [],
         "build": true
     },
-    { 
+    {
         "serviceName": "word_score",
         "modelBasePath": "src/search/word_score/local/.",
         "apiBasePath": "/search/word_score/local/",
@@ -18,6 +18,18 @@
         "nginx": [],
         "build": true
     },
+    {
+        "serviceName": "weather_update",
+        "modelBasePath": "src/weather_update/local/.",
+        "apiBasePath": "/weather_update/local/",
+        "containerPort": 8000,
+        "environment": {
+            "HASURA_ADMIN_SECRET": "${HASURA_ADMIN_SECRET}",
+            "AKAI_AUTH_BEARER": "${AKAI_AUTH_BEARER}"
+        },
+        "nginx": [],
+        "build": true
+    },
     {
         "serviceName": "spell_check",
         "modelBasePath": "src/spell_check/kenlm/local/.",
diff --git a/repository_data.json b/repository_data.json
index 3a095ff..7700002 100644
--- a/repository_data.json
+++ b/repository_data.json
@@ -130,6 +130,14 @@
                     "request_class": "ModelRequest"
                 }
             }
+        },
+        "weather_update": {
+            "local": {
+                "__is_async": false,
+                "__is_base": true,
+                "model_class": "Model",
+                "request_class": "ModelRequest"
+            }
         }
     }
 }
\ No newline at end of file
diff --git a/src/weather_update/README.md b/src/weather_update/README.md
new file mode 100644
index 0000000..170201c
--- /dev/null
+++ b/src/weather_update/README.md
@@ -0,0 +1,7 @@
+# Weather Update
+
+Weather information PDFs:
+- https://mausam.imd.gov.in/bhubaneswar/mcdata/1730Z.pdf
+- https://mausam.imd.gov.in/bhubaneswar/mcdata/District.pdf
+
+This folder parses the tables in these PDFs, builds a merged dataframe, and updates the database.
\ No newline at end of file
diff --git a/src/weather_update/__init__.py b/src/weather_update/__init__.py
new file mode 100644
index 0000000..49202e0
--- /dev/null
+++ b/src/weather_update/__init__.py
@@ -0,0 +1 @@
+from .local import *
\ No newline at end of file
diff --git a/src/weather_update/local/.env_template b/src/weather_update/local/.env_template
new file mode 100644
index 0000000..39ed997
--- /dev/null
+++ b/src/weather_update/local/.env_template
@@ -0,0 +1,2 @@
+HASURA_ADMIN_SECRET=
+AKAI_AUTH_BEARER=
\ No newline at end of file
diff --git a/src/weather_update/local/Dockerfile b/src/weather_update/local/Dockerfile
new file mode 100644
index 0000000..a13dea4
--- /dev/null
+++ b/src/weather_update/local/Dockerfile
@@ -0,0 +1,18 @@
+# Use an official Python runtime as a parent image
+FROM python:3.9-slim
+
+WORKDIR /app
+
+# Install Python requirements
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+
+# System packages: libgl1/libglib2.0 for OpenCV, ghostscript for camelot
+RUN apt-get update && apt-get install -y libgl1-mesa-glx libglib2.0-0
+RUN DEBIAN_FRONTEND=noninteractive apt-get install -y ghostscript python3-tk
+
+# Copy the rest of the application code to the working directory
+COPY . /app/
+EXPOSE 8000
+# Set the entrypoint for the container
+CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
\ No newline at end of file
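
The system packages in the Dockerfile above are what camelot's `cv` backend needs at runtime: `ghostscript` for PDF rendering, and `libgl1-mesa-glx`/`libglib2.0-0` for OpenCV. A quick way to confirm a built image has everything in place — a sketch, not part of this patch (`check_deps.py` is a hypothetical name):

```python
# check_deps.py -- hypothetical sanity check, run inside the container, e.g.:
#   docker run weather_update python check_deps.py
# All three imports resolve only if the pip and apt layers above succeeded.
import camelot    # needs system-level ghostscript
import cv2        # pulled in by camelot-py[cv]; needs libgl1-mesa-glx and libglib2.0-0
import pdfplumber

print("camelot", camelot.__version__)
print("opencv", cv2.__version__)
```
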
diff --git a/src/weather_update/local/README.md b/src/weather_update/local/README.md
new file mode 100644
index 0000000..857eedc
--- /dev/null
+++ b/src/weather_update/local/README.md
@@ -0,0 +1,17 @@
+# Weather Update
+
+## Test Deployment
+
+- Git clone the repo and cd to the project location.
+- cd to `local`, i.e., `cd ./src/weather_update/local`.
+- Make a `.env` following the template in the `.env_template` file.
+- Start your Docker engine and run `docker build -t weather_update .`.
+- Run `docker run --env-file .env -p 8000:8000 weather_update`.
+- `curl -X POST -H "Content-Type: application/json" -d '{}' http://0.0.0.0:8000/`.
+- The response to the above request is:
+```
+{
+    "status_code": <status code of the POST request to the database>
+}
+```
\ No newline at end of file
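
The same request from Python instead of curl — a minimal sketch, assuming the container from the steps above is listening on port 8000:

```python
# Minimal Python equivalent of the curl command in the README above.
import requests

resp = requests.post("http://0.0.0.0:8000/", json={})
print(resp.json())  # e.g. {"status_code": 200} -- whatever the upstream insert returned
```
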
+` +{ + "status_code": status code of the post request to database. +} +` + diff --git a/src/weather_update/local/__init__.py b/src/weather_update/local/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/weather_update/local/api.py b/src/weather_update/local/api.py new file mode 100644 index 0000000..584bab3 --- /dev/null +++ b/src/weather_update/local/api.py @@ -0,0 +1,24 @@ +from model import Model +from request import ModelRequest +from quart import Quart, request +import aiohttp + +app = Quart(__name__) + +model = None + +@app.before_serving +async def startup(): + app.client = aiohttp.ClientSession() + global model + model = Model(app) + +@app.route('/', methods=['POST']) +async def update(): + global model + data = await request.get_json() + req = ModelRequest(**data) + return model.inference(req) + +if __name__ == "__main__": + app.run() \ No newline at end of file diff --git a/src/weather_update/local/content/DistrictMapping.csv b/src/weather_update/local/content/DistrictMapping.csv new file mode 100644 index 0000000..01721ee --- /dev/null +++ b/src/weather_update/local/content/DistrictMapping.csv @@ -0,0 +1,40 @@ +Name Mentioned,Mapped District +BALASORE,Balasore +CHANDBALI,Bhadrak +CUTTACK,Cuttack +PARADIP,Jagatsinghpur +BHUBANESWAR,Khurda +GOPALPUR,Ganjam +PURI,Puri +ANGUL,Angul +BARIPADA,Mayurbhanj +JHARSUGUDA,Jharsuguda +KEONJHARGARH,Keonjhar +SAMBALPUR,Sambalpur +SUNDARGARH,Sundargarh +HIRAKUD,Sambalpur +TALCHER,Angul +BHAWANIPATNA,Kalahandi +BOLANGIR,Bolangir +KORAPUT,Koraput +PHULABANI,Kandhamal +TITLAGARH,Bolangir +MALKANGIRI,Malkangiri +SONEPUR,Sonepur +DARINGIBADI,Kandhamal +NAYAGARH,Nayagarh +BOUDH,Boudh +CHATRAPUR,Ganjam +PARALAKHEMUNDI,Gajapati +RAYAGADA,Rayagada +BHADRAK,Bhadrak +JAJPUR,Jajpur +DHENKANAL,Dhenkanal +BARGARH,Bargarh +DEOGARH,Deogarh +NUAPADA,Nuapada +NAWARANGPUR,Nabarangapur +KHORDHA,Khurda +JAGATSINGHPUR,Jagatsinghpur +KENDRAPADA,Kendrapara +ROURKELA,Sundargarh diff --git a/src/weather_update/local/model.py b/src/weather_update/local/model.py new file mode 100644 index 0000000..209b397 --- /dev/null +++ b/src/weather_update/local/model.py @@ -0,0 +1,54 @@ +from request import ModelRequest +from utils import parse_pdfs +from datetime import date +import requests +import json +import os + +secret = os.environ.get('HASURA_ADMIN_SECRET') +token = os.environ.get('AKAI_AUTH_BEARER') + + +class Model: + def __new__(cls, context): + cls.context = context + if not hasattr(cls, 'instance'): + cls.instance = super(Model, cls).__new__(cls) + return cls.instance + + def inference(self, request: ModelRequest): + df = parse_pdfs() + json_df = df.to_dict(orient='records') + + url = 'https://hasura.staging.akai.samagra.io/api/rest/getlastdocumentid' + headers = { + 'x-hasura-admin-secret': secret + } + response = requests.get(url, headers=headers) + data = json.loads(response.text) + id = data['document'][0]['id'] + today = date.today() + + data = [] + for item in json_df: + id += 1 + ob = { + "id": id, + "content": str(item), + "tags": str(today) + " Weather Update" + } + data.append(ob) + + url = 'https://staging.akai.samagra.io/document' + headers = { + 'Authorization': f'bearer {token}', + 'Content-Type': 'application/json' + } + + response = requests.post(url, headers=headers, json=data) + return {"status_code": response.status_code} + + + + + diff --git a/src/weather_update/local/request.py b/src/weather_update/local/request.py new file mode 100644 index 0000000..fde2b72 --- /dev/null +++ b/src/weather_update/local/request.py @@ -0,0 +1,10 
diff --git a/src/weather_update/local/request.py b/src/weather_update/local/request.py
new file mode 100644
index 0000000..fde2b72
--- /dev/null
+++ b/src/weather_update/local/request.py
@@ -0,0 +1,10 @@
+import json
+
+
+class ModelRequest():
+    def __init__(self, text="None"):
+        self.text = text
+
+    def to_json(self):
+        return json.dumps(self, default=lambda o: o.__dict__,
+                          sort_keys=True, indent=4)
\ No newline at end of file
diff --git a/src/weather_update/local/requirements.txt b/src/weather_update/local/requirements.txt
new file mode 100644
index 0000000..8d15566
--- /dev/null
+++ b/src/weather_update/local/requirements.txt
@@ -0,0 +1,9 @@
+quart
+aiohttp
+camelot-py[cv]
+ghostscript
+excalibur-py
+PyPDF2==2.0.0
+pdfplumber
+requests
+pandas
\ No newline at end of file
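
`utils.py` below collapses duplicate district rows with `custom_aggregate`, which averages the entries that parse as floats and otherwise keeps the last raw value. A behaviour sketch:

```python
# Expected behaviour of custom_aggregate (defined in utils.py below):
custom_aggregate(["1.0", "3.0"])   # -> 2.0   (mean of the numeric entries)
custom_aggregate(["2", "NIL"])     # -> 2.0   (non-numeric entries are ignored)
custom_aggregate(["NIL", "N/A"])   # -> "N/A" (no numerics, so the last raw value wins)
```
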
diff --git a/src/weather_update/local/utils.py b/src/weather_update/local/utils.py
new file mode 100644
index 0000000..f987184
--- /dev/null
+++ b/src/weather_update/local/utils.py
@@ -0,0 +1,106 @@
+import camelot
+import pandas as pd
+import pdfplumber
+import requests
+from io import BytesIO
+
+pdf1 = "https://mausam.imd.gov.in/bhubaneswar/mcdata/District.pdf"
+pdf2 = "https://mausam.imd.gov.in/bhubaneswar/mcdata/1730Z.pdf"
+
+
+def custom_aggregate(x):
+    """Average the float-convertible entries of x; fall back to the last raw value."""
+    numeric_values = []
+
+    t = None
+    for i in x:
+        t = i
+        try:
+            numeric_values.append(float(i))
+        except ValueError:
+            pass
+
+    if numeric_values:
+        return sum(numeric_values) / len(numeric_values)
+    else:
+        return t
+
+
+def parse_pdfs():
+    table = camelot.read_pdf(pdf1, pages="all")
+
+    first_page = table[0].df.copy(deep=True)
+
+    # camelot doesn't read rows 5-6 of the first page properly, so read them with pdfplumber
+    response = requests.get(pdf1)
+    with pdfplumber.open(BytesIO(response.content)) as pdf:
+        temp_page = pdf.pages[0]
+        temp_table = temp_page.extract_table()
+        temp_df = pd.DataFrame(temp_table[1:], columns=temp_table[0])
+        first_page.iloc[5:7, 1:8] = temp_df.iloc[13:15, 2:]
+
+    second_page = table[1].df.copy(deep=True)
+
+    first_table = pd.concat([first_page, second_page], ignore_index=True)
+    first_table.drop(columns=8, inplace=True)
+
+    # row 15 is a continuation of row 14; merge its cells into row 14, then drop it below
+    for col in first_table.columns[2:7]:
+        first_table.at[14, col] = str(first_table.at[14, col]) + " \n" + str(first_table.at[15, col])
+
+    # prepend the group header (row 0) to the column labels in row 1
+    first_table.iloc[0, 3:7] = first_table.iloc[0, 2]
+    first_table.iloc[1] = first_table.iloc[0] + "\n" + first_table.iloc[1]
+
+    first_table.drop([0, 15], inplace=True)
+    first_table.reset_index(drop=True, inplace=True)
+
+    third_page = table[2].df.copy(deep=True)
+    fourth_page = table[3].df.copy(deep=True)
+    fifth_page = table[4].df.copy(deep=True)
+
+    second_table = pd.concat([third_page, fourth_page, fifth_page], ignore_index=True)
+
+    # prepend the group header (row 0) to the column labels in row 1
+    second_table.iloc[0, 3:7] = second_table.iloc[0, 2]
+    second_table.iloc[1] = second_table.iloc[0] + "\n" + second_table.iloc[1]
+    second_table.drop(0, inplace=True)
+    second_table.reset_index(drop=True, inplace=True)
+
+    # fill merged cells: an empty district cell takes the value from the row above
+    for df in [first_table, second_table]:
+        for i in range(2, len(df)):
+            if df.loc[i, 0] == '':
+                df.loc[i, 0] = df.loc[i - 1, 0]
+
+    semi_merge = pd.concat([first_table, second_table.iloc[:, 2:]], axis=1)
+
+    first_row = semi_merge.iloc[:1]
+    sorted_df = semi_merge.iloc[1:].sort_values(by=1)
+    semi_merge = pd.concat([first_row, sorted_df], ignore_index=True)
+
+    table2 = camelot.read_pdf(pdf2, pages="all")
+    table = table2[0].df.copy(deep=True)
+
+    # map station names to their districts
+    mapping = pd.read_csv("./content/DistrictMapping.csv")
+    district_map = dict(zip(mapping['Name Mentioned'], mapping['Mapped District']))
+
+    table[1] = table[1].map(lambda name: district_map.get(name, name))
+    table.replace('TRACE', 0.1, inplace=True)  # treat 'TRACE' readings as 0.1
+
+    # camelot misses this cell; take it from pdfplumber and strip stray characters
+    response = requests.get(pdf2)
+    with pdfplumber.open(BytesIO(response.content)) as pdf:
+        temp_page = pdf.pages[0]
+        temp_table = temp_page.extract_table()
+        temp_df = pd.DataFrame(temp_table[1:], columns=temp_table[0])
+        table.at[1, 6] = temp_df.at[4, '24 HOURS'].replace("I", "").replace("S", "")
+    table.drop(columns=0, inplace=True)
+
+    # average duplicate district rows
+    first_row = table.iloc[:1]
+    result_df = table.iloc[1:].groupby(1).agg(custom_aggregate).reset_index()
+    table = pd.concat([first_row, result_df], ignore_index=True)
+
+    merge = pd.concat([semi_merge, table.iloc[:, 1:]], axis=1)
+
+    # promote the first row to the header
+    merge.columns = merge.iloc[0]
+    merge = merge.drop(merge.index[0])
+    merge = merge.reset_index(drop=True)
+
+    return merge
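
A quick way to exercise the parser on its own, without the Quart service — a sketch that downloads the live IMD PDFs, so it needs network access and must run from `src/weather_update/local/` so that `./content/DistrictMapping.csv` resolves:

```python
from utils import parse_pdfs

df = parse_pdfs()
print(df.columns.tolist())  # header row promoted from the merged tables
print(df.head())
```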