2 changes: 1 addition & 1 deletion .gitignore
@@ -165,6 +165,6 @@ cython_debug/
.idea/

repository_data.private.json
*.csv
# *.csv

google-creds.json
14 changes: 13 additions & 1 deletion config.json
@@ -9,7 +9,7 @@
"nginx": [],
"build": true
},
{
{
"serviceName": "word_score",
"modelBasePath": "src/search/word_score/local/.",
"apiBasePath": "/search/word_score/local/",
@@ -18,6 +18,18 @@
"nginx": [],
"build": true
},
{
"serviceName": "weather_update",
"modelBasePath": "src/weather_update/local/.",
"apiBasePath": "/weather_update/local/",
"containerPort": 8000,
"environment": {
"HASURA_ADMIN_SECRET": "${HASURA_ADMIN_SECRET}",
"AKAI_AUTH_BEARER": "${AKAI_AUTH_BEARER}"
},
"nginx": [],
"build": true
},
{
"serviceName": "spell_check",
"modelBasePath": "src/spell_check/kenlm/local/.",
8 changes: 8 additions & 0 deletions repository_data.json
@@ -130,6 +130,14 @@
"request_class": "ModelRequest"
}
}
},
"weather_update": {
"local": {
"__is_async": false,
"__is_base": true,
"model_class": "Model",
"request_class": "ModelRequest"
}
}
}
}
7 changes: 7 additions & 0 deletions src/weather_update/README.md
@@ -0,0 +1,7 @@
# Weather Update

Weather information PDFs:
- https://mausam.imd.gov.in/bhubaneswar/mcdata/1730Z.pdf
- https://mausam.imd.gov.in/bhubaneswar/mcdata/District.pdf

This folder parses the tables in these PDFs, builds a merged dataframe, and updates the database.
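
A minimal sketch of running the parser by hand (assuming the packages in `local/requirements.txt` are installed and the working directory is `src/weather_update/local`, so the relative `./content/DistrictMapping.csv` path resolves):

```python
# Hypothetical manual run; parse_pdfs() lives in local/utils.py.
from utils import parse_pdfs

df = parse_pdfs()   # downloads both IMD PDFs and merges their tables
print(df.head())    # one row per district with the merged weather columns
```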
1 change: 1 addition & 0 deletions src/weather_update/__init__.py
@@ -0,0 +1 @@
from .local import *
2 changes: 2 additions & 0 deletions src/weather_update/local/.env_template
@@ -0,0 +1,2 @@
HASURA_ADMIN_SECRET=
AKAI_AUTH_BEARER=
18 changes: 18 additions & 0 deletions src/weather_update/local/Dockerfile
@@ -0,0 +1,18 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim

WORKDIR /app


# Install Python requirements
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

# System packages for camelot: OpenCV runtime libs, Ghostscript, and Tk
RUN apt-get update && apt-get install -y libgl1-mesa-glx libglib2.0-0
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y ghostscript python3-tk


# Copy the rest of the application code to the working directory
COPY . /app/
EXPOSE 8000
# Set the entrypoint for the container
CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
17 changes: 17 additions & 0 deletions src/weather_update/local/README.md
@@ -0,0 +1,17 @@
# Weather Update

## Test Deployment

- Git clone the repo and cd to the project root.
- cd to `local`, i.e., `cd ./src/weather_update/local`.
- Create a `.env` file following the template in `.env_template`.
- Start your Docker engine and run `docker build -t weather_update .`.
- Run `docker run --env-file .env -p 8000:8000 weather_update`.
- `curl -X POST -H "Content-Type: application/json" -d '{}' http://0.0.0.0:8000/`.
- The response to the above is (see the Python sketch below for an equivalent client):

  ```json
  {
      "status_code": <status code of the POST request to the database>
  }
  ```
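
Equivalently, a small Python client, as a sketch (assuming the container is running on port 8000 as above):

```python
import requests

# An empty JSON body works; ModelRequest only accepts an optional "text" field.
resp = requests.post("http://0.0.0.0:8000/", json={})
print(resp.json())  # e.g. {"status_code": 200} on success
```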

Empty file.
24 changes: 24 additions & 0 deletions src/weather_update/local/api.py
@@ -0,0 +1,24 @@
from model import Model
from request import ModelRequest
from quart import Quart, request
import aiohttp

app = Quart(__name__)

model = None

@app.before_serving
async def startup():
    # Create one shared HTTP session and the singleton model before serving.
    app.client = aiohttp.ClientSession()
    global model
    model = Model(app)

@app.route('/', methods=['POST'])
async def update():
    # Parse the JSON body and run the parse-and-upload pipeline.
    data = await request.get_json()
    req = ModelRequest(**data)
    return model.inference(req)

if __name__ == "__main__":
    app.run()
40 changes: 40 additions & 0 deletions src/weather_update/local/content/DistrictMapping.csv
@@ -0,0 +1,40 @@
Name Mentioned,Mapped District
BALASORE,Balasore
CHANDBALI,Bhadrak
CUTTACK,Cuttack
PARADIP,Jagatsinghpur
BHUBANESWAR,Khurda
GOPALPUR,Ganjam
PURI,Puri
ANGUL,Angul
BARIPADA,Mayurbhanj
JHARSUGUDA,Jharsuguda
KEONJHARGARH,Keonjhar
SAMBALPUR,Sambalpur
SUNDARGARH,Sundargarh
HIRAKUD,Sambalpur
TALCHER,Angul
BHAWANIPATNA,Kalahandi
BOLANGIR,Bolangir
KORAPUT,Koraput
PHULABANI,Kandhamal
TITLAGARH,Bolangir
MALKANGIRI,Malkangiri
SONEPUR,Sonepur
DARINGIBADI,Kandhamal
NAYAGARH,Nayagarh
BOUDH,Boudh
CHATRAPUR,Ganjam
PARALAKHEMUNDI,Gajapati
RAYAGADA,Rayagada
BHADRAK,Bhadrak
JAJPUR,Jajpur
DHENKANAL,Dhenkanal
BARGARH,Bargarh
DEOGARH,Deogarh
NUAPADA,Nuapada
NAWARANGPUR,Nabarangapur
KHORDHA,Khurda
JAGATSINGHPUR,Jagatsinghpur
KENDRAPADA,Kendrapara
ROURKELA,Sundargarh
54 changes: 54 additions & 0 deletions src/weather_update/local/model.py
@@ -0,0 +1,54 @@
from request import ModelRequest
from utils import parse_pdfs
from datetime import date
import requests
import json
import os

secret = os.environ.get('HASURA_ADMIN_SECRET')
token = os.environ.get('AKAI_AUTH_BEARER')


class Model:
    def __new__(cls, context):
        # Singleton: reuse one Model instance across requests.
        cls.context = context
        if not hasattr(cls, 'instance'):
            cls.instance = super(Model, cls).__new__(cls)
        return cls.instance

    def inference(self, request: ModelRequest):
        # Parse the two IMD PDFs into one merged dataframe.
        df = parse_pdfs()
        json_df = df.to_dict(orient='records')

        # Fetch the last document id so new documents continue the sequence.
        url = 'https://hasura.staging.akai.samagra.io/api/rest/getlastdocumentid'
        headers = {
            'x-hasura-admin-secret': secret
        }
        response = requests.get(url, headers=headers)
        data = json.loads(response.text)
        doc_id = data['document'][0]['id']
        today = date.today()

        # Build one document per district row, tagged with today's date.
        data = []
        for item in json_df:
            doc_id += 1
            ob = {
                "id": doc_id,
                "content": str(item),
                "tags": str(today) + " Weather Update"
            }
            data.append(ob)

        url = 'https://staging.akai.samagra.io/document'
        headers = {
            'Authorization': f'bearer {token}',
            'Content-Type': 'application/json'
        }

        response = requests.post(url, headers=headers, json=data)
        return {"status_code": response.status_code}
10 changes: 10 additions & 0 deletions src/weather_update/local/request.py
@@ -0,0 +1,10 @@
import json


class ModelRequest:
    def __init__(self, text="None"):
        self.text = text

    def to_json(self):
        # Serialize the request object to pretty-printed JSON.
        return json.dumps(self, default=lambda o: o.__dict__,
                          sort_keys=True, indent=4)
9 changes: 9 additions & 0 deletions src/weather_update/local/requirements.txt
@@ -0,0 +1,9 @@
quart
aiohttp
camelot-py[cv]
ghostscript
excalibur-py
PyPDF2==2.0.0
pdfplumber
requests
106 changes: 106 additions & 0 deletions src/weather_update/local/utils.py
@@ -0,0 +1,106 @@
import camelot
import pandas as pd
import pdfplumber
import requests
from io import BytesIO

pdf1 = "https://mausam.imd.gov.in/bhubaneswar/mcdata/District.pdf"
pdf2 = "https://mausam.imd.gov.in/bhubaneswar/mcdata/1730Z.pdf"

def custom_aggregate(x):
    # Average the numeric entries in a group; if none parse as numbers,
    # fall back to the last raw value seen.
    numeric_values = []

    t = None
    for i in x:
        t = i
        try:
            numeric_values.append(float(i))
        except ValueError:
            pass

    if numeric_values:
        return sum(numeric_values) / len(numeric_values)
    else:
        return t

def parse_pdfs():
    # District.pdf: district-wise forecast table spread over five pages.
    table = camelot.read_pdf(pdf1, pages="all")

    first_page = table[0].df.copy(deep=True)

    # Camelot doesn't read rows 5-6 of the first page properly,
    # so re-extract that page with pdfplumber and patch the cells in.
    response = requests.get(pdf1)
    with pdfplumber.open(BytesIO(response.content)) as pdf:
        temp_page = pdf.pages[0]
        temp_table = temp_page.extract_table()
        temp_df = pd.DataFrame(temp_table[1:], columns=temp_table[0])
        first_page.iloc[5:7, 1:8] = temp_df.iloc[13:15, 2:]

    second_page = table[1].df.copy(deep=True)

    first_table = pd.concat([first_page, second_page], ignore_index=True)
    first_table.drop(columns=8, inplace=True)

    # Row 15 is a continuation of row 14, so merge the split cells.
    for col in first_table.columns[2:7]:
        first_table.at[14, col] = str(first_table.at[14, col]) + " \n" + str(first_table.at[15, col])

    # Append the spanning header cell onto the second header row.
    first_table.iloc[0, 3:7] = first_table.iloc[0, 2]
    first_table.iloc[1] = first_table.iloc[0] + "\n" + first_table.iloc[1]

    first_table.drop([0, 15], inplace=True)
    first_table.reset_index(drop=True, inplace=True)

    third_page = table[2].df.copy(deep=True)
    fourth_page = table[3].df.copy(deep=True)
    fifth_page = table[4].df.copy(deep=True)

    second_table = pd.concat([third_page, fourth_page, fifth_page], ignore_index=True)

    # Append the spanning header cell onto the second header row.
    second_table.iloc[0, 3:7] = second_table.iloc[0, 2]
    second_table.iloc[1] = second_table.iloc[0] + "\n" + second_table.iloc[1]
    second_table.drop(0, inplace=True)
    second_table.reset_index(drop=True, inplace=True)

    # Forward-fill district names left blank by vertically merged cells.
    for df in [first_table, second_table]:
        for i in range(2, len(df)):
            if df.loc[i, 0] == '':
                df.loc[i, 0] = df.loc[i - 1, 0]

    semi_merge = pd.concat([first_table, second_table.iloc[:, 2:]], axis=1)

    # Sort the data rows by district, keeping the header row on top.
    first_row = semi_merge.iloc[:1]
    sorted_df = semi_merge.iloc[1:].sort_values(by=1)
    semi_merge = pd.concat([first_row, sorted_df], ignore_index=True)

    # 1730Z.pdf: station-wise observations.
    table2 = camelot.read_pdf(pdf2, pages="all")
    table = table2[0].df.copy(deep=True)

    # Map station names onto their districts.
    x = pd.read_csv("./content/DistrictMapping.csv")
    district_map = dict(zip(x['Name Mentioned'], x['Mapped District']))

    table[1] = table[1].map(lambda name: district_map.get(name, name))
    table.replace('TRACE', 0.1, inplace=True)

    # Camelot misreads one rainfall cell, so re-extract it with pdfplumber.
    response = requests.get(pdf2)
    with pdfplumber.open(BytesIO(response.content)) as pdf:
        temp_page = pdf.pages[0]
        temp_table = temp_page.extract_table()
        temp_df = pd.DataFrame(temp_table[1:], columns=temp_table[0])
        table.at[1, 6] = temp_df.at[4, '24 HOURS'].replace("I", "").replace("S", "")
    table.drop(columns=0, inplace=True)

    # Several stations map to one district: average their numeric columns.
    first_row = table.iloc[:1]
    result_df = table.iloc[1:].groupby(1).agg(custom_aggregate).reset_index()
    table = pd.concat([first_row, result_df], ignore_index=True)

    merge = pd.concat([semi_merge, table.iloc[:, 1:]], axis=1)

    # Promote the first row to column headers.
    merge.columns = merge.iloc[0]
    merge = merge.drop(merge.index[0])
    merge = merge.reset_index(drop=True)

    return merge