2 changes: 1 addition & 1 deletion .gitignore
@@ -165,6 +165,6 @@ cython_debug/
.idea/

repository_data.private.json
*.csv
# *.csv

google-creds.json
14 changes: 13 additions & 1 deletion config.json
@@ -9,7 +9,7 @@
"nginx": [],
"build": true
},
{
{
"serviceName": "word_score",
"modelBasePath": "src/search/word_score/local/.",
"apiBasePath": "/search/word_score/local/",
@@ -18,6 +18,18 @@
"nginx": [],
"build": true
},
{
"serviceName": "weather_update",
"modelBasePath": "src/weather_update/local/.",
"apiBasePath": "/weather_update/local/",
"containerPort": 8000,
"environment": {
"HASURA_ADMIN_SECRET": "${HASURA_ADMIN_SECRET}",
"AKAI_AUTH_BEARER": "${AKAI_AUTH_BEARER}"
},
"nginx": [],
"build": true
},
{
"serviceName": "spell_check",
"modelBasePath": "src/spell_check/kenlm/local/.",
8 changes: 8 additions & 0 deletions repository_data.json
@@ -130,6 +130,14 @@
"request_class": "ModelRequest"
}
}
},
"weather_update": {
"local": {
"__is_async": false,
"__is_base": true,
"model_class": "Model",
"request_class": "ModelRequest"
}
}
}
}
7 changes: 7 additions & 0 deletions src/weather_update/README.md
@@ -0,0 +1,7 @@
# Weather Update

Weather information PDFs:
- https://mausam.imd.gov.in/bhubaneswar/mcdata/1730Z.pdf
- https://mausam.imd.gov.in/bhubaneswar/mcdata/District.pdf

This folder parses the tables in these PDFs, builds a merged dataframe, and updates the database.
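
A minimal sketch of running the parser by hand (assuming the packages in `local/requirements.txt` are installed and the working directory is `src/weather_update/local`, so the relative `./content/DistrictMapping.csv` path resolves):

```python
# Hypothetical manual run; parse_pdfs() lives in local/utils.py.
from utils import parse_pdfs

df = parse_pdfs()   # downloads both IMD PDFs and merges their tables
print(df.head())    # one row per district with the merged weather columns
```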
1 change: 1 addition & 0 deletions src/weather_update/__init__.py
@@ -0,0 +1 @@
from .local import *
2 changes: 2 additions & 0 deletions src/weather_update/local/.env_template
@@ -0,0 +1,2 @@
HASURA_ADMIN_SECRET=
AKAI_AUTH_BEARER=
18 changes: 18 additions & 0 deletions src/weather_update/local/Dockerfile
@@ -0,0 +1,18 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim

WORKDIR /app


# Install Python requirements
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

# System packages for camelot: OpenCV runtime libs, Ghostscript, and Tk
RUN apt-get update && apt-get install -y libgl1-mesa-glx libglib2.0-0
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y ghostscript python3-tk


# Copy the rest of the application code to the working directory
COPY . /app/
EXPOSE 8000
# Set the entrypoint for the container
CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
17 changes: 17 additions & 0 deletions src/weather_update/local/README.md
@@ -0,0 +1,17 @@
# Weather Update

## Test Deployment

- Git clone the repo and cd to the project root.
- cd to `local`, i.e., `cd ./src/weather_update/local`.
- Create a `.env` file following the template in `.env_template`.
- Start your Docker engine and run `docker build -t weather_update .`.
- Run `docker run --env-file .env -p 8000:8000 weather_update`.
- `curl -X POST -H "Content-Type: application/json" -d '{}' http://0.0.0.0:8000/`.
- The response to the above is (see the Python sketch below for an equivalent client):

  ```json
  {
      "status_code": <status code of the POST request to the database>
  }
  ```
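
Equivalently, a small Python client, as a sketch (assuming the container is running on port 8000 as above):

```python
import requests

# An empty JSON body works; ModelRequest only accepts an optional "text" field.
resp = requests.post("http://0.0.0.0:8000/", json={})
print(resp.json())  # e.g. {"status_code": 200} on success
```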

Empty file.
24 changes: 24 additions & 0 deletions src/weather_update/local/api.py
@@ -0,0 +1,24 @@
from model import Model
from request import ModelRequest
from quart import Quart, request
import aiohttp

app = Quart(__name__)

model = None

@app.before_serving
async def startup():
    # Create one shared HTTP session and the singleton model before serving.
    app.client = aiohttp.ClientSession()
    global model
    model = Model(app)

@app.route('/', methods=['POST'])
async def update():
    # Parse the JSON body and run the parse-and-upload pipeline.
    data = await request.get_json()
    req = ModelRequest(**data)
    return model.inference(req)

if __name__ == "__main__":
    app.run()
40 changes: 40 additions & 0 deletions src/weather_update/local/content/DistrictMapping.csv
@@ -0,0 +1,40 @@
Name Mentioned,Mapped District
BALASORE,Balasore
CHANDBALI,Bhadrak
CUTTACK,Cuttack
PARADIP,Jagatsinghpur
BHUBANESWAR,Khurda
GOPALPUR,Ganjam
PURI,Puri
ANGUL,Angul
BARIPADA,Mayurbhanj
JHARSUGUDA,Jharsuguda
KEONJHARGARH,Keonjhar
SAMBALPUR,Sambalpur
SUNDARGARH,Sundargarh
HIRAKUD,Sambalpur
TALCHER,Angul
BHAWANIPATNA,Kalahandi
BOLANGIR,Bolangir
KORAPUT,Koraput
PHULABANI,Kandhamal
TITLAGARH,Bolangir
MALKANGIRI,Malkangiri
SONEPUR,Sonepur
DARINGIBADI,Kandhamal
NAYAGARH,Nayagarh
BOUDH,Boudh
CHATRAPUR,Ganjam
PARALAKHEMUNDI,Gajapati
RAYAGADA,Rayagada
BHADRAK,Bhadrak
JAJPUR,Jajpur
DHENKANAL,Dhenkanal
BARGARH,Bargarh
DEOGARH,Deogarh
NUAPADA,Nuapada
NAWARANGPUR,Nabarangapur
KHORDHA,Khurda
JAGATSINGHPUR,Jagatsinghpur
KENDRAPADA,Kendrapara
ROURKELA,Sundargarh
54 changes: 54 additions & 0 deletions src/weather_update/local/model.py
@@ -0,0 +1,54 @@
from request import ModelRequest
from utils import parse_pdfs
from datetime import date
import requests
import json
import os

secret = os.environ.get('HASURA_ADMIN_SECRET')
token = os.environ.get('AKAI_AUTH_BEARER')


class Model:
    def __new__(cls, context):
        # Singleton: reuse one Model instance across requests.
        cls.context = context
        if not hasattr(cls, 'instance'):
            cls.instance = super(Model, cls).__new__(cls)
        return cls.instance

    def inference(self, request: ModelRequest):
        # Parse the two IMD PDFs into one merged dataframe.
        df = parse_pdfs()
        json_df = df.to_dict(orient='records')

        # Fetch the last document id so new documents continue the sequence.
        url = 'https://hasura.staging.akai.samagra.io/api/rest/getlastdocumentid'
        headers = {
            'x-hasura-admin-secret': secret
        }
        response = requests.get(url, headers=headers)
        data = json.loads(response.text)
        doc_id = data['document'][0]['id']
        today = date.today()

        # Build one document per district row, tagged with today's date.
        data = []
        for item in json_df:
            doc_id += 1
            ob = {
                "id": doc_id,
                "content": str(item),
                "tags": str(today) + " Weather Update"
            }
            data.append(ob)

        url = 'https://staging.akai.samagra.io/document'
        headers = {
            'Authorization': f'bearer {token}',
            'Content-Type': 'application/json'
        }

        response = requests.post(url, headers=headers, json=data)
        return {"status_code": response.status_code}
10 changes: 10 additions & 0 deletions src/weather_update/local/request.py
@@ -0,0 +1,10 @@
import json


class ModelRequest:
    def __init__(self, text="None"):
        self.text = text

    def to_json(self):
        # Serialize the request object to pretty-printed JSON.
        return json.dumps(self, default=lambda o: o.__dict__,
                          sort_keys=True, indent=4)
9 changes: 9 additions & 0 deletions src/weather_update/local/requirements.txt
@@ -0,0 +1,9 @@
quart
aiohttp
camelot-py[cv]
ghostscript
excalibur-py
PyPDF2==2.0.0
pdfplumber
requests
106 changes: 106 additions & 0 deletions src/weather_update/local/utils.py
@@ -0,0 +1,106 @@
import camelot
import pandas as pd
import pdfplumber
import requests
from io import BytesIO

pdf1 = "https://mausam.imd.gov.in/bhubaneswar/mcdata/District.pdf"
pdf2 = "https://mausam.imd.gov.in/bhubaneswar/mcdata/1730Z.pdf"

def custom_aggregate(x):
    # Average the numeric entries in a group; if none parse as numbers,
    # fall back to the last raw value seen.
    numeric_values = []

    t = None
    for i in x:
        t = i
        try:
            numeric_values.append(float(i))
        except ValueError:
            pass

    if numeric_values:
        return sum(numeric_values) / len(numeric_values)
    else:
        return t

def parse_pdfs():
    # District.pdf: district-wise forecast table spread over five pages.
    table = camelot.read_pdf(pdf1, pages="all")

    first_page = table[0].df.copy(deep=True)

    # Camelot doesn't read rows 5-6 of the first page properly,
    # so re-extract that page with pdfplumber and patch the cells in.
    response = requests.get(pdf1)
    with pdfplumber.open(BytesIO(response.content)) as pdf:
        temp_page = pdf.pages[0]
        temp_table = temp_page.extract_table()
        temp_df = pd.DataFrame(temp_table[1:], columns=temp_table[0])
        first_page.iloc[5:7, 1:8] = temp_df.iloc[13:15, 2:]

    second_page = table[1].df.copy(deep=True)

    first_table = pd.concat([first_page, second_page], ignore_index=True)
    first_table.drop(columns=8, inplace=True)

    # Row 15 is a continuation of row 14, so merge the split cells.
    for col in first_table.columns[2:7]:
        first_table.at[14, col] = str(first_table.at[14, col]) + " \n" + str(first_table.at[15, col])

    # Append the spanning header cell onto the second header row.
    first_table.iloc[0, 3:7] = first_table.iloc[0, 2]
    first_table.iloc[1] = first_table.iloc[0] + "\n" + first_table.iloc[1]

    first_table.drop([0, 15], inplace=True)
    first_table.reset_index(drop=True, inplace=True)

    third_page = table[2].df.copy(deep=True)
    fourth_page = table[3].df.copy(deep=True)
    fifth_page = table[4].df.copy(deep=True)

    second_table = pd.concat([third_page, fourth_page, fifth_page], ignore_index=True)

    # Append the spanning header cell onto the second header row.
    second_table.iloc[0, 3:7] = second_table.iloc[0, 2]
    second_table.iloc[1] = second_table.iloc[0] + "\n" + second_table.iloc[1]
    second_table.drop(0, inplace=True)
    second_table.reset_index(drop=True, inplace=True)

    # Forward-fill district names left blank by vertically merged cells.
    for df in [first_table, second_table]:
        for i in range(2, len(df)):
            if df.loc[i, 0] == '':
                df.loc[i, 0] = df.loc[i - 1, 0]

    semi_merge = pd.concat([first_table, second_table.iloc[:, 2:]], axis=1)

    # Sort the data rows by district, keeping the header row on top.
    first_row = semi_merge.iloc[:1]
    sorted_df = semi_merge.iloc[1:].sort_values(by=1)
    semi_merge = pd.concat([first_row, sorted_df], ignore_index=True)

    # 1730Z.pdf: station-wise observations.
    table2 = camelot.read_pdf(pdf2, pages="all")
    table = table2[0].df.copy(deep=True)

    # Map station names onto their districts.
    x = pd.read_csv("./content/DistrictMapping.csv")
    district_map = dict(zip(x['Name Mentioned'], x['Mapped District']))

    table[1] = table[1].map(lambda name: district_map.get(name, name))
    table.replace('TRACE', 0.1, inplace=True)

    # Camelot misreads one rainfall cell, so re-extract it with pdfplumber.
    response = requests.get(pdf2)
    with pdfplumber.open(BytesIO(response.content)) as pdf:
        temp_page = pdf.pages[0]
        temp_table = temp_page.extract_table()
        temp_df = pd.DataFrame(temp_table[1:], columns=temp_table[0])
        table.at[1, 6] = temp_df.at[4, '24 HOURS'].replace("I", "").replace("S", "")
    table.drop(columns=0, inplace=True)

    # Several stations map to one district: average their numeric columns.
    first_row = table.iloc[:1]
    result_df = table.iloc[1:].groupby(1).agg(custom_aggregate).reset_index()
    table = pd.concat([first_row, result_df], ignore_index=True)

    merge = pd.concat([semi_merge, table.iloc[:, 1:]], axis=1)

    # Promote the first row to column headers.
    merge.columns = merge.iloc[0]
    merge = merge.drop(merge.index[0])
    merge = merge.reset_index(drop=True)

    return merge