forked from ShovonUniper/WerkStudent_Python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtask.py
More file actions
163 lines (132 loc) · 5.99 KB
/
task.py
File metadata and controls
163 lines (132 loc) · 5.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import os
import re
import sys
import pdfplumber
import pandas as pd
from datetime import datetime
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
"""Here we get the path for the pdf files considering they are in the same folder as the exe or py file.
It will get the right path depending upon if the code runs as exe file or script file."""
def get_directory_path():
if getattr(sys, 'frozen', False): # If running as an exe
return os.path.dirname(os.path.abspath(sys.executable))
else: # If running as a python script
return os.path.dirname(os.path.abspath(__file__))
# This function is used to extract the required values from the pdf
def extract_value_from_pdf(file_path, target_string):
with pdfplumber.open(file_path) as pdf:
# Extract the text from all the pages in each pdf
all_text = ""
for page in pdf.pages:
all_text += page.extract_text()
# Look for the target value
value = None
lines = all_text.split('\n')
for line in lines:
if target_string in line:
# Extract the value associated with the target string
value = line.split(target_string)[-1].strip()
break
return value
# Function to extract and normalize the date from the PDF
def extract_date_from_pdf(file_path):
with pdfplumber.open(file_path) as pdf:
all_text = ""
for page in pdf.pages:
all_text += page.extract_text()
# First, try to extract the 'Invoice date' or 'Invoice period'
lines = all_text.split('\n')
for line in lines:
# Extract 'Invoice date:' and try to get the date
if "Invoice date:" in line:
date = line.split("Invoice date:")[-1].strip()
# Normalize the date format
return normalize_date(date)
# Regex for 'Invoice period' with a date range
match = re.search(r'(\d{2}.\d{2}.\d{4})\s*-\s*(\d{2}.\d{2}.\d{4})', line)
if match:
# Return the start date from the period range
return normalize_date(match.group(1))
return None
# Function to normalize the date format to 'DD-MM-YYYY'
def normalize_date(date_str):
# Try to parse different date formats
for fmt in ("%d.%m.%Y", "%m/%d/%Y", "%Y-%m-%d", "%b %d, %Y"):
try:
date_obj = datetime.strptime(date_str, fmt)
return date_obj.strftime('%d-%m-%Y')
except ValueError:
continue
return date_str # Return the original string if no format matches
# Function to handle German decimal system and currency removal
def normalize_value(value_str):
# If the value ends with '€', we need to swap comma and period for German decimal system
if '€' in value_str:
value_str = value_str.replace('.', 'temp').replace(',', '.').replace('temp', ',')
# Remove currency symbols (USD, €, etc.) and any non-numeric characters
value_str = re.sub(r'[^\d,.-]', '', value_str) # Remove anything that is not a number or decimal separator
return value_str
# Function to process all PDF files in the directory
def process_pdf_data(directory_path):
# Get all files in the directory
pdf_files = [f for f in os.listdir(directory_path) if f.endswith('.pdf')]
data = []
# Loop through each PDF file
for pdf_file in pdf_files:
pdf_file_path = os.path.join(directory_path, pdf_file)
print(f"Processing {pdf_file}...")
# First, try to extract 'Gross Amount incl. VAT'
gross_amount = extract_value_from_pdf(pdf_file_path, "Gross Amount incl. VAT")
date = extract_date_from_pdf(pdf_file_path)
if gross_amount:
# Normalize and clean the value if it's in German decimal system
normalized_value = normalize_value(gross_amount)
data.append([pdf_file, date, normalized_value])
print(f"Gross Amount incl. VAT for {pdf_file}: {normalized_value}")
else:
# If 'Gross Amount incl. VAT' is not found, look for 'Total'
total = extract_value_from_pdf(pdf_file_path, "Total")
if total:
# Normalize and clean the value if it's in German decimal system
normalized_value = normalize_value(total)
data.append([pdf_file, date, normalized_value])
print(f"Total for {pdf_file}: {normalized_value}")
else:
print(f"Neither Gross Amount incl. VAT nor Total found in {pdf_file}")
print("-" * 40)
return pd.DataFrame(data, columns=["File Name", "Date", "Value"]) if data else None
def save_data_to_files(df, directory_path):
# Create an Excel workbook and add the data
wb = Workbook()
ws1 = wb.active
ws1.title = "Data"
# Convert DataFrame to rows and append to Sheet 1
for row in dataframe_to_rows(df, index=False, header=True):
ws1.append(row)
# Create a pivot table on Sheet 2
ws2 = wb.create_sheet(title="Pivot Table")
pivot_df = df.pivot_table(values='Value', index=['Date', 'File Name'], aggfunc='sum')
# Write Pivot DataFrame to Sheet 2
for row in dataframe_to_rows(pivot_df, index=True, header=True):
ws2.append(row)
# Save the Excel file
output_file_excel = os.path.join(directory_path, "invoice_data.xlsx")
wb.save(output_file_excel)
print(f"Excel file created: {output_file_excel}")
# Create CSV file
output_file_csv = os.path.join(directory_path, "invoice_data.csv")
df.to_csv(output_file_csv, index=False, sep=";", header=True)
print(f"CSV file created: {output_file_csv}")
def main():
# Step 1: Get the directory path where the script or executable is located
directory_path = get_directory_path()
# Step 2: Process all PDF files in the directory to extract data
df = process_pdf_data(directory_path)
# Step 3: If data is found, save it to Excel and CSV files
if df is not None:
save_data_to_files(df, directory_path)
else:
print("No data found to process.")
if __name__ == "__main__":
main()