main.py
import re

from formatter import Formatter
from page_finder import PageFinder
from soup_statement_generator import SoupStatementGenerator


def main():
    # Example extraction tasks; uncomment a call to run it.
    # get_info_from_url("https://www.kununu.com/at", "Get the top Companies, based on the kununu score.", '[{{"CompanyName": <CompanyName>}}]')
    # get_info_from_url("https://www.kununu.com/at", "Get the kununu score of the FH Oberösterreich.", "Company,Score\nInsertCompanyName,InsertScore")
    # get_info_from_url("https://orf.at/", "Get the last 5 articles (title + content) for upper austria", '[{{"Title": <Title>, "Content": <Content>}}]')
    # get_info_from_url("https://orf.at/", "Get the links to news articles", '[{{"Link": <Link>}}]')
    # get_info_from_url("https://www.reddit.com", "Gib mir 10 Einträge (Titel + Inhalt) zur FH Oberösterreich.", '[{{"Title": <Title>, "Content": <Content>}}]')
    # German prompt: "Give me the social media accounts of the FH Oberösterreich."
    get_info_from_url("https://fh-ooe.at/", "Gib mir Social Media Accounts der FH Oberösterreich.", "Accounts: 'Url1', 'Url2', ...")
    # get_info_from_url("https://www.kununu.com/at", "Get all Companies with their kununu score.", '[{{"CompanyName": <CompanyName>, "Score": <Score>}}]')


def get_urls(urls: str) -> list:
    # Pull every single- or double-quoted substring (the URLs) out of the model's answer,
    # e.g. "'https://a.example', 'https://b.example'" -> ['https://a.example', 'https://b.example'].
    return re.findall(r'(?:["\'])([^"\']+)(?:["\'])', urls)


def get_info_from_url(url: str, question: str, target_format: str):
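    # Step 1: PageFinder - find the URLs on the start page that should contain the answer to the question.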
print("++++++++++++++PageFinder++++++++++++++++++++")
page_finder = PageFinder()
page_finder.persist_web_page(url)
needed_urls = page_finder.run(question)
print(needed_urls)
extracted_urls = get_urls(needed_urls)
print(f"Extracted urls: {extracted_urls}")
if len(extracted_urls) == 0:
print(f"No urls found. Are you sure the needed information can be found on the provided website: {url}?")
return
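
    # Step 2: SoupStatementGenerator - generate a soup statement for the relevant pages and apply it to extract the content.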
print("++++++++++++++++SoupStatementGenerator++++++++++++++++++")
general_scraper = SoupStatementGenerator()
general_scraper.persist_multiple_web_page(extracted_urls)
statement = general_scraper.run(question)
print(statement)
content =general_scraper.apply_soup_statement(statement)
print(content[:1000])
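
    # Step 3: Formatter - generate code that converts the extracted content into the target format, then apply it.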
print("++++++++++++++++Formatter++++++++++++++++++")
formatter = Formatter()
code = formatter.run(content,target_format)
print(code)
print(formatter.apply_code(code,content)[:1000])


if __name__ == "__main__":
    main()