Skip to content

Commit f18e861

Browse files
2 parents 7ad38de + 59b453d commit f18e861

File tree

8 files changed

+127
-47
lines changed

8 files changed

+127
-47
lines changed

src/app_gui.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
1-
import os, csv, asyncio, threading, subprocess, sys
1+
import asyncio
2+
import csv
3+
import os
4+
import subprocess
5+
import sys
6+
import threading
27
import tkinter as tk
3-
from tkinter import ttk, messagebox, filedialog, simpledialog
4-
from pathlib import Path
58
from datetime import datetime
9+
from pathlib import Path
10+
from tkinter import filedialog, messagebox, simpledialog, ttk
11+
612
from dotenv import load_dotenv, set_key
713

814
from src.main import run, write_excel
@@ -141,17 +147,25 @@ def build_ui():
141147
root.title("GinioCrawler")
142148
root.geometry("560x220")
143149
tk.Label(root, text="Fraza do wyszukania:").pack(anchor="w", padx=12, pady=(12, 0))
144-
entry_query = tk.Entry(root); entry_query.pack(fill="x", padx=12, pady=6); entry_query.focus()
145-
frm = tk.Frame(root); frm.pack(fill="x", padx=12, pady=(0, 6))
150+
entry_query = tk.Entry(root)
151+
entry_query.pack(fill="x", padx=12, pady=6)
152+
entry_query.focus()
153+
frm = tk.Frame(root)
154+
frm.pack(fill="x", padx=12, pady=(0, 6))
146155
tk.Label(frm, text="Folder wyjściowy:").pack(side="left")
147156
out_dir_var = tk.StringVar(value=str((Path.cwd() / "wyniki")))
148-
entry_dir = tk.Entry(frm, textvariable=out_dir_var); entry_dir.pack(side="left", fill="x", expand=True, padx=(8, 6))
157+
entry_dir = tk.Entry(frm, textvariable=out_dir_var)
158+
entry_dir.pack(side="left", fill="x", expand=True, padx=(8, 6))
149159
tk.Button(frm, text="Wybierz…", command=choose_dir).pack(side="left")
150-
btn_start = tk.Button(root, text="Start", command=start); btn_start.pack(padx=12, pady=6)
151-
prog = ttk.Progressbar(root, mode="indeterminate"); prog.pack(fill="x", padx=12, pady=(4, 8))
152-
status = tk.StringVar(value="Gotowy"); tk.Label(root, textvariable=status, anchor="w").pack(fill="x", padx=12, pady=(0, 8))
160+
btn_start = tk.Button(root, text="Start", command=start)
161+
btn_start.pack(padx=12, pady=6)
162+
prog = ttk.Progressbar(root, mode="indeterminate")
163+
prog.pack(fill="x", padx=12, pady=(4, 8))
164+
status = tk.StringVar(value="Gotowy")
165+
tk.Label(root, textvariable=status, anchor="w").pack(fill="x", padx=12, pady=(0, 8))
153166
return root
154167

168+
155169
if __name__ == "__main__":
156170
build_ui()
157171
root.mainloop()

src/main.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,18 @@
1+
import asyncio
12
import csv
3+
import os
4+
import re
5+
import sys
6+
import urllib.parse
7+
import urllib.robotparser as rp
28
from datetime import datetime
3-
import os, sys, re, asyncio, httpx, urllib.parse, urllib.robotparser as rp
4-
from bs4 import BeautifulSoup as BS
9+
from pathlib import Path
10+
11+
import httpx
512
import pandas as pd
6-
from openpyxl.styles import Font, Alignment
13+
from bs4 import BeautifulSoup as BS
14+
from openpyxl.styles import Alignment, Font
715
from openpyxl.utils import get_column_letter
8-
from pathlib import Path
916

1017
EMAIL_RE = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.I)
1118
PL_PHONE_RE = re.compile(r"(?:\+48\s?)?(?:\d{3}[\s-]?\d{3}[\s-]?\d{3})")
@@ -35,7 +42,9 @@ def write_excel(csv_path, xlsx_path):
3542
ws.auto_filter.ref = ws.dimensions
3643
for col in ws.columns:
3744
length = max(len(str(c.value)) if c.value else 0 for c in col)
38-
ws.column_dimensions[col[0].column_letter].width = min(max(12, int(length * 0.9)), 60)
45+
ws.column_dimensions[col[0].column_letter].width = min(
46+
max(12, int(length * 0.9)), 60
47+
)
3948
cols = [c for c in ("url", "contact_url") if c in df.columns]
4049
for row in range(2, ws.max_row + 1):
4150
for name in cols:
@@ -47,7 +56,6 @@ def write_excel(csv_path, xlsx_path):
4756
cell.style = "Hyperlink"
4857

4958

50-
5159
def in_robots(url: str) -> bool:
5260
base = f"{urllib.parse.urlsplit(url).scheme}://{urllib.parse.urlsplit(url).netloc}"
5361
robots = rp.RobotFileParser()
@@ -144,9 +152,12 @@ async def run(query):
144152

145153
if __name__ == "__main__":
146154
try:
147-
import os, sys, csv, asyncio
148-
from pathlib import Path
155+
import asyncio
156+
import csv
157+
import os
158+
import sys
149159
from datetime import datetime
160+
from pathlib import Path
150161

151162
q = input("Podaj szukane słowo: ").strip()
152163
if not q:

tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@
33

44
REPO_ROOT = Path(__file__).resolve().parents[1]
55
if str(REPO_ROOT) not in sys.path:
6-
sys.path.insert(0, str(REPO_ROOT))
6+
sys.path.insert(0, str(REPO_ROOT))

tests/test_crawl_run.py

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,51 @@
1-
import httpx, respx, pytest
1+
import httpx
2+
import pytest
3+
import respx
4+
25
from src.main import crawl_one, run
36

7+
48
@respx.mock
59
@pytest.mark.anyio
610
async def test_crawl_one_merges_contact(monkeypatch):
711
monkeypatch.setattr("src.main.in_robots", lambda u: True)
8-
respx.get("https://site.test/").mock(return_value=httpx.Response(200, text="""
9-
<a href="/contact">Contact</a><p>[email protected]</p>"""))
10-
respx.get("https://site.test/contact").mock(return_value=httpx.Response(200, text="""
11-
<p>[email protected] 123 456 789</p>"""))
12+
respx.get("https://site.test/").mock(
13+
return_value=httpx.Response(
14+
200,
15+
text="""
16+
<a href="/contact">Contact</a><p>[email protected]</p>""",
17+
)
18+
)
19+
respx.get("https://site.test/contact").mock(
20+
return_value=httpx.Response(
21+
200,
22+
text="""
23+
<p>[email protected] 123 456 789</p>""",
24+
)
25+
)
1226
async with httpx.AsyncClient() as c:
1327
out = await crawl_one("https://site.test/", c)
14-
assert {"[email protected]","[email protected]"} <= set(out["emails"])
28+
assert {"[email protected]", "[email protected]"} <= set(out["emails"])
29+
1530

1631
@respx.mock
1732
@pytest.mark.anyio
1833
async def test_run_full(monkeypatch):
19-
respx.get("https://serpapi.com/search").mock(return_value=httpx.Response(
20-
200, json={"organic_results":[{"link":"https://a.pl"},{"link":"https://b.pl"}]}
21-
))
34+
respx.get("https://serpapi.com/search").mock(
35+
return_value=httpx.Response(
36+
200,
37+
json={
38+
"organic_results": [{"link": "https://a.pl"}, {"link": "https://b.pl"}]
39+
},
40+
)
41+
)
2242
monkeypatch.setattr("src.main.in_robots", lambda u: True)
23-
respx.get("https://a.pl").mock(return_value=httpx.Response(200, text="<title>A</title>"))
24-
respx.get("https://b.pl").mock(return_value=httpx.Response(200, text="<title>B</title>"))
25-
monkeypatch.setenv("SERPAPI_KEY","x")
43+
respx.get("https://a.pl").mock(
44+
return_value=httpx.Response(200, text="<title>A</title>")
45+
)
46+
respx.get("https://b.pl").mock(
47+
return_value=httpx.Response(200, text="<title>B</title>")
48+
)
49+
monkeypatch.setenv("SERPAPI_KEY", "x")
2650
out = await run("foo")
27-
assert {r["title"] for r in out} == {"A","B"}
51+
assert {r["title"] for r in out} == {"A", "B"}

tests/test_excel.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,34 @@
11
from pathlib import Path
2+
23
import pandas as pd
4+
35
from src.app_gui import save_results
46
from src.main import write_excel as real_write_excel
57

8+
69
def test_save_results_creates_files(tmp_path, monkeypatch):
710
root_out = tmp_path / "wyniki"
8-
data = [{"url":"https://x","title":"X","emails":["a@x"],"phones":["123"],"contact_url":None}]
11+
data = [
12+
{
13+
"url": "https://x",
14+
"title": "X",
15+
"emails": ["a@x"],
16+
"phones": ["123"],
17+
"contact_url": None,
18+
}
19+
]
920

1021
called = {}
22+
1123
def fake_write(csv_path, xlsx_path):
12-
called["csv"] = csv_path; called["xlsx"] = xlsx_path
24+
called["csv"] = csv_path
25+
called["xlsx"] = xlsx_path
1326
return real_write_excel(csv_path, xlsx_path)
1427

1528
monkeypatch.setattr("src.app_gui.write_excel", fake_write)
1629

1730
csv_p, xlsx_p = save_results(data, "20250101_000000", root_out)
1831
assert Path(csv_p).exists() and Path(xlsx_p).exists()
1932
df = pd.read_excel(xlsx_p)
20-
assert {"url","title","emails","phones","contact_url"}.issubset(df.columns)
33+
assert {"url", "title", "emails", "phones", "contact_url"}.issubset(df.columns)
2134
assert called["csv"] == str(csv_p) and called["xlsx"] == str(xlsx_p)

tests/test_gui.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,49 @@
11
from unittest.mock import patch
22

3+
34
def test_ensure_api_key_env(monkeypatch):
4-
monkeypatch.setenv("SERPAPI_KEY","secret")
5+
monkeypatch.setenv("SERPAPI_KEY", "secret")
56
from src.app_gui import ensure_api_key
7+
68
assert ensure_api_key() == "secret"
79

10+
811
def test_ensure_api_key_prompt(tmp_path, monkeypatch):
912
monkeypatch.delenv("SERPAPI_KEY", raising=False)
1013
monkeypatch.setenv("APPDATA", str(tmp_path))
1114
from src.app_gui import ensure_api_key
15+
1216
with patch("src.app_gui.simpledialog.askstring", return_value="abc"):
1317
assert ensure_api_key() == "abc"
1418

19+
1520
def test_start_calls_run_without_threading(monkeypatch):
1621
import src.app_gui as g
22+
1723
g.build_ui()
18-
g.entry_query.delete(0,'end'); g.entry_query.insert(0,"kawa")
19-
monkeypatch.setenv("SERPAPI_KEY","x")
24+
g.entry_query.delete(0, "end")
25+
g.entry_query.insert(0, "kawa")
26+
monkeypatch.setenv("SERPAPI_KEY", "x")
2027

2128
class DummyThread:
22-
def __init__(self, target, daemon): self.target = target
23-
def start(self): self.target()
29+
def __init__(self, target, daemon):
30+
self.target = target
31+
32+
def start(self):
33+
self.target()
2434

2535
monkeypatch.setattr("src.app_gui.threading.Thread", DummyThread)
2636
monkeypatch.setattr(g.root, "after", lambda ms, fn: fn(), raising=False)
2737

2838
async def fake_run(q):
29-
return [{"url":"u","title":"t","emails":[],"phones":[],"contact_url":None}]
39+
return [
40+
{"url": "u", "title": "t", "emails": [], "phones": [], "contact_url": None}
41+
]
42+
3043
monkeypatch.setattr("src.app_gui.run", fake_run)
3144

3245
with patch.object(g, "messagebox"):
3346
g.start()
3447
assert g.btn_start["state"] == "normal"
3548
assert "OK — zapisano" in g.status.get()
36-
try:
37-
g.root.destroy()
38-
except Exception:
39-
pass
4049

tests/test_net.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,22 @@
1-
import httpx, respx, pytest
1+
import httpx
2+
import pytest
3+
import respx
4+
25
from src.main import fetch
36

7+
48
@respx.mock
59
@pytest.mark.anyio
610
async def test_fetch_ok(monkeypatch):
711
monkeypatch.setattr("src.main.in_robots", lambda u: True)
8-
respx.get("https://x.test/ok").mock(return_value=httpx.Response(200, text="<h1>ok</h1>"))
12+
respx.get("https://x.test/ok").mock(
13+
return_value=httpx.Response(200, text="<h1>ok</h1>")
14+
)
915
async with httpx.AsyncClient() as c:
1016
html = await fetch("https://x.test/ok", c)
1117
assert "ok" in html
1218

19+
1320
@respx.mock
1421
@pytest.mark.anyio
1522
async def test_fetch_respects_robots(monkeypatch):

tests/test_parse.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1-
from src.main import parse_info, absolutize
1+
from src.main import absolutize, parse_info
2+
23

34
def test_absolutize():
45
assert absolutize("https://ex.com/dir/", "../a") == "https://ex.com/a"
56

7+
68
def test_parse_info_extracts_contact():
79
html = """
810
<html><head><title> ACME </title></head>

0 commit comments

Comments
 (0)