Skip to content

Commit 3b4c110

Browse files
committed
feat: add PageLLMProfileNode and all-LLM capsule-grounded pipeline
1 parent 077e781 commit 3b4c110

19 files changed

+374
-96
lines changed
316 Bytes
Binary file not shown.

api/config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
"""Configuration constants for OpenAI integration."""
import os

# Model used by all LLM nodes; override per-deployment with the
# OPENAI_MODEL environment variable.
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4.1-mini-2025-04-14")

api/main.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from api.nodes.rank_node import RankNode
2222
from api.nodes.guide_node import GuideNode
2323
from api.nodes.pdf_builder_node import PdfBuilderNode
24+
from api.nodes.page_llm_profile_node import PageLLMProfileNode
2425
from api.nodes.new_pipeline.pipeline import Generate10Pipeline
2526
from api.nodes.new_pipeline.web_fetch_node import WebFetchNode
2627
from api.nodes.new_pipeline.local_fetch_node import LocalFetchNode
@@ -61,13 +62,23 @@ async def generate(request: Request):
6162
if not url:
6263
raise HTTPException(status_code=400, detail="Missing 'url' in request body")
6364
try:
64-
# Pipeline: fetch, summarise, assets, prompts, ranking, tips, PDF
65-
raw_text = FetchSummaryNode(url)
66-
master_prompt = SummariseNode(raw_text)
67-
assets = AssetsNode(url)
68-
groups = PromptsNode(master_prompt, assets.get('palette', []))
69-
bests = RankNode(groups)
65+
# Pipeline: one-shot profile, prompts, ranking, tips, assets, PDF
66+
profile = PageLLMProfileNode(url)
67+
framework_plan = {
68+
"key_phrases": profile["keywords"],
69+
"sector": profile["sector"],
70+
"services": profile["services"],
71+
"geo": profile["geo"],
72+
"brand_tone": profile["brand_tone"],
73+
}
74+
prompts = PromptDraftNode(
75+
text=" ".join(profile["value_props"]),
76+
framework_plan=framework_plan,
77+
)
78+
bests = RankNode(prompts)
7079
tips = GuideNode(bests)
80+
# branding assets
81+
assets = AssetsNode(url)
7182
pdf_bytes = PdfBuilderNode(
7283
assets.get('logo_url'),
7384
assets.get('palette', []),
Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
import logging
import re
import collections

from api.nodes.fetch_summary_node import Node


@Node(retries=1)
def BusinessAnchorGuard(prompts, keyphrases_or_capsule):
    """
    Dual-mode anchoring:
    - Legacy mode: prompts=list[str], keyphrases_or_capsule=list[str]
      => keep prompts containing at least one key-phrase (case-insensitive);
      if no key-phrases were extracted, all prompts pass through.
    - New mode: prompts=dict[str, list[str]], keyphrases_or_capsule=capsule(str)
      => keep prompts that share >= 3 of the capsule's top nouns; a category
      whose prompts all fail keeps its original list (never emptied).
    Any other argument combination is returned unchanged.
    """
    logger = logging.getLogger(__name__)

    # Legacy mode: flat prompt list filtered by scraped key-phrases.
    if isinstance(prompts, list) and isinstance(keyphrases_or_capsule, list):
        keyphrases = keyphrases_or_capsule
        if not keyphrases:
            # Nothing to anchor against: pass everything through.
            return prompts
        lowered = [kp.lower() for kp in keyphrases]
        return [p for p in prompts if any(phrase in p.lower() for phrase in lowered)]

    # New mode: category dict filtered by frequent capsule nouns.
    if isinstance(prompts, dict) and isinstance(keyphrases_or_capsule, str):
        capsule = keyphrases_or_capsule
        # Words of length ≥4 approximate the capsule's content nouns.
        nouns = re.findall(r"\b[A-Za-z]{4,}\b", capsule.lower())
        top_nouns = [w for w, _ in collections.Counter(nouns).most_common(20)]
        # Match whole words only: plain substring tests false-match inside
        # longer words (e.g. "care" would match "scared"), inflating overlap.
        patterns = [re.compile(r"\b" + re.escape(noun) + r"\b") for noun in top_nouns]
        anchored = {}
        for cat, items in prompts.items():
            filtered = []
            for p in items:
                lp = p.lower()
                overlap = sum(1 for pat in patterns if pat.search(lp))
                if overlap >= 3:
                    filtered.append(p)
            # Fallback: never return an empty category.
            anchored[cat] = filtered or items
        return anchored

    # Fallback: unexpected argument shapes pass through untouched.
    logger.info("BusinessAnchorGuard: unexpected args, returning original prompts")
    return prompts
Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
from api.nodes.fetch_summary_node import Node
from api.nodes.new_pipeline.prompt_draft_node import QUOTAS


@Node(retries=1)
def FrameworkSelectNode(keyphrases: list[str]) -> dict:
    """
    Determine the frameworks mix (RTF, RISEN, CRISPE) per PRD category quotas.

    Parameters:
        keyphrases: retained for interface compatibility; unused now that
            quotas are static.

    Returns a plan dict, e.g., {'Marketing': 3, 'Sales': 2, ...}.
    """
    # Return a fresh copy: callers historically mutate the plan in place
    # (e.g. plan["key_phrases"] = ...), and handing out QUOTAS by reference
    # would silently corrupt the shared module-level dict for later calls.
    return dict(QUOTAS)
Lines changed: 11 additions & 21 deletions
Original file line numberDiff line numberDiff line change
"""
Legacy keyphrase extraction node retained for backward compatibility.
"""
import logging

from api.nodes.fetch_summary_node import Node

# Module-level logger, per stdlib logging convention.
logger = logging.getLogger(__name__)


@Node(retries=1)
def KeyphraseNode(text: str) -> list[str]:
    """
    Extract key-phrases from the given text.
    Legacy stub: returns an empty list; override via pipeline node or tests.

    The frequency-based extraction that used to live here was replaced by
    the capsule-driven pipeline; this stub only preserves the import path
    and call signature for older callers.
    """
    logger.warning("KeyphraseNode is deprecated; returning empty list.")
    return []
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
"""
Node to condense raw HTML into a business context capsule via LLM.
"""
import functools
import logging
import openai

from api.nodes.fetch_summary_node import Node
from api.config import OPENAI_MODEL

logger = logging.getLogger(__name__)

# Cap forwarded HTML so the request stays within the model's context window.
_MAX_HTML_CHARS = 14000


# NOTE(review): lru_cache wraps the Node retry wrapper and keys on the full
# HTML string, so up to 128 large pages stay pinned in memory — confirm this
# footprint is acceptable.
@functools.lru_cache(maxsize=128)
@Node(retries=2)
def MiniMasterPromptNode(html: str) -> str:
    """
    Condense supplied HTML into a ≤350-word Business Context Capsule.

    Parameters:
        html: raw page HTML; only the first 14 000 characters are sent.

    Returns:
        The capsule as stripped plain text.

    Raises:
        ValueError: if the model returns an empty/None completion.
        Any OpenAI client error is logged and re-raised (retried by @Node).
    """
    client = openai.OpenAI()
    try:
        resp = client.chat.completions.create(
            model=OPENAI_MODEL,
            response_format={"type": "text"},
            temperature=0.3,
            max_tokens=500,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "Condense the supplied HTML into a ≤350-word Business Context Capsule "
                        "(USPs, services, tone, geo, benefits). Return plain text."
                    ),
                },
                {"role": "user", "content": html[:_MAX_HTML_CHARS]},
            ],
        )
        content = resp.choices[0].message.content
    except Exception as e:
        logger.error("MiniMasterPromptNode LLM error: %s", e)
        raise
    if content is None:
        # message.content can be None (e.g. refusals); fail explicitly
        # instead of letting `.strip()` raise AttributeError that would be
        # mislabeled as an LLM transport error.
        raise ValueError("MiniMasterPromptNode: empty completion content")
    return content.strip()
api/nodes/new_pipeline/pipeline.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from api.nodes.new_pipeline.web_fetch_node import WebFetchNode
88
from api.nodes.new_pipeline.local_fetch_node import LocalFetchNode
99
from api.nodes.new_pipeline.clean_node import CleanNode
10-
from api.nodes.new_pipeline.keyphrase_node import KeyphraseNode
10+
from api.nodes.new_pipeline.mini_master_prompt_node import MiniMasterPromptNode
1111
from api.nodes.new_pipeline.framework_select_node import FrameworkSelectNode
1212
from api.nodes.new_pipeline.prompt_draft_node import PromptDraftNode
1313
from api.nodes.new_pipeline.deduplicate_node import DeduplicateNode
@@ -41,22 +41,20 @@ def Generate10Pipeline(url: str) -> bytes:
4141
# Step 2: clean
4242
text = CleanNode(html)
4343
logger.info("CleanNode output text length=%d", len(text))
44-
# Step 3: keyphrases
45-
keyphrases = KeyphraseNode(text)
46-
logger.info("KeyphraseNode output: %r", keyphrases)
47-
# Step 4: framework plan
48-
plan = FrameworkSelectNode(keyphrases)
44+
# Step 3: generate business capsule
45+
capsule = MiniMasterPromptNode(html)
46+
logger.info("MiniMasterPromptNode capsule length=%d", len(capsule))
47+
# Step 4: framework plan (static quotas)
48+
plan = FrameworkSelectNode([])
4949
logger.info("FrameworkSelectNode plan: %r", plan)
50-
plan["key_phrases"] = keyphrases
51-
# Step 5: draft prompts
52-
raw_prompts = PromptDraftNode(text, plan)
50+
# Step 5: draft prompts based on capsule
51+
raw_prompts = PromptDraftNode(capsule, plan)
5352
logger.info("PromptDraftNode output: %r", raw_prompts)
5453
# Step 6: dedupe
5554
unique_prompts = DeduplicateNode(raw_prompts)
5655
logger.info("DeduplicateNode output: %r", unique_prompts)
57-
# Step 7: business anchor
58-
# BusinessAnchorGuard may filter within categories or lists
59-
anchored_prompts = BusinessAnchorGuard(unique_prompts, keyphrases)
56+
# Step 7: business anchor using capsule nouns
57+
anchored_prompts = BusinessAnchorGuard(unique_prompts, capsule)
6058
logger.info("BusinessAnchorGuard output: %r", anchored_prompts)
6159
# Step 8: enforce quota
6260
final_prompts = QuotaEnforceNode(anchored_prompts, plan)

api/nodes/new_pipeline/prompt_draft_node.py

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
"""
Prompt drafting: turns a business capsule plus framework plan into
category-grouped prompt drafts via the OpenAI chat API.
"""
import logging
import json
from api.nodes.fetch_summary_node import Node
from api.config import OPENAI_MODEL

# NOTE(review): calling basicConfig at import time in a library module
# overrides the host application's logging setup; libraries normally only
# create a logger — confirm this global side effect is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Prompt quotas per category (total 9 slots)
QUOTAS = {"Marketing": 3, "Sales": 2, "Product": 2, "Success": 1, "Ops": 1}
@Node(retries=3)
713
def PromptDraftNode(text: str, framework_plan: dict) -> dict[str, list[str]]:
814
"""
915
Draft 10-25 raw prompts with explicit constraints and strong business anchoring.
1016
`framework_plan` **must** contain "key_phrases": list[str].
1117
"""
12-
import json, hashlib, logging, openai
13-
14-
logger = logging.getLogger(__name__)
15-
key_phrases: list[str] = framework_plan.get("key_phrases", [])
16-
min_phrases_required = 2 if key_phrases else 1 # fallback if none
18+
import openai, hashlib
1719

20+
# Framework plan may include quotas and capsule context
1821
client = openai.OpenAI()
1922

23+
# System prompt for generating prompt packs grounded in business capsule
2024
system_msg = {
2125
"role": "system",
2226
"content": (
23-
"Draft 10-25 AI prompts grouped by business function. "
24-
"Return ONLY valid JSON shaped as:\n"
25-
"{\n"
26-
" \"Marketing\": [\"You are a ...\", ...],\n"
27-
" \"Sales\": [...],\n"
28-
" \"Success\": [...],\n"
29-
" \"Product\": [...],\n"
30-
" \"Ops\": [...]\n"
31-
"}\n"
32-
f"Rules: • each prompt begins with \"You are a ...\" • min {min_phrases_required} key-phrases "
33-
"• ≤220 tokens • quotas: Marketing 3, Sales 2, Success 2, Product 2, Ops 1."
27+
"You are a Prompt-Pack Generator. Given a Business Context Capsule and a framework plan, "
28+
"generate 10–25 high-quality prompts grouped by business function. "
29+
"Return ONLY valid JSON mapping categories to arrays of prompt strings. "
30+
"Prompts must be no more than 220 tokens each. "
31+
f"Quotas per category: {QUOTAS}."
3432
)
3533
}
3634

3735
user_msg = {
3836
"role": "user",
3937
"content": (
40-
f"<business_text>{text}</business_text>\n"
41-
f"<key_phrases>{', '.join(key_phrases)}</key_phrases>\n"
38+
f"<capsule>{text}</capsule>\n"
4239
f"<framework_plan>{json.dumps(framework_plan, ensure_ascii=False)}</framework_plan>"
4340
),
4441
}
4542

43+
# Use deterministic seed for repeatability
4644
seed_val = int(
47-
hashlib.sha256((text + 'gpt-4.1-mini-2025-04-14').encode()).hexdigest(), 16
45+
hashlib.sha256((text + OPENAI_MODEL).encode()).hexdigest(), 16
4846
) % 2**31
4947

5048
resp = client.chat.completions.create(
51-
model="gpt-4.1-mini-2025-04-14",
49+
model=OPENAI_MODEL,
5250
messages=[system_msg, user_msg],
53-
temperature=0.0, # deterministic
51+
temperature=0.35,
5452
seed=seed_val,
5553
)
5654

api/nodes/page_llm_profile_node.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
"""
Node that uses an LLM to fetch and profile a webpage in one shot.
Returns structured JSON with business metadata.
"""
import json
import logging
import openai

from api.nodes.fetch_summary_node import Node
from api.config import OPENAI_MODEL

logger = logging.getLogger(__name__)


@Node(retries=2)
def PageLLMProfileNode(url: str) -> dict:
    """
    One-shot profile of a business homepage via LLM.

    Parameters:
        url: homepage URL the model should browse via its web_search tool.

    Returns:
        dict with keys name, sector, geo, brand_tone (strings) and
        services, value_props, keywords (lists); missing or unparseable
        fields fall back to empty string/list.

    Raises:
        Propagates any OpenAI client error (retried by @Node).
    """
    client = openai.OpenAI()
    try:
        # NOTE(review): chat.completions does not document a "web_search"
        # tool type or a "tool" key on user messages (web search is a
        # built-in tool of the Responses API) — confirm this request shape
        # works with the deployed SDK version.
        resp = client.chat.completions.create(
            model=OPENAI_MODEL,
            tools=[{"type": "web_search"}],
            response_format={"type": "json_object"},
            temperature=0.3,
            max_tokens=400,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are BizScraper-AI. Use web_search to visit the URL, "
                        "gather up to 14 000 chars, and return ONLY valid JSON with: "
                        "name, sector, services (≤5), geo, value_props (3–5), "
                        "brand_tone ['friendly','formal','premium','fun','neutral','playful'], "
                        "keywords (≤8). If missing, use empty string/list."
                    )
                },
                {"role": "user", "content": url, "tool": "web_search"},
            ],
        )
        raw = resp.choices[0].message.content
    except Exception as e:
        logger.error("LLM profile call failed: %s", e)
        raise
    try:
        # raw can be None (refusals), which raises TypeError — not
        # JSONDecodeError — so catch both (JSONDecodeError is a ValueError).
        data = json.loads(raw)
        if not isinstance(data, dict):
            # A bare array/number would crash setdefault below.
            raise ValueError("profile JSON is not an object")
    except (TypeError, ValueError):
        logger.error("JSON decode error in PageLLMProfileNode: %s", str(raw)[:500])
        # (Removed a dead PageLLMProfileNode.cache_clear() call: this
        # function carries no lru_cache, so the attribute never existed.)
        data = {}
    # Fill defaults so downstream code can rely on every key existing.
    for key in ("name", "sector", "geo", "brand_tone"):
        data.setdefault(key, "")
    for key in ("services", "value_props", "keywords"):
        data.setdefault(key, [])
    return data

0 commit comments

Comments
 (0)