Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions Backend/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,47 @@ Cloud Run へのデプロイ手順は以下を参照してください。
"sentence_vector": [0.0312, -0.0124, 0.2011, -0.0942]
}
```

- `POST /analysis/tfidf/bubble-scores`
- 発話ごとのTF-IDFスコアから、バブルUI向けサイズを算出する
- 算出ロジック: `raw_score = 上位top_k語のTF-IDF合計` を `p10/p90` で正規化し、`min/max` サイズへマッピング
- リクエスト例:
```json
{
"utterances": [
"今日はRAGの設計を詰めます。",
"APIのレイテンシ改善も必要です。",
"GPUコストの見積もりも確認しましょう。"
],
"top_k": 3,
"window_size": 30,
"min_bubble_size": 28,
"max_bubble_size": 72
}
```
- レスポンス例(抜粋):
```json
{
"meta": {
"algorithm": "tfidf_topk_sum_v1",
"p10": 0.31211,
"p90": 1.98212,
"utterance_count": 3
},
"items": [
{
"index": 0,
"text": "今日はRAGの設計を詰めます。",
"raw_score": 1.423111,
"normalized_score": 0.665269,
"bubble_size": 57,
"top_terms": [
{ "term": "rag", "score": 0.845212 }
]
}
]
}
```
- レスポンス例(抜粋):
```json
{
Expand Down
55 changes: 53 additions & 2 deletions Backend/app/api/endpoints/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,16 @@
ReferDictionaryEntry,
SentenceVectorizeRequest,
SentenceVectorizeResponse,
TfidfBubbleScoresRequest,
TfidfBubbleScoresResponse,
VectorizeRequest,
VectorizeResponse,
)
from app.services.text_analysis import vectorize_content_tokens, vectorize_sentence
from app.services.text_analysis import (
tfidf_bubble_scores,
vectorize_content_tokens,
vectorize_sentence,
)
from app.services.refer_dictionary import refer_dictionary

router = fastapi.APIRouter()
Expand Down Expand Up @@ -95,6 +101,51 @@ def vectorize_sentence_endpoint(
return SentenceVectorizeResponse(**result)


@router.post(
    "/tfidf/bubble-scores",
    response_model=TfidfBubbleScoresResponse,
    summary="発話ごとのTF-IDFスコアからバブルサイズを算出する",
    description=(
        "1発話=1バブルとしてTF-IDFを算出し、"
        "上位top_k語のスコア合計を p10/p90 基準で正規化してバブルサイズに変換します。"
    ),
    response_description="発話ごとのTF-IDFバブルスコア",
    responses={
        200: {"description": "算出成功"},
        422: {"description": "入力バリデーションエラー(空配列・空白発話など)"},
    },
)
def tfidf_bubble_scores_endpoint(
    # NOTE(review): `examples` is passed as a dict of named examples. Recent
    # FastAPI releases appear to expect that shape under `openapi_examples`
    # instead -- confirm against the FastAPI version this project pins.
    body: TfidfBubbleScoresRequest = fastapi.Body(
        ...,
        examples={
            "default": {
                "summary": "既定パラメータで算出",
                "value": {
                    "utterances": [
                        "今日はRAGの設計を詰めます。",
                        "APIのレイテンシ改善も必要です。",
                        "GPUコストの見積もりも確認しましょう。",
                    ],
                    "top_k": 3,
                    "window_size": 30,
                    "min_bubble_size": 28,
                    "max_bubble_size": 72,
                },
            }
        },
    )
) -> TfidfBubbleScoresResponse:
    """Compute per-utterance TF-IDF bubble sizes for the request body.

    Forwards every field of the validated request to ``tfidf_bubble_scores``
    and wraps the returned dict in ``TfidfBubbleScoresResponse``.
    """
    return TfidfBubbleScoresResponse(
        **tfidf_bubble_scores(
            utterances=body.utterances,
            top_k=body.top_k,
            window_size=body.window_size,
            min_bubble_size=body.min_bubble_size,
            max_bubble_size=body.max_bubble_size,
        )
    )


@router.post(
"/refer_dictionary",
response_model=ReferDictionaryResponse,
Expand All @@ -115,4 +166,4 @@ async def refer_dictionary_endpoint(
return ReferDictionaryResponse(
text=body.text,
entries=[ReferDictionaryEntry(**e) for e in entries],
)
)
134 changes: 133 additions & 1 deletion Backend/app/schemas/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,139 @@ class SentenceVectorizeResponse(BaseModel):
}


class TfidfBubbleScoresRequest(BaseModel):
    # Request body for the TF-IDF bubble-score endpoint.
    #
    # `utterances` is required; every other field has a default. The two
    # bubble-size bounds are cross-checked so that max_bubble_size is always
    # strictly greater than min_bubble_size.
    # (Comments are used instead of a class docstring so the generated JSON
    # schema description of the model is left untouched.)

    utterances: list[str] = Field(
        min_length=1,
        description="バブル対象の発話配列(1要素=1バブル)",
        examples=[["今日はRAGの設計を詰めます。", "APIのレイテンシ改善も必要です。"]],
    )
    top_k: int = Field(
        default=3,
        ge=1,
        le=10,
        description="各発話で raw_score に加算する上位TF-IDF語数",
        examples=[3],
    )
    window_size: int = Field(
        default=30,
        ge=1,
        le=200,
        description="TF-IDF算出に使うスライディング窓サイズ(発話数)",
        examples=[30],
    )
    min_bubble_size: int = Field(
        default=28,
        ge=12,
        le=200,
        description="正規化スコア0のときの最小バブルサイズ(px)",
        examples=[28],
    )
    max_bubble_size: int = Field(
        default=72,
        ge=12,
        le=280,
        # Field validators are skipped for default values unless this flag is
        # set. Without it, a request that raises min_bubble_size above 72 but
        # omits max_bubble_size would bypass the min < max check below.
        validate_default=True,
        description="正規化スコア1のときの最大バブルサイズ(px)",
        examples=[72],
    )

    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "utterances": [
                        "今日はRAGの設計を詰めます。",
                        "APIのレイテンシ改善も必要です。",
                        "GPUコストの見積もりも確認しましょう。",
                    ],
                    "top_k": 3,
                    "window_size": 30,
                    "min_bubble_size": 28,
                    "max_bubble_size": 72,
                }
            ]
        }
    }

    @field_validator("utterances")
    @classmethod
    def validate_utterances_not_blank(cls, values: list[str]) -> list[str]:
        # Reject an empty list and any utterance that is empty or
        # whitespace-only. The list is returned unchanged (not stripped).
        if not values:
            raise ValueError("utterances must not be empty")
        if any(not value.strip() for value in values):
            raise ValueError("utterances must not contain blank text")
        return values

    @field_validator("max_bubble_size")
    @classmethod
    def validate_bubble_size_range(cls, value: int, info) -> int:
        # `info.data` only holds fields that were declared earlier and passed
        # their own validation. If min_bubble_size itself was invalid it is
        # absent here, and only its own error is reported.
        min_size = info.data.get("min_bubble_size")
        if isinstance(min_size, int) and value <= min_size:
            raise ValueError("max_bubble_size must be greater than min_bubble_size")
        return value


class TfidfBubbleTerm(BaseModel):
    # A single term of an utterance paired with its TF-IDF score.
    term: str = Field(
        description="TF-IDF上位語",
        examples=["rag"],
    )
    score: float = Field(
        description="語のTF-IDFスコア",
        examples=[0.845212],
    )


class TfidfBubbleItem(BaseModel):
    # Per-utterance result: position in the request, text, raw and
    # normalized scores, the suggested bubble size, and the top terms.
    index: int = Field(
        description="utterances 内のインデックス",
        examples=[0],
    )
    text: str = Field(
        description="対象発話テキスト",
    )
    raw_score: float = Field(
        description="上位top_k語のTF-IDF合計スコア",
        examples=[1.423111],
    )
    normalized_score: float = Field(
        description="p10/p90基準で0..1に正規化したスコア",
        examples=[0.734212],
    )
    bubble_size: int = Field(
        description="フロント表示向けの推奨バブルサイズ(px)",
        examples=[60],
    )
    top_terms: list[TfidfBubbleTerm] = Field(
        description="発話内のTF-IDF上位語(最大top_k件)",
    )


class TfidfBubbleScoresMeta(BaseModel):
    # Metadata describing one scoring run: algorithm id, the parameters that
    # were used, the p10/p90 cut points, and the number of utterances.
    algorithm: str = Field(
        description="スコア算出アルゴリズム識別子",
        examples=["tfidf_topk_sum_v1"],
    )
    top_k: int = Field(
        description="raw_score計算に使った上位語数",
        examples=[3],
    )
    window_size: int = Field(
        description="TF-IDF算出窓サイズ",
        examples=[30],
    )
    min_bubble_size: int = Field(
        description="最小バブルサイズ(px)",
        examples=[28],
    )
    max_bubble_size: int = Field(
        description="最大バブルサイズ(px)",
        examples=[72],
    )
    p10: float = Field(
        description="raw_score の10パーセンタイル",
        examples=[0.31211],
    )
    p90: float = Field(
        description="raw_score の90パーセンタイル",
        examples=[1.98212],
    )
    utterance_count: int = Field(
        description="入力発話数",
        examples=[12],
    )


class TfidfBubbleScoresResponse(BaseModel):
    # Response envelope: run-level metadata plus one item per utterance.
    meta: TfidfBubbleScoresMeta = Field(
        description="TF-IDFバブルスコア算出のメタ情報",
    )
    items: list[TfidfBubbleItem] = Field(
        description="発話ごとのスコアと推奨バブルサイズ",
    )

    # Example payload attached to the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "meta": {
                        "algorithm": "tfidf_topk_sum_v1",
                        "top_k": 3,
                        "window_size": 30,
                        "min_bubble_size": 28,
                        "max_bubble_size": 72,
                        "p10": 0.31211,
                        "p90": 1.98212,
                        "utterance_count": 3,
                    },
                    "items": [
                        {
                            "index": 0,
                            "text": "今日はRAGの設計を詰めます。",
                            "raw_score": 1.423111,
                            "normalized_score": 0.665269,
                            "bubble_size": 57,
                            "top_terms": [
                                {"term": "rag", "score": 0.845212},
                                {"term": "設計", "score": 0.577899},
                            ],
                        }
                    ],
                }
            ]
        }
    }


class ReferDictionaryRequest(BaseModel):
text: str = Field(
min_length=1,
Expand Down Expand Up @@ -219,4 +352,3 @@ class ReferDictionaryResponse(BaseModel):
entries: list[ReferDictionaryEntry] = Field(
description="辞書検索結果の一覧",
)

104 changes: 103 additions & 1 deletion Backend/app/services/text_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -763,6 +763,109 @@ def top_terms_by_tfidf(corpus: list[str], top_k: int = 10) -> list[list[dict[str
return top_terms


def _percentile(values: list[float], quantile: float) -> float:
"""Return interpolated percentile value in [0.0, 1.0]."""
if not values:
return 0.0

sorted_values = sorted(values)
if len(sorted_values) == 1:
return float(sorted_values[0])

position = (len(sorted_values) - 1) * quantile
lower = math.floor(position)
upper = math.ceil(position)
if lower == upper:
return float(sorted_values[lower])

lower_value = float(sorted_values[lower])
upper_value = float(sorted_values[upper])
ratio = position - lower
return lower_value + (upper_value - lower_value) * ratio


def tfidf_bubble_scores(
    utterances: list[str],
    top_k: int = 3,
    window_size: int = 30,
    min_bubble_size: int = 28,
    max_bubble_size: int = 72,
) -> dict[str, Any]:
    """Score each utterance by TF-IDF and map the score to a bubble size.

    For utterance ``i`` the corpus is the stripped utterances in the window
    ending at ``i`` (at most ``window_size`` entries). The raw score is the
    sum of the ``top_k`` largest values in the last mapping returned by
    ``tfidf_scores`` for that corpus. Raw scores are then normalized against
    their 10th/90th percentiles, clamped to ``[0, 1]``, and mapped linearly
    onto ``[min_bubble_size, max_bubble_size]``.

    Args:
        utterances: Utterance texts; each one is stripped before scoring.
        top_k: Number of highest-scoring terms summed into the raw score.
        window_size: Number of most recent utterances used as the corpus.
        min_bubble_size: Bubble size for a normalized score of 0.
        max_bubble_size: Bubble size for a normalized score of 1.

    Returns:
        A dict with ``"meta"`` (algorithm id, the parameters, rounded
        p10/p90, and utterance count) and ``"items"`` (one dict per
        utterance, in input order). An empty input yields zeroed meta and
        no items.
    """

    def _meta(p10_value: float, p90_value: float, count: int) -> dict[str, Any]:
        # Single place where the meta section is assembled, for both exits.
        return {
            "algorithm": "tfidf_topk_sum_v1",
            "top_k": top_k,
            "window_size": window_size,
            "min_bubble_size": min_bubble_size,
            "max_bubble_size": max_bubble_size,
            "p10": p10_value,
            "p90": p90_value,
            "utterance_count": count,
        }

    if not utterances:
        return {"meta": _meta(0.0, 0.0, 0), "items": []}

    texts = [text.strip() for text in utterances]

    # Pass 1: for every utterance, take the trailing window as the corpus
    # and keep its top terms plus their summed score.
    # NOTE(review): assumes tfidf_scores returns one term->score mapping per
    # corpus entry, so [-1] is the current utterance -- confirm.
    raw_scores: list[float] = []
    ranked_per_utterance: list[list[tuple[Any, Any]]] = []
    for position in range(len(texts)):
        window_start = max(0, position - window_size + 1)
        window = texts[window_start : position + 1]
        term_scores = tfidf_scores(window)[-1] if window else {}

        ranked = sorted(
            term_scores.items(),
            key=lambda pair: pair[1],
            reverse=True,
        )[:top_k]
        ranked_per_utterance.append(ranked)
        raw_scores.append(float(sum(value for _, value in ranked)))

    # Pass 2: robust normalization against p10/p90, then a linear mapping
    # of 0..1 onto the requested bubble-size range.
    p10 = _percentile(raw_scores, 0.10)
    p90 = _percentile(raw_scores, 0.90)
    spread = p90 - p10
    size_span = max_bubble_size - min_bubble_size

    items: list[dict[str, Any]] = []
    for position, (text, raw, ranked) in enumerate(
        zip(texts, raw_scores, ranked_per_utterance)
    ):
        if spread > 0.0:
            ratio = (raw - p10) / spread
        elif raw <= 0.0:
            # No spread and no score: smallest bubble.
            ratio = 0.0
        else:
            # No spread but a positive score: mid-sized bubble.
            ratio = 0.5
        ratio = max(0.0, min(1.0, ratio))

        items.append(
            {
                "index": position,
                "text": text,
                "raw_score": round(raw, 6),
                "normalized_score": round(ratio, 6),
                "bubble_size": int(round(min_bubble_size + size_span * ratio)),
                "top_terms": [
                    {"term": term, "score": round(float(value), 6)}
                    for term, value in ranked
                ],
            }
        )

    return {
        "meta": _meta(round(float(p10), 6), round(float(p90), 6), len(texts)),
        "items": items,
    }


# --- refer_dictionary 向け: 形態素解析済みトークンから直接ベクトルを算出する ---
# morphological_analysis を再度呼ばずに済むため、
# スレッドセーフ問題を回避しつつパフォーマンスも改善できる。
Expand Down Expand Up @@ -845,4 +948,3 @@ def _get_vocab_vector(nlp: Any | None, word: str) -> list[float]:
return [float(v) for v in lexeme.vector]

return []

Loading