kc3hack · yudai-kdix · Feb 26, 2026
diff --git a/Backend/README.md b/Backend/README.md
@@ -111,6 +111,47 @@ Cloud Run へのデプロイ手順は以下を参照してください。
       "sentence_vector": [0.0312, -0.0124, 0.2011, -0.0942]
     }
     ```
+
+- `POST /analysis/tfidf/bubble-scores`
+  - 発話ごとのTF-IDFスコアから、バブルUI向けサイズを算出する
+  - 算出ロジック: `raw_score = 上位top_k語のTF-IDF合計` を `p10/p90` で正規化し、`min/max` サイズへマッピング
+  - リクエスト例:
+    ```json
+    {
+      "utterances": [
+        "今日はRAGの設計を詰めます。",
+        "APIのレイテンシ改善も必要です。",
+        "GPUコストの見積もりも確認しましょう。"
+      ],
+      "top_k": 3,
+      "window_size": 30,
+      "min_bubble_size": 28,
+      "max_bubble_size": 72
+    }
+    ```
+  - レスポンス例（抜粋）:
+    ```json
+    {
+      "meta": {
+        "algorithm": "tfidf_topk_sum_v1",
+        "p10": 0.31211,
+        "p90": 1.98212,
+        "utterance_count": 3
+      },
+      "items": [
+        {
+          "index": 0,
+          "text": "今日はRAGの設計を詰めます。",
+          "raw_score": 1.423111,
+          "normalized_score": 0.665269,
+          "bubble_size": 57,
+          "top_terms": [
+            { "term": "rag", "score": 0.845212 }
+          ]
+        }
+      ]
+    }
+    ```
   - レスポンス例（抜粋）:
     ```json
     {

diff --git a/Backend/app/api/endpoints/analysis.py b/Backend/app/api/endpoints/analysis.py
@@ -6,10 +6,16 @@
     ReferDictionaryEntry,
     SentenceVectorizeRequest,
     SentenceVectorizeResponse,
+    TfidfBubbleScoresRequest,
+    TfidfBubbleScoresResponse,
     VectorizeRequest,
     VectorizeResponse,
 )
-from app.services.text_analysis import vectorize_content_tokens, vectorize_sentence
+from app.services.text_analysis import (
+    tfidf_bubble_scores,
+    vectorize_content_tokens,
+    vectorize_sentence,
+)
 from app.services.refer_dictionary import refer_dictionary
 
 router = fastapi.APIRouter()
@@ -95,6 +101,51 @@ def vectorize_sentence_endpoint(
     return SentenceVectorizeResponse(**result)
 
 
+@router.post(
+    "/tfidf/bubble-scores",
+    response_model=TfidfBubbleScoresResponse,
+    summary="発話ごとのTF-IDFスコアからバブルサイズを算出する",
+    description=(
+        "1発話=1バブルとしてTF-IDFを算出し、"
+        "上位top_k語のスコア合計を p10/p90 基準で正規化してバブルサイズに変換します。"
+    ),
+    response_description="発話ごとのTF-IDFバブルスコア",
+    responses={
+        200: {"description": "算出成功"},
+        422: {"description": "入力バリデーションエラー（空配列・空白発話など）"},
+    },
+)
+def tfidf_bubble_scores_endpoint(  # POST handler: delegates to the tfidf_bubble_scores service
+    body: TfidfBubbleScoresRequest = fastapi.Body(
+        ...,
+        examples={  # NOTE(review): dict-style named examples may be the older FastAPI form (newer releases use openapi_examples=) — confirm against the pinned version
+            "default": {
+                "summary": "既定パラメータで算出",
+                "value": {
+                    "utterances": [
+                        "今日はRAGの設計を詰めます。",
+                        "APIのレイテンシ改善も必要です。",
+                        "GPUコストの見積もりも確認しましょう。",
+                    ],
+                    "top_k": 3,
+                    "window_size": 30,
+                    "min_bubble_size": 28,
+                    "max_bubble_size": 72,
+                },
+            }
+        },
+    )
+) -> TfidfBubbleScoresResponse:
+    result = tfidf_bubble_scores(  # the service returns a plain dict with "meta" and "items" keys
+        utterances=body.utterances,
+        top_k=body.top_k,
+        window_size=body.window_size,
+        min_bubble_size=body.min_bubble_size,
+        max_bubble_size=body.max_bubble_size,
+    )
+    return TfidfBubbleScoresResponse(**result)  # unpack that dict into the declared response model
+
+
 @router.post(
     "/refer_dictionary",
     response_model=ReferDictionaryResponse,
@@ -115,4 +166,4 @@ async def refer_dictionary_endpoint(
     return ReferDictionaryResponse(
         text=body.text,
         entries=[ReferDictionaryEntry(**e) for e in entries],
-    )
+    )
diff --git a/Backend/app/schemas/analysis.py b/Backend/app/schemas/analysis.py
@@ -183,6 +183,139 @@ class SentenceVectorizeResponse(BaseModel):
     }
 
 
+class TfidfBubbleScoresRequest(BaseModel):  # Request body for the TF-IDF bubble-scores endpoint.
+    utterances: list[str] = Field(
+        min_length=1,
+        description="バブル対象の発話配列（1要素=1バブル）",
+        examples=[["今日はRAGの設計を詰めます。", "APIのレイテンシ改善も必要です。"]],
+    )
+    top_k: int = Field(
+        default=3,
+        ge=1,
+        le=10,
+        description="各発話で raw_score に加算する上位TF-IDF語数",
+        examples=[3],
+    )
+    window_size: int = Field(
+        default=30,
+        ge=1,
+        le=200,
+        description="TF-IDF算出に使うスライディング窓サイズ（発話数）",
+        examples=[30],
+    )
+    min_bubble_size: int = Field(
+        default=28,
+        ge=12,
+        le=200,
+        description="正規化スコア0のときの最小バブルサイズ(px)",
+        examples=[28],
+    )
+    max_bubble_size: int = Field(
+        default=72,
+        ge=12,
+        le=280,
+        description="正規化スコア1のときの最大バブルサイズ(px)",
+        examples=[72],
+    )
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "utterances": [
+                        "今日はRAGの設計を詰めます。",
+                        "APIのレイテンシ改善も必要です。",
+                        "GPUコストの見積もりも確認しましょう。",
+                    ],
+                    "top_k": 3,
+                    "window_size": 30,
+                    "min_bubble_size": 28,
+                    "max_bubble_size": 72,
+                }
+            ]
+        }
+    }
+
+    @field_validator("utterances")
+    @classmethod
+    def validate_utterances_not_blank(cls, values: list[str]) -> list[str]:
+        if not values:  # defensive: min_length=1 on the field should already reject an empty list
+            raise ValueError("utterances must not be empty")
+        if any(not value.strip() for value in values):  # whitespace-only utterances count as blank
+            raise ValueError("utterances must not contain blank text")
+        return values
+
+    @field_validator("max_bubble_size")
+    @classmethod
+    def validate_bubble_size_range(cls, value: int, info) -> int:
+        min_size = info.data.get("min_bubble_size")  # min_bubble_size is declared above max_bubble_size; presumably info.data holds earlier validated fields — confirm against Pydantic docs
+        if isinstance(min_size, int) and value <= min_size:  # comparison is skipped when min_bubble_size is absent; equal sizes are rejected
+            raise ValueError("max_bubble_size must be greater than min_bubble_size")
+        return value
+
+
+class TfidfBubbleTerm(BaseModel):  # One term and its TF-IDF score, as listed in an item's top_terms.
+    term: str = Field(description="TF-IDF上位語", examples=["rag"])
+    score: float = Field(description="語のTF-IDFスコア", examples=[0.845212])
+
+
+class TfidfBubbleItem(BaseModel):  # Per-utterance result: raw and normalized scores, suggested bubble size, top terms.
+    index: int = Field(description="utterances 内のインデックス", examples=[0])
+    text: str = Field(description="対象発話テキスト")
+    raw_score: float = Field(description="上位top_k語のTF-IDF合計スコア", examples=[1.423111])
+    normalized_score: float = Field(description="p10/p90基準で0..1に正規化したスコア", examples=[0.734212])
+    bubble_size: int = Field(description="フロント表示向けの推奨バブルサイズ(px)", examples=[60])
+    top_terms: list[TfidfBubbleTerm] = Field(description="発話内のTF-IDF上位語（最大top_k件）")
+
+
+class TfidfBubbleScoresMeta(BaseModel):  # Run metadata: algorithm id, the parameters used, and the p10/p90 of raw_score.
+    algorithm: str = Field(description="スコア算出アルゴリズム識別子", examples=["tfidf_topk_sum_v1"])
+    top_k: int = Field(description="raw_score計算に使った上位語数", examples=[3])
+    window_size: int = Field(description="TF-IDF算出窓サイズ", examples=[30])
+    min_bubble_size: int = Field(description="最小バブルサイズ(px)", examples=[28])
+    max_bubble_size: int = Field(description="最大バブルサイズ(px)", examples=[72])
+    p10: float = Field(description="raw_score の10パーセンタイル", examples=[0.31211])
+    p90: float = Field(description="raw_score の90パーセンタイル", examples=[1.98212])
+    utterance_count: int = Field(description="入力発話数", examples=[12])
+
+
+class TfidfBubbleScoresResponse(BaseModel):  # Response body: run metadata plus the list of per-utterance items.
+    meta: TfidfBubbleScoresMeta = Field(description="TF-IDFバブルスコア算出のメタ情報")
+    items: list[TfidfBubbleItem] = Field(description="発話ごとのスコアと推奨バブルサイズ")
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "meta": {
+                        "algorithm": "tfidf_topk_sum_v1",
+                        "top_k": 3,
+                        "window_size": 30,
+                        "min_bubble_size": 28,
+                        "max_bubble_size": 72,
+                        "p10": 0.31211,
+                        "p90": 1.98212,
+                        "utterance_count": 3,
+                    },
+                    "items": [
+                        {
+                            "index": 0,
+                            "text": "今日はRAGの設計を詰めます。",
+                            "raw_score": 1.423111,
+                            "normalized_score": 0.665269,
+                            "bubble_size": 57,
+                            "top_terms": [
+                                {"term": "rag", "score": 0.845212},
+                                {"term": "設計", "score": 0.577899},
+                            ],
+                        }
+                    ],
+                }
+            ]
+        }
+    }
+
+
 class ReferDictionaryRequest(BaseModel):
     text: str = Field(
         min_length=1,
@@ -219,4 +352,3 @@ class ReferDictionaryResponse(BaseModel):
     entries: list[ReferDictionaryEntry] = Field(
         description="辞書検索結果の一覧",
     )
-
diff --git a/Backend/app/services/text_analysis.py b/Backend/app/services/text_analysis.py
@@ -763,6 +763,109 @@ def top_terms_by_tfidf(corpus: list[str], top_k: int = 10) -> list[list[dict[str
     return top_terms
 
 
+def _percentile(values: list[float], quantile: float) -> float:
+    """Return the linearly interpolated value of ``values`` at ``quantile`` (a fraction, e.g. 0.10 for p10)."""
+    if not values:
+        return 0.0  # empty input yields 0.0 rather than raising
+
+    sorted_values = sorted(values)
+    if len(sorted_values) == 1:
+        return float(sorted_values[0])  # a single value is every percentile
+
+    position = (len(sorted_values) - 1) * quantile  # fractional rank; quantile is not range-checked (the visible caller passes 0.10 and 0.90)
+    lower = math.floor(position)
+    upper = math.ceil(position)
+    if lower == upper:  # the rank landed exactly on an element, so no interpolation is needed
+        return float(sorted_values[lower])
+
+    lower_value = float(sorted_values[lower])
+    upper_value = float(sorted_values[upper])
+    ratio = position - lower  # fractional distance from the lower neighbour
+    return lower_value + (upper_value - lower_value) * ratio
+
+
+def tfidf_bubble_scores(
+    utterances: list[str],
+    top_k: int = 3,
+    window_size: int = 30,
+    min_bubble_size: int = 28,
+    max_bubble_size: int = 72,
+) -> dict[str, Any]:
+    """Score each utterance by the sum of its top_k TF-IDF terms and map that score to a bubble size."""
+    if not utterances:  # empty input: return zeroed percentiles and no items
+        return {
+            "meta": {
+                "algorithm": "tfidf_topk_sum_v1",
+                "top_k": top_k,
+                "window_size": window_size,
+                "min_bubble_size": min_bubble_size,
+                "max_bubble_size": max_bubble_size,
+                "p10": 0.0,
+                "p90": 0.0,
+                "utterance_count": 0,
+            },
+            "items": [],
+        }
+
+    cleaned = [utterance.strip() for utterance in utterances]  # the returned "text" is the stripped form
+    items: list[dict[str, Any]] = []
+    raw_scores: list[float] = []
+
+    # For each utterance i, use the most recent window_size utterances (ending at i) as the TF-IDF corpus.
+    for i, utterance in enumerate(cleaned):
+        start = max(0, i - window_size + 1)  # with window_size >= 1, start <= i, so the corpus always contains utterance i
+        corpus = cleaned[start : i + 1]
+        current_score_map = tfidf_scores(corpus)[-1] if corpus else {}  # presumably one term->score map per corpus document, so [-1] is utterance i — confirm against tfidf_scores
+
+        sorted_terms = sorted(current_score_map.items(), key=lambda x: x[1], reverse=True)[:top_k]
+        raw_score = float(sum(score for _, score in sorted_terms))
+        raw_scores.append(raw_score)  # kept unrounded for the percentile and normalization math
+
+        items.append(
+            {
+                "index": i,
+                "text": utterance,
+                "raw_score": round(raw_score, 6),
+                "normalized_score": 0.0,  # placeholder, overwritten once p10/p90 are known
+                "bubble_size": min_bubble_size,  # placeholder, overwritten once p10/p90 are known
+                "top_terms": [
+                    {"term": term, "score": round(float(score), 6)}
+                    for term, score in sorted_terms
+                ],
+            }
+        )
+
+    p10 = _percentile(raw_scores, 0.10)
+    p90 = _percentile(raw_scores, 0.90)
+
+    # Robust-normalize with p10/p90, then map 0..1 linearly onto the bubble size range.
+    score_width = p90 - p10
+    for item, raw_score in zip(items, raw_scores):
+        if score_width <= 0.0:  # no spread, e.g. a single utterance or identical scores
+            normalized = 0.0 if raw_score <= 0.0 else 0.5  # zero-score utterances get the minimum size, all others the midpoint
+        else:
+            normalized = (raw_score - p10) / score_width
+            normalized = max(0.0, min(1.0, normalized))  # clamp scores outside p10..p90 into [0, 1]
+
+        bubble_size = int(round(min_bubble_size + (max_bubble_size - min_bubble_size) * normalized))
+        item["normalized_score"] = round(normalized, 6)
+        item["bubble_size"] = bubble_size
+
+    return {
+        "meta": {
+            "algorithm": "tfidf_topk_sum_v1",
+            "top_k": top_k,
+            "window_size": window_size,
+            "min_bubble_size": min_bubble_size,
+            "max_bubble_size": max_bubble_size,
+            "p10": round(float(p10), 6),
+            "p90": round(float(p90), 6),
+            "utterance_count": len(cleaned),
+        },
+        "items": items,
+    }
+
+
 # --- refer_dictionary 向け: 形態素解析済みトークンから直接ベクトルを算出する ---
 # morphological_analysis を再度呼ばずに済むため、
 # スレッドセーフ問題を回避しつつパフォーマンスも改善できる。
@@ -845,4 +948,3 @@ def _get_vocab_vector(nlp: Any | None, word: str) -> list[float]:
         return [float(v) for v in lexeme.vector]
 
     return []
-