Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions Backend/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,13 @@ Cloud Run へのデプロイ手順は以下を参照してください。
}
```

- `POST /pipeline/transcribe-analyze`
  - Desktop向け統合API。音声チャンクを受け取り、文字起こし(MVPでは `text_override` 指定可)と解析結果を1レスポンスで返す
  - `is_final_chunk=false` の場合は解析をスキップし、partialテキストのみ返す
  - `is_final_chunk=true` の場合は内容語ベクトル化・文ベクトル化・(必要時)辞書参照を返す
  - 主要フォーム項目: `audio`, `session_id`, `chunk_seq`, `is_final_chunk`, `input_source`, `audio_format`, `include_dictionary`
  - `audio_format` は `wav | pcm16 | webm_opus` をサポート

- `POST /dictionary/lookup`
  - 用語の意味を日本語で1〜2文の概要として返す
  - このエンドポイントは現在DB参照未連携のため、常にGeminiで生成する
Expand Down Expand Up @@ -210,15 +217,18 @@ Backend/
│ └── endpoints/
│ ├── analysis.py
│ ├── dictionary.py
│ └── hoge.py
│ ├── hoge.py
│ └── pipeline.py
├── services/
│ ├── dictionary.py
│ ├── speech_pipeline.py
│ ├── text_analysis.py
│ └── hoge.py
└── schemas/
├── analysis.py
├── dictionary.py
└── hoge.py
├── hoge.py
└── pipeline.py
```

## 今後の実装予定
Expand Down
3 changes: 2 additions & 1 deletion Backend/app/api/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import fastapi
from app.api.endpoints import hoge, analysis, dictionary
from app.api.endpoints import hoge, analysis, dictionary, pipeline

# Top-level API router: each endpoint module exposes its own `router`, which is
# mounted here under a dedicated URL prefix and tag.
router = fastapi.APIRouter()

# One include_router call per endpoint module; prefix and tag mirror the module name.
router.include_router(hoge.router, prefix="/hoge", tags=["hoge"])
router.include_router(analysis.router, prefix="/analysis", tags=["analysis"])
router.include_router(dictionary.router, prefix="/dictionary", tags=["dictionary"])
router.include_router(pipeline.router, prefix="/pipeline", tags=["pipeline"])

# 新しくエンドポイントを追加するときは、
# 1. app/api/endpoints/new_endpoint.pyを作成する
Expand Down
81 changes: 81 additions & 0 deletions Backend/app/api/endpoints/pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from __future__ import annotations

import fastapi

from app.schemas.pipeline import PipelineTranscribeAnalyzeResponse
from app.services.speech_pipeline import transcribe_and_analyze_chunk

router = fastapi.APIRouter()


@router.post(
    "/transcribe-analyze",
    response_model=PipelineTranscribeAnalyzeResponse,
    summary="音声チャンクを文字起こしし、解析結果をまとめて返す",
    description=(
        "Desktop向けの統合パイプラインAPIです。"
        " 音声チャンクを受け取り、文字起こし(暫定)と解析結果を1レスポンスで返します。"
    ),
    responses={
        200: {"description": "処理成功"},
        400: {"description": "チャンク連番不正"},
        413: {"description": "音声チャンクサイズ上限超過"},
        415: {"description": "非対応フォーマット"},
        422: {"description": "入力バリデーションエラー"},
    },
)
async def transcribe_analyze(
    audio: fastapi.UploadFile = fastapi.File(..., description="音声チャンク"),
    session_id: str = fastapi.Form(..., min_length=1),
    chunk_seq: int = fastapi.Form(..., ge=0),
    is_final_chunk: bool = fastapi.Form(False),
    input_source: str = fastapi.Form("microphone"),
    audio_format: str = fastapi.Form("wav"),
    sample_rate_hz: int = fastapi.Form(16000, ge=1),
    channels: int = fastapi.Form(1, ge=1),
    language_hint: str = fastapi.Form("ja-JP"),
    include_dictionary: bool = fastapi.Form(False),
    dictionary_top_k: int = fastapi.Form(5, ge=1, le=50),
    deduplicate: bool = fastapi.Form(False),
    min_length: int = fastapi.Form(1, ge=1, le=64),
    normalize_sentence_vector: bool = fastapi.Form(True),
    text_override: str | None = fastapi.Form(
        default=None,
        description="MVP向け暫定: STT未接続時に文字起こし結果として扱うテキスト",
    ),
) -> PipelineTranscribeAnalyzeResponse:
    """Validate one uploaded audio chunk and run it through the speech pipeline.

    The form fields are checked in a fixed order (input source, audio format,
    session id, payload size); the first failing check determines the error.
    ``input_source`` and ``audio_format`` are validated here but are not
    forwarded to ``transcribe_and_analyze_chunk``.

    Returns:
        A ``PipelineTranscribeAnalyzeResponse`` built from the mapping
        returned by ``transcribe_and_analyze_chunk``.

    Raises:
        fastapi.HTTPException: 422 for an unknown ``input_source``, a
            whitespace-only ``session_id`` or an empty upload; 415 for an
            unsupported ``audio_format``; 413 when the upload exceeds 5 MiB.
    """
    allowed_sources = ("microphone", "system_audio")
    if input_source not in allowed_sources:
        raise fastapi.HTTPException(
            status_code=422,
            detail="input_source must be microphone or system_audio",
        )

    supported_formats = ("wav", "pcm16", "webm_opus")
    if audio_format not in supported_formats:
        raise fastapi.HTTPException(
            status_code=415,
            detail="unsupported audio format",
        )

    # min_length=1 on the form field still lets whitespace-only ids through.
    session_key = session_id.strip()
    if session_key == "":
        raise fastapi.HTTPException(
            status_code=422,
            detail="session_id must not be blank",
        )

    # Read one byte past the limit so an oversized upload can be detected
    # without reading the whole body.
    size_limit = 5 * 1024 * 1024
    payload = await audio.read(size_limit + 1)
    payload_size = len(payload)
    if payload_size == 0:
        raise fastapi.HTTPException(
            status_code=422,
            detail="audio chunk is empty",
        )
    if payload_size > size_limit:
        raise fastapi.HTTPException(
            status_code=413,
            detail="audio chunk too large",
        )

    pipeline_output = await transcribe_and_analyze_chunk(
        audio_bytes=payload,
        session_id=session_key,
        chunk_seq=chunk_seq,
        is_final_chunk=is_final_chunk,
        include_dictionary=include_dictionary,
        dictionary_top_k=dictionary_top_k,
        deduplicate=deduplicate,
        min_length=min_length,
        normalize_sentence_vector=normalize_sentence_vector,
        text_override=text_override,
    )

    # The current MVP also accepts fields that are not wired into an STT
    # implementation yet. They are only validated here so that a future real
    # STT backend can consume them without changing the request contract.
    _ = (sample_rate_hz, channels, language_hint)

    return PipelineTranscribeAnalyzeResponse(**pipeline_output)
45 changes: 45 additions & 0 deletions Backend/app/schemas/pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from __future__ import annotations

from pydantic import BaseModel, Field

from app.schemas.analysis import (
ReferDictionaryEntry,
SentenceVectorizeResponse,
VectorizeResponse,
)


class PipelineTiming(BaseModel):
    # Timing block of the pipeline response.
    # processed_ms: required, non-negative server-side processing time in milliseconds.
    processed_ms: int = Field(..., description="サーバー側処理時間(ミリ秒)", ge=0)


class PipelineTranscript(BaseModel):
    # Transcript block of the pipeline response.
    # partial_text / final_text both default to an empty string.
    partial_text: str = Field("", description="途中文字起こし")
    final_text: str = Field("", description="確定文字起こし")
    # Required flag telling whether this transcript is the finalized result.
    is_final: bool = Field(..., description="確定結果か")
    # STT confidence; stays None when no confidence is provided.
    confidence: float | None = Field(None, description="STT信頼度(未提供時はnull)")


class PipelineAnalysis(BaseModel):
    # Analysis block of the pipeline response. Both results are optional and
    # default to None; per the field descriptions they are null for
    # non-final chunks.
    vectorize: VectorizeResponse | None = Field(
        None,
        description="内容語ベクトル化結果(非確定チャンクではnull)",
    )
    sentence_vectorize: SentenceVectorizeResponse | None = Field(
        None,
        description="文章ベクトル化結果(非確定チャンクではnull)",
    )


class PipelineDictionary(BaseModel):
    # Dictionary block of the pipeline response.
    # enabled: required flag saying whether a dictionary lookup was executed.
    enabled: bool = Field(..., description="辞書参照を実行したか")
    # entries: lookup results; a fresh empty list is used when none are given.
    entries: list[ReferDictionaryEntry] = Field(
        description="辞書参照結果",
        default_factory=list,
    )


class PipelineTranscribeAnalyzeResponse(BaseModel):
    # Top-level response body of the transcribe-analyze pipeline endpoint.
    # session_id: session identifier; chunk_seq: non-negative chunk sequence number.
    session_id: str = Field(..., description="セッション識別子")
    chunk_seq: int = Field(..., description="チャンク連番", ge=0)
    # The four nested sections below are all required.
    timing: PipelineTiming
    transcript: PipelineTranscript
    analysis: PipelineAnalysis
    dictionary: PipelineDictionary
Loading