Skip to content

Commit ae6d8a2

Browse files
committed
add summarize youtube app
1 parent 972b6c5 commit ae6d8a2

File tree

6 files changed

+196
-134
lines changed

6 files changed

+196
-134
lines changed

frontend/.env.sample

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@ BACKEND_URL = "http://localhost:8888"
55
AZURE_OPENAI_ENDPOINT = "https://<aoai-name>.openai.azure.com"
66
AZURE_OPENAI_API_KEY = "<aoai-api-key>"
77
AZURE_OPENAI_API_VERSION = "2024-05-01-preview"
8-
AZURE_OPENAI_WHISPER_MODEL = "whisper"
9-
AZURE_OPENAI_GPT_MODEL = "gpt-4o"
8+
AZURE_OPENAI_MODEL_WHISPER = "whisper"
9+
AZURE_OPENAI_MODEL_CHAT = "gpt-4o"

frontend/pages/chat.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def main(
3737
st.markdown(prompt)
3838

3939
response = client.chat.completions.create(
40-
model=getenv("AZURE_OPENAI_GPT_MODEL"),
40+
model=getenv("AZURE_OPENAI_MODEL_CHAT"),
4141
messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages],
4242
stream=True,
4343
)
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
import logging
2+
import traceback
3+
from os import getenv
4+
from urllib.parse import urlparse
5+
6+
import streamlit as st
7+
import tiktoken
8+
from dotenv import load_dotenv
9+
from langchain_community.document_loaders import YoutubeLoader # Youtube用
10+
from langchain_core.output_parsers import StrOutputParser
11+
from langchain_core.prompts import ChatPromptTemplate
12+
from langchain_core.runnables import RunnableLambda
13+
from langchain_openai import AzureChatOpenAI
14+
from langchain_text_splitters import RecursiveCharacterTextSplitter
15+
16+
logger = logging.getLogger(__name__)
17+
load_dotenv()
18+
19+
20+
SUMMARIZE_PROMPT = """Please provide a clear 300 word summary of the following content in Japanese.
21+
22+
========
23+
24+
{content}
25+
26+
========
27+
"""
28+
29+
30+
def init_page():
    """Configure the Streamlit page: browser tab settings, header and sidebar title."""
    app_title = "Summarize YouTube"
    # The same text is used for the browser tab title and the on-page header.
    st.set_page_config(page_title=app_title, page_icon="💻")
    st.header(app_title)
    st.sidebar.title("Options")
34+
35+
36+
def select_model(temperature=0):
    """Create the Azure OpenAI chat model used for summarization.

    Connection settings are read from the ``AZURE_OPENAI_*`` environment
    variables at call time.

    Args:
        temperature: Sampling temperature forwarded to the model (default 0).

    Returns:
        A configured ``AzureChatOpenAI`` instance.
    """
    azure_settings = {
        "api_key": getenv("AZURE_OPENAI_API_KEY"),
        "api_version": getenv("AZURE_OPENAI_API_VERSION"),
        "azure_endpoint": getenv("AZURE_OPENAI_ENDPOINT"),
        "model": getenv("AZURE_OPENAI_MODEL_CHAT"),
    }
    return AzureChatOpenAI(temperature=temperature, **azure_settings)
44+
45+
46+
def init_summarize_chain():
    """Build the single-pass summarization chain.

    The chain fills ``SUMMARIZE_PROMPT`` (as a user message) with the
    ``content`` input, sends it to the chat model and parses the reply
    into a plain string.

    Returns:
        A runnable composed as ``prompt | llm | StrOutputParser``.
    """
    chat_model = select_model()
    user_prompt = ChatPromptTemplate.from_messages([("user", SUMMARIZE_PROMPT)])
    return user_prompt | chat_model | StrOutputParser()
55+
56+
57+
def init_map_reduce_chain():
    """Build a map-reduce summarization chain for long inputs.

    The input text is split into token-bounded chunks, each chunk is
    summarized independently, the partial summaries are joined with
    newlines, and the joined text is summarized once more.

    Returns:
        A runnable taking ``{"content": str}`` and producing the final summary.
    """
    summarizer = init_summarize_chain()

    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name="gpt-4o",  # hard-coded for now
        chunk_size=16000,
        chunk_overlap=0,
    )

    def split_into_chunks(payload):
        # Map step input: one {"content": ...} dict per chunk.
        pieces = splitter.split_text(payload["content"])
        return [{"content": piece} for piece in pieces]

    def join_summaries(partials):
        # Reduce step input: all partial summaries as a single text.
        return {"content": "\n".join(partials)}

    map_step = RunnableLambda(split_into_chunks) | summarizer.map()
    return map_step | RunnableLambda(join_summaries) | summarizer
68+
69+
70+
def init_chain():
    """Build the top-level chain that picks a summarization strategy by input size.

    Inputs whose ``content`` exceeds 16000 tokens (counted with the
    ``gpt-4o`` tiktoken encoding) are routed to the map-reduce chain;
    everything else goes to the single-pass chain.

    Returns:
        A ``RunnableLambda`` wrapping the routing function.
    """
    single_pass = init_summarize_chain()
    map_reduce = init_map_reduce_chain()

    def route(x):
        tokens = tiktoken.encoding_for_model("gpt-4o").encode(x["content"])
        return map_reduce if len(tokens) > 16000 else single_pass

    return RunnableLambda(route)
85+
86+
87+
def validate_url(url):
    """Return True when *url* is an http(s) YouTube watch-page URL with a video id.

    A URL is accepted only if all of the following hold:

    * the scheme is ``http`` or ``https``;
    * the host is ``www.youtube.com``, ``youtube.com`` or ``m.youtube.com``
      (compared case-insensitively);
    * the path is exactly ``/watch``;
    * the query string carries a non-empty ``v`` parameter.

    Args:
        url: The URL string entered by the user.

    Returns:
        True if the URL passes every check above, otherwise False
        (including when the URL cannot be parsed).
    """
    # Local import: the file only imports ``urlparse`` at module level.
    from urllib.parse import parse_qs

    # NOTE(review): host list assumed to be a subset of what YoutubeLoader
    # accepts -- confirm against the installed langchain_community version.
    allowed_hosts = ("www.youtube.com", "youtube.com", "m.youtube.com")

    try:
        result = urlparse(url)
        if result.scheme not in ("http", "https"):
            return False
        if not isinstance(result.netloc, str) or result.netloc.lower() not in allowed_hosts:
            return False
        # Exact match: "/watchlater" or "/watch/..." are not watch pages.
        if result.path != "/watch":
            return False
        # parse_qs drops blank values, so "?v=" yields no "v" entry.
        video_ids = parse_qs(result.query).get("v")
        return bool(video_ids and video_ids[0])
    except ValueError:
        # urlparse raises ValueError for malformed input such as "http://[".
        return False
98+
99+
100+
def get_content(url):
    """Fetch the transcript of a YouTube video and prefix it with the title.

    A Streamlit spinner is shown while the loader runs. Any exception
    raised while building the loader, loading the documents or reading the
    result is logged, its traceback is written to the page, and ``None``
    is returned.

    Args:
        url: URL of the YouTube video to load.

    Returns:
        A string of the form ``Title: <title>``, a blank line, then the
        transcript text; or ``None`` when nothing was loaded or an error
        occurred.
    """
    with st.spinner("Fetching Youtube ..."):
        try:
            # Loader creation and load() are the calls that can fail, so
            # they must sit inside the try for the handler below to apply.
            loader = YoutubeLoader.from_youtube_url(
                url,
                add_video_info=True,  # also fetch the title, view count, etc.
                language=["en", "ja"],  # subtitle preference: English, then Japanese
            )
            res = loader.load()  # list of `Document` (page_content, metadata)
            if not res:
                return None
            content = res[0].page_content
            title = res[0].metadata["title"]
            return f"Title: {title}\n\n{content}"
        except Exception as e:
            logger.error(f"An error occurred: {e}")
            st.write(traceback.format_exc())
            return None
119+
120+
121+
def main():
    """Render the page: read a URL, validate it, then show summary and transcript.

    Nothing below the input box is rendered until a URL is entered. An
    invalid URL produces a short message; a valid one is fetched, its
    summary is streamed to the page, and the original text is shown
    underneath.
    """
    init_page()
    chain = init_chain()

    url = st.text_input("URL: ", key="input")
    if not url:
        return

    if not validate_url(url):
        st.write("Please input valid url")
        return

    content = get_content(url)
    if not content:
        return

    st.markdown("## Summary")
    st.write_stream(chain.stream({"content": content}))
    st.markdown("---")
    st.markdown("## Original Text")
    st.write(content)
136+
137+
138+
if __name__ == "__main__":
139+
main()

frontend/pages/transcription.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def get_transcription(file_path: str) -> Transcription:
2121

2222
return client.audio.transcriptions.create(
2323
file=open(file=file_path, mode="rb"),
24-
model=getenv("AZURE_OPENAI_WHISPER_MODEL"),
24+
model=getenv("AZURE_OPENAI_MODEL_WHISPER"),
2525
)
2626

2727

0 commit comments

Comments
 (0)