diff --git a/src/chunking/MPNet/local/model.py b/src/chunking/MPNet/local/model.py
index a5faa1a..b6eada0 100644
--- a/src/chunking/MPNet/local/model.py
+++ b/src/chunking/MPNet/local/model.py
@@ -158,4 +158,4 @@ async def inference(self, request: ModelRequest):
 
         # Properly escape the CSV string
 
-        return csv_string
+        return csv_string
\ No newline at end of file
diff --git a/src/youtube_embedding/Dockerfile b/src/youtube_embedding/Dockerfile
new file mode 100644
index 0000000..dfb36ba
--- /dev/null
+++ b/src/youtube_embedding/Dockerfile
@@ -0,0 +1,15 @@
+
+FROM python:3.9-slim
+
+WORKDIR /app
+
+
+# Install requirements
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+
+# Copy the rest of the application code to the working directory
+COPY . /app/
+EXPOSE 8000
+# Set the entrypoint for the container
+CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
diff --git a/src/youtube_embedding/README.md b/src/youtube_embedding/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/src/youtube_embedding/__init__.py b/src/youtube_embedding/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/youtube_embedding/api.py b/src/youtube_embedding/api.py
new file mode 100644
index 0000000..c3c560b
--- /dev/null
+++ b/src/youtube_embedding/api.py
@@ -0,0 +1,65 @@
+from quart import Quart, request, jsonify
+from scraper import transcript
+import aiohttp
+from model import Model
+from request import ModelRequest
+
+app = Quart(__name__)
+
+model = None
+
+
+@app.before_serving
+async def startup():
+    app.client = aiohttp.ClientSession()
+    global model
+    model = Model(app)
+
+
+# In-memory cache of transcripts, keyed by video URL.
+transcript_data_store = {}
+
+
+@app.route('/get_transcript', methods=['POST'])
+async def get_transcript():
+    data = await request.get_json()
+
+    if 'url' not in data:
+        return jsonify({'error': 'URL is required'}), 400
+
+    url = data['url']
+    transcript_path, transcript_content = transcript(url)
+
+    transcript_data_store[url] = {
+        'transcript_path': transcript_path,
+        'transcript_data': transcript_content
+    }
+    return jsonify({
+        'transcript_path': transcript_path,
+        'transcript_data': transcript_content
+    })
+
+
+@app.route('/Query', methods=['POST'])
+async def query():
+    data = await request.get_json()
+
+    if 'url' not in data or 'query' not in data:
+        return jsonify({'error': 'URL and query are required'}), 400
+
+    url = data['url']
+
+    # Fetch and cache the transcript if this URL has not been seen yet.
+    if url not in transcript_data_store:
+        transcript_path, transcript_content = transcript(url)
+        transcript_data_store[url] = {
+            'transcript_path': transcript_path,
+            'transcript_data': transcript_content
+        }
+
+    req = ModelRequest(data, transcript_data_store)
+    response = await model.inference(req)
+
+    return jsonify({
+        'search_results': response
+    })
diff --git a/src/youtube_embedding/chunking/__init__.py b/src/youtube_embedding/chunking/__init__.py
new file mode 100644
index 0000000..020e1d6
--- /dev/null
+++ b/src/youtube_embedding/chunking/__init__.py
@@ -0,0 +1 @@
+from chunking.transform import *
\ No newline at end of file
diff --git a/src/youtube_embedding/chunking/transform.py b/src/youtube_embedding/chunking/transform.py
new file mode 100644
index 0000000..d470f24
--- /dev/null
+++ b/src/youtube_embedding/chunking/transform.py
@@ -0,0 +1,115 @@
+"""
+Transforms transcribed data and divides it into chunks.
+
+Initially, chunking into 4-minute video frames is implemented;
+more optimized algorithms will be implemented in further iterations.
+"""
+
+import json
+from typing import List, Dict, Any
+import pandas as pd
+
+
+class TranscriptChunker:
+    """
+    TranscriptChunker class for processing a YouTube transcript.
+
+    Attributes:
+        chunk_size_seconds (int): The size of each chunk in seconds.
+
+    Methods:
+        fit(transcript_path: str) -> None:
+            Reads the transcript data from a JSON file and prepares the class for transformation.
+
+        _transform() -> Dict[str, List[Dict[str, Any]]]:
+            Transforms the transcript data into chunks of the specified size.
+
+        chunks() -> Dict[str, List[Dict[str, Any]]]:
+            Returns the resulting chunks.
+
+        metadata() -> Dict[str, Dict[str, Any]]:
+            Returns metadata about the chunks, such as their word counts and durations.
+
+    Example:
+        chunker = TranscriptChunker(chunk_size_seconds=240)
+        chunker.fit('transcript.json')
+        chunks = chunker.chunks()
+        metadata = chunker.metadata()
+    """
+
+    def __init__(self, chunk_size_seconds: int = 240) -> None:
+        self.chunk_size_seconds: int = chunk_size_seconds
+        # The transcript is converted to a pandas DataFrame for easier manipulation.
+        self.transcript_df: pd.DataFrame = None
+        self.result_chunks: Dict[str, List[Dict[str, Any]]] = None
+
+    def fit(self, transcript_path: str) -> None:
+        with open(transcript_path, 'r') as file:
+            transcript_data = json.load(file)
+        self.transcript_df = pd.DataFrame(transcript_data)
+        self.result_chunks = self._transform()
+
+    def _transform(self) -> Dict[str, List[Dict[str, Any]]]:
+        if self.transcript_df is None:
+            raise ValueError("Transcript data not provided.")
+
+        current_chunk = []
+        current_chunk_duration = 0
+
+        # Dictionary to store all chunks
+        self.all_chunks = {}
+
+        chunk_counter = 1
+
+        for index, row in self.transcript_df.iterrows():
+            # Keep appending segments until adding one more would exceed the chunk size.
+            if current_chunk_duration + row['duration'] <= self.chunk_size_seconds:
+                current_chunk.append(row.to_dict())
+                current_chunk_duration += row['duration']
+            else:
+                self.all_chunks[f'chunk{chunk_counter}'] = current_chunk
+                current_chunk = [row.to_dict()]
+                current_chunk_duration = row['duration']
+                chunk_counter += 1
+
+        # Flush the final, possibly partial, chunk.
+        if current_chunk:
+            self.all_chunks[f'chunk{chunk_counter}'] = current_chunk
+
+        return self.all_chunks
+
+    def chunks(self) -> Dict[str, List[Dict[str, Any]]]:
+        if self.result_chunks is None:
+            raise ValueError("Call the .fit() method first to transform data into chunks")
+
+        return self.result_chunks
+
+    def metadata(self) -> Dict[str, Dict[str, Any]]:
+        """Returns metadata about each chunk: its word count and its start and end times in minutes."""
+        if self.result_chunks is None:
+            raise ValueError("Call the .fit() method first to transform data into chunks")
+
+        self.meta_dict = {}
+        for chunk in self.result_chunks.keys():
+            chunk_meta = {}
+
+            # Calculate the length of the chunk (number of words).
+            text = " ".join(item['text'] for item in self.result_chunks[chunk])
+            chunk_meta['chunk_length'] = len(text.split())
+
+            # Calculate the start and end time of the chunk in minutes.
+            start_time = self.result_chunks[chunk][0]['start']
+            last_item = self.result_chunks[chunk][-1]
+            end_time = last_item['start'] + last_item['duration']
+
+            chunk_meta['start_time'] = round(start_time / 60, 2)
+            chunk_meta['end_time'] = round(end_time / 60, 2)
+            self.meta_dict[chunk] = chunk_meta
+
+        return self.meta_dict
+
+
+if __name__ == '__main__':
+    chunks = TranscriptChunker()
+    chunks.fit('/home/suyash/samagra/ai-tools/src/youtube_embedding/scraper/transcript.json')
+    print(chunks.metadata())
diff --git a/src/youtube_embedding/model.py b/src/youtube_embedding/model.py
new file mode 100644
index 0000000..fd73ea8
--- /dev/null
+++ b/src/youtube_embedding/model.py
@@ -0,0 +1,42 @@
+from request import ModelRequest
+from ragatouille import RAGPretrainedModel
+from chunking import TranscriptChunker
+
+
+class Model():
+    # Singleton: the ColBERT model is loaded once and reused across requests.
+    def __new__(cls, context):
+        cls.context = context
+        if not hasattr(cls, 'instance'):
+            cls.instance = super(Model, cls).__new__(cls)
+            model_name = "colbert-ir/colbertv2.0"
+            cls.model = RAGPretrainedModel.from_pretrained(model_name)
+
+        return cls.instance
+
+    async def inference(self, request: ModelRequest):
+        query = request.query
+        transcript_path = request.transcript_path
+
+        # Chunking
+        chunker = TranscriptChunker()
+        chunker.fit(transcript_path)
+        chunked_data = chunker.chunks()
+
+        # Embeddings and index creation: flatten each chunk into one string.
+        RAG_DICT = {}
+        for chunk in chunked_data.keys():
+            text_data = " ".join(data['text'] for data in chunked_data[chunk])
+            RAG_DICT[chunk] = text_data
+
+        RAG_DATA = []
+        for chunk in RAG_DICT.keys():
+            RAG_DATA.append(RAG_DICT[chunk])
+
+        index_path = self.model.index(index_name="my-index", collection=RAG_DATA)
+
+        # Query the freshly built index.
+        RAG = RAGPretrainedModel.from_index(index_path)
+        response = RAG.search(query)
+
+        return response
\ No newline at end of file
diff --git a/src/youtube_embedding/request.py b/src/youtube_embedding/request.py
new file mode 100644
index 0000000..43e1391
--- /dev/null
+++ b/src/youtube_embedding/request.py
@@ -0,0 +1,14 @@
+import json
+
+
+class ModelRequest():
+
+    def __init__(self, data, transcript_data_store):
+        self.query = data['query']
+        self.url = data['url']
+        self.transcript_path = transcript_data_store[self.url]['transcript_path']
+        self.transcript_data = transcript_data_store[self.url]['transcript_data']
+
+    def to_json(self):
+        # Serialize the request via its attribute dictionary.
+        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)
\ No newline at end of file
diff --git a/src/youtube_embedding/requirements.txt b/src/youtube_embedding/requirements.txt
new file mode 100644
index 0000000..e8305fe
--- /dev/null
+++ b/src/youtube_embedding/requirements.txt
@@ -0,0 +1,8 @@
+faiss-cpu==1.7.4
+hypercorn==0.16.0
+openai==1.11.1
+pandas==2.2.0
+Quart==0.19.4
+RAGatouille==0.0.6b5
+youtube-dl==2021.12.17
+youtube-transcript-api==0.6.2
diff --git a/src/youtube_embedding/scraper/__init__.py b/src/youtube_embedding/scraper/__init__.py
new file mode 100644
index 0000000..e9c2a4a
--- /dev/null
+++ b/src/youtube_embedding/scraper/__init__.py
@@ -0,0 +1 @@
+from scraper.scrape_transcript import *
\ No newline at end of file
diff --git a/src/youtube_embedding/scraper/scrape_audio.py b/src/youtube_embedding/scraper/scrape_audio.py
new file mode 100644
index 0000000..c4a0b88
--- /dev/null
+++ b/src/youtube_embedding/scraper/scrape_audio.py
@@ -0,0 +1,4 @@
+"""
+Scrapes the audio from the YouTube video and then uses it
+for transcription.
+"""
diff --git a/src/youtube_embedding/scraper/scrape_transcript.py b/src/youtube_embedding/scraper/scrape_transcript.py
new file mode 100644
index 0000000..55b3091
--- /dev/null
+++ b/src/youtube_embedding/scraper/scrape_transcript.py
@@ -0,0 +1,56 @@
+"""
+Scrapes transcripts of YouTube videos along with time frames.
+"""
+from youtube_transcript_api import YouTubeTranscriptApi
+import json
+import re
+import os
+
+
+def vid_id(Url: str) -> str:
+    """
+    Retrieves the video id from a URL.
+    args:
+        Url: URL of the video as a string.
+    returns:
+        video_id: the video id of the URL, or None if no id is found.
+    """
+    # The video id is the value of the 'v=' query parameter.
+    match = re.search(r'(?<=v=)[^&]+', Url)
+    video_id = match.group(0) if match else None
+    if video_id is None:
+        print("Video ID not found in URL.")
+
+    return video_id
+
+
+def transcript(Url: str) -> tuple:
+    """
+    Retrieves the transcript of a YouTube video along with time frames
+    and stores it in a JSON file.
+    args:
+        Url: URL of the video.
+    returns:
+        (output_path, transcript): the absolute path of the transcript
+        file and the transcript data itself.
+    """
+    output_file = 'transcript.json'
+    v_id = vid_id(Url)
+    transcript = None
+
+    try:
+        transcript = YouTubeTranscriptApi.get_transcript(v_id)
+        with open(output_file, 'w') as f:
+            json.dump(transcript, f)
+
+        print(f"Transcript successfully saved to {output_file}")
+
+    except Exception as e:
+        print(f"An error occurred: {e}")
+
+    absolute_path = os.path.abspath(output_file)
+    return absolute_path, transcript
+
+
+if __name__ == '__main__':
+    pass
\ No newline at end of file
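
As a smoke test for the two new endpoints, something like the following could be run against the container. This is a minimal sketch, not part of the PR: it assumes the service is reachable at localhost:8000 (matching the Dockerfile's `EXPOSE 8000`), and the video URL and query strings are placeholders.

```python
import asyncio

import aiohttp

BASE_URL = "http://localhost:8000"  # assumed host/port, per the Dockerfile
VIDEO_URL = "https://www.youtube.com/watch?v=EXAMPLE_ID"  # placeholder URL


async def main():
    async with aiohttp.ClientSession() as session:
        # Scrape the transcript and populate the server-side cache.
        async with session.post(f"{BASE_URL}/get_transcript",
                                json={"url": VIDEO_URL}) as resp:
            print(await resp.json())

        # Run a ColBERT search over the chunked, indexed transcript.
        async with session.post(f"{BASE_URL}/Query",
                                json={"url": VIDEO_URL,
                                      "query": "What is the video about?"}) as resp:
            print(await resp.json())


asyncio.run(main())
```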