NimbleBoxAI
diff --git a/meet_reduce/gpt.py b/meet_reduce/gpt.py
Lines changed: 30 additions & 10 deletions
diff --git a/meet_reduce/summarizer.py b/meet_reduce/summarizer.py
Lines changed: 8 additions & 3 deletions
diff --git a/meet_reduce/utils.py b/meet_reduce/utils.py
Lines changed: 6 additions & 1 deletion
diff --git a/pytube/__init__.py b/pytube/__init__.py
Lines changed: 19 additions & 0 deletions
@@ -248,15 +248,24 @@ def get_keywords(text, gpt, r = 4):
 
 ###
 
-Text: "Black-on-black ware is a 20th- and 21st-century pottery tradition developed by the Puebloan Native American ceramic artists in Northern New Mexico. Traditional reduction-fired blackware has been made for centuries by pueblo artists. Black-on-black ware of the past century is produced with a smooth surface, with the designs applied through selective burnishing or the application of refractory slip. Another style involves carving or incising designs and selectively polishing the raised areas. For generations several families from Kha'po Owingeh and P'ohwhóge Owingeh pueblos have been making black-on-black ware with the techniques passed down from matriarch potters. Artists from other pueblos have also produced black-on-black ware. Several contemporary artists have created works honoring the pottery of their ancestors."
+Text: "Black-on-black ware is a 20th- and 21st-century pottery tradition developed by the
+Puebloan Native American ceramic artists in Northern New Mexico. Traditional reduction-fired
+blackware has been made for centuries by pueblo artists. Black-on-black ware of the past century
+is produced with a smooth surface, with the designs applied through selective burnishing or the
+application of refractory slip. Another style involves carving or incising designs and
+selectively polishing the raised areas. For generations several families from Kha'po
+Owingeh and P'ohwhóge Owingeh pueblos have been making black-on-black ware with the
+techniques passed down from matriarch potters. Artists from other pueblos have also produced
+black-on-black ware. Several contemporary artists have created works honoring the pottery of
+their ancestors."
 
 Keywords: Pueblo, art, pottery, black, black ware
 
 ###
 
 Text: "{text}"
 
-keywords:"""
+Keywords:"""
 
   p = prompt.format(text = text)
   words = set()
@@ -269,16 +278,21 @@ def get_keywords(text, gpt, r = 4):
     )
 
     for s in out:
-      ws = s.split("###")[2].split("\nkeywords:")[-1].split(",")
+      ws = s.split("###")[2].split("\nKeywords:")[-1].split(",")
       for w in ws:
         w = w.strip().lower()
         # print("--->", w, w in ["pueblo", "art", "pottery", "black", "black ware"])
         if w and len(w.split()) < 4 and w not in ["pueblo", "art", "pottery", "black", "black ware"]:
           words.add(w)
+
+        for _w in w.split()[:2]:
+          words.add(_w)
+
   return list(words)
 
 
 def clean_keywords(keywords, gpt):
+  print(keywords)
   if isinstance(keywords[0], list):
     return [clean_keywords[x] for x in keywords]
 
@@ -325,7 +339,7 @@ def clean_keywords(keywords, gpt):
 
 # ----- main functions
 
-def get_keywords(captions: list, gpt: GPT):
+def get_keywords_from_caption(captions: list, gpt: GPT):
   """
   Args:
     captions (list): captions is the list with output of function `Processor.parse_captions()`
@@ -347,19 +361,25 @@ def get_keywords(captions: list, gpt: GPT):
   # step 1: format the sentence
   format_buff_size = 100
   fsent = []
-  for i in range(0, len(capstr), format_buff_size):
+  pbar = trange(0, len(capstr), format_buff_size)
+  for i in pbar:
     r = format_sentence(" ".join(capstr.split()[i:i+format_buff_size]) + ".", gpt)
     fsent.append(r)
+    pbar.set_description(f"Cleaned {i+1} sentences!")
 
   # step 2: get keywords
   words = []
-  for _, o in zip(trange(len(fsent)), fsent):
-    words.append(get_keywords(o, gpt))
-  
+  pbar = trange(len(fsent))
+  for _, o in zip(pbar, fsent):
+    out = get_keywords(o, gpt)
+    words.append(out)
+    pbar.set_description(f"Got {sum([len(x) for x in words])} keywords from {len(words)} sentences")
+
   # step 3: clean the key-words
   w2 = []
-  for w in words:
-      out = clean_keywords(w, gpt)
+  pbar = trange(len(words))
+  for _, w in zip(pbar, words):
+      out = clean_keywords(", ".join(w), gpt)
       w2.append(out)
 
   return w2
@@ -12,7 +12,7 @@
 from .utils import get_caption
 
 from .daily import *
-from .gpt import GPT, get_keywords
+from .gpt import GPT, get_keywords_from_caption
 
 # ---- functions
 
@@ -43,7 +43,8 @@ def get_model(name = "EleutherAI/gpt-neo-2.7B", cache_dir = ".model-cache/"):
 
 # ---- class
 class Processor():
-  def __init__(self, hf_backbone="EleutherAI/gpt-neo-2.7B"):
+  def __init__(self, hf_backbone):
+    print("Loading GPT Processor (Meet Reduce) ...")
     here = folder(__file__)
     self.cap_folder = os.path.join(here, 'captions')
     all_cap_files = glob(f"{self.cap_folder}/*.srt")
@@ -89,12 +90,14 @@ def process(self, url, max_tries = 20):
     _file = Hashlib.md5(url)
     if not _file in self.all_cap_files:
       # get captions and return if there is some error
+      print("Getting captions ...")
       caption = get_caption(url, max_tries)
       if isinstance(caption, list):
         return f"[This]({url}) video has no captions"
       if caption is None:
         return f"Failed to fetch captions for [this]({url}) video."
       else:
+        print("Saving captions ...")
         # save and cache captions
         fp = f"{self.cap_folder}/{_file}.srt"
         with open(fp, "w") as f:
@@ -105,11 +108,13 @@ def process(self, url, max_tries = 20):
         caption = f.read()
 
     # parse caption string into caption blocks
+    print("Parse captions to blocks ...")
     captions = self.parse_captions(caption)
     heights = [] # word density plot
     for x in captions:
       heights.extend([len(x["content"].split()), ] * len(x["id"]))
 
-    keywords = get_keywords(captions, self.gpt)
+    print("Generating keywords ... (this will take time)")
+    keywords = get_keywords_from_caption(captions[:3], self.gpt)
     return keywords
 
@@ -1,6 +1,11 @@
-
+# define the util functions
 import time
+import sys
 from requests import HTTPError
+
+from .daily import folder
+sys.path.append(folder(folder(__file__)))
+
 from pytube import YouTube
 
 def get_caption(x, n, m = 0):
 
@@ -0,0 +1,19 @@
+# flake8: noqa: F401
+# noreorder
+"""
+Pytube: a very serious Python library for downloading YouTube Videos.
+"""
+__title__ = "pytube"
+__author__ = "Ronnie Ghose, Taylor Fox Dahlin, Nick Ficano"
+__license__ = "The Unlicense (Unlicense)"
+__js__ = None
+__js_url__ = None
+
+from pytube.version import __version__
+from pytube.streams import Stream
+from pytube.captions import Caption
+from pytube.query import CaptionQuery, StreamQuery
+from pytube.__main__ import YouTube
+from pytube.contrib.playlist import Playlist
+from pytube.contrib.channel import Channel
+from pytube.contrib.search import Search