Skip to content

Commit 9b1c457

Browse files
committed
fix bugs + add pytube code right in the repo (all credits to pytube team)
1 parent e7c8495 commit 9b1c457

28 files changed

+5641
-22
lines changed

meet_reduce/gpt.py

Lines changed: 30 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -248,15 +248,24 @@ def get_keywords(text, gpt, r = 4):
248248
249249
###
250250
251-
Text: "Black-on-black ware is a 20th- and 21st-century pottery tradition developed by the Puebloan Native American ceramic artists in Northern New Mexico. Traditional reduction-fired blackware has been made for centuries by pueblo artists. Black-on-black ware of the past century is produced with a smooth surface, with the designs applied through selective burnishing or the application of refractory slip. Another style involves carving or incising designs and selectively polishing the raised areas. For generations several families from Kha'po Owingeh and P'ohwhóge Owingeh pueblos have been making black-on-black ware with the techniques passed down from matriarch potters. Artists from other pueblos have also produced black-on-black ware. Several contemporary artists have created works honoring the pottery of their ancestors."
251+
Text: "Black-on-black ware is a 20th- and 21st-century pottery tradition developed by the
252+
Puebloan Native American ceramic artists in Northern New Mexico. Traditional reduction-fired
253+
blackware has been made for centuries by pueblo artists. Black-on-black ware of the past century
254+
is produced with a smooth surface, with the designs applied through selective burnishing or the
255+
application of refractory slip. Another style involves carving or incising designs and
256+
selectively polishing the raised areas. For generations several families from Kha'po
257+
Owingeh and P'ohwhóge Owingeh pueblos have been making black-on-black ware with the
258+
techniques passed down from matriarch potters. Artists from other pueblos have also produced
259+
black-on-black ware. Several contemporary artists have created works honoring the pottery of
260+
their ancestors."
252261
253262
Keywords: Pueblo, art, pottery, black, black ware
254263
255264
###
256265
257266
Text: "{text}"
258267
259-
keywords:"""
268+
Keywords:"""
260269

261270
p = prompt.format(text = text)
262271
words = set()
@@ -269,16 +278,21 @@ def get_keywords(text, gpt, r = 4):
269278
)
270279

271280
for s in out:
272-
ws = s.split("###")[2].split("\nkeywords:")[-1].split(",")
281+
ws = s.split("###")[2].split("\nKeywords:")[-1].split(",")
273282
for w in ws:
274283
w = w.strip().lower()
275284
# print("--->", w, w in ["pueblo", "art", "pottery", "black", "black ware"])
276285
if w and len(w.split()) < 4 and w not in ["pueblo", "art", "pottery", "black", "black ware"]:
277286
words.add(w)
287+
288+
for _w in w.split()[:2]:
289+
words.add(_w)
290+
278291
return list(words)
279292

280293

281294
def clean_keywords(keywords, gpt):
295+
print(keywords)
282296
if isinstance(keywords[0], list):
283297
return [clean_keywords[x] for x in keywords]
284298

@@ -325,7 +339,7 @@ def clean_keywords(keywords, gpt):
325339

326340
# ----- main functions
327341

328-
def get_keywords(captions: list, gpt: GPT):
342+
def get_keywords_from_caption(captions: list, gpt: GPT):
329343
"""
330344
Args:
331345
captions (list): captions is the list with output of function `Processor.parse_captions()`
@@ -347,19 +361,25 @@ def get_keywords(captions: list, gpt: GPT):
347361
# step 1: format the sentence
348362
format_buff_size = 100
349363
fsent = []
350-
for i in range(0, len(capstr), format_buff_size):
364+
pbar = trange(0, len(capstr), format_buff_size)
365+
for i in pbar:
351366
r = format_sentence(" ".join(capstr.split()[i:i+format_buff_size]) + ".", gpt)
352367
fsent.append(r)
368+
pbar.set_description(f"Cleaned {i+1} sentences!")
353369

354370
# step 2: get keywords
355371
words = []
356-
for _, o in zip(trange(len(fsent)), fsent):
357-
words.append(get_keywords(o, gpt))
358-
372+
pbar = trange(len(fsent))
373+
for _, o in zip(pbar, fsent):
374+
out = get_keywords(o, gpt)
375+
words.append(out)
376+
pbar.set_description(f"Got {sum([len(x) for x in words])} keywords from {len(words)} sentences")
377+
359378
# step 3: clean the key-words
360379
w2 = []
361-
for w in words:
362-
out = clean_keywords(w, gpt)
380+
pbar = trange(len(words))
381+
for _, w in zip(pbar, words):
382+
out = clean_keywords(", ".join(w), gpt)
363383
w2.append(out)
364384

365385
return w2

meet_reduce/summarizer.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
from .utils import get_caption
1313

1414
from .daily import *
15-
from .gpt import GPT, get_keywords
15+
from .gpt import GPT, get_keywords_from_caption
1616

1717
# ---- functions
1818

@@ -43,7 +43,8 @@ def get_model(name = "EleutherAI/gpt-neo-2.7B", cache_dir = ".model-cache/"):
4343

4444
# ---- class
4545
class Processor():
46-
def __init__(self, hf_backbone="EleutherAI/gpt-neo-2.7B"):
46+
def __init__(self, hf_backbone):
47+
print("Loading GPT Processor (Meet Reduce) ...")
4748
here = folder(__file__)
4849
self.cap_folder = os.path.join(here, 'captions')
4950
all_cap_files = glob(f"{self.cap_folder}/*.srt")
@@ -89,12 +90,14 @@ def process(self, url, max_tries = 20):
8990
_file = Hashlib.md5(url)
9091
if not _file in self.all_cap_files:
9192
# get captions and return if there is some error
93+
print("Getting captions ...")
9294
caption = get_caption(url, max_tries)
9395
if isinstance(caption, list):
9496
return f"[This]({url}) video has no captions"
9597
if caption is None:
9698
return f"Failed to fetch captions for [this]({url}) video."
9799
else:
100+
print("Saving captions ...")
98101
# save and cache captions
99102
fp = f"{self.cap_folder}/{_file}.srt"
100103
with open(fp, "w") as f:
@@ -105,11 +108,13 @@ def process(self, url, max_tries = 20):
105108
caption = f.read()
106109

107110
# parse caption string into caption blocks
111+
print("Parse captions to blocks ...")
108112
captions = self.parse_captions(caption)
109113
heights = [] # word density plot
110114
for x in captions:
111115
heights.extend([len(x["content"].split()), ] * len(x["id"]))
112116

113-
keywords = get_keywords(captions, self.gpt)
117+
print("Generating keywords ... (this will take time)")
118+
keywords = get_keywords_from_caption(captions[:3], self.gpt)
114119
return keywords
115120

meet_reduce/utils.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,11 @@
1-
1+
# define the util functions
22
import time
3+
import sys
34
from requests import HTTPError
5+
6+
from .daily import folder
7+
sys.path.append(folder(folder(__file__)))
8+
49
from pytube import YouTube
510

611
def get_caption(x, n, m = 0):

pytube/__init__.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,19 @@
1+
# flake8: noqa: F401
2+
# noreorder
3+
"""
4+
Pytube: a very serious Python library for downloading YouTube Videos.
5+
"""
6+
__title__ = "pytube"
7+
__author__ = "Ronnie Ghose, Taylor Fox Dahlin, Nick Ficano"
8+
__license__ = "The Unlicense (Unlicense)"
9+
__js__ = None
10+
__js_url__ = None
11+
12+
from pytube.version import __version__
13+
from pytube.streams import Stream
14+
from pytube.captions import Caption
15+
from pytube.query import CaptionQuery, StreamQuery
16+
from pytube.__main__ import YouTube
17+
from pytube.contrib.playlist import Playlist
18+
from pytube.contrib.channel import Channel
19+
from pytube.contrib.search import Search

0 commit comments

Comments
 (0)