Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.4.1
rev: v0.12.11
hooks:
# Run the linter.
- id: ruff
Expand Down
2 changes: 1 addition & 1 deletion src/summarize_gutenberg/__main__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from summarize_gutenberg.cli import app

app()
app()
9 changes: 5 additions & 4 deletions src/summarize_gutenberg/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from summarize_gutenberg.db import DB


@dataclass
class Book:
id: int = field(default=None)
Expand All @@ -15,11 +16,11 @@ class Book:
@classmethod
def from_dict(cls, d):
return Book(**d)

def to_dict(self):
return asdict(self)



class BooksDB:
def __init__(self, db_path):
self._db_path = db_path
Expand All @@ -37,7 +38,7 @@ def get_book(self, book_id: int) -> Book:
if db_item is not None:
return Book.from_dict(db_item)
# else:
# raise InvalidBookId(book_id)
# raise InvalidBookId(book_id)

def list_books(self):
"""Return a list of books."""
Expand All @@ -59,7 +60,7 @@ def delete_book(self, book_id: int) -> None:
"""Remove a book from db with given book_id."""
self._db.delete(book_id)
# except KeyError as exc:
# raise InvalidBookId(book_id) from exc
# raise InvalidBookId(book_id) from exc

def delete_all(self) -> None:
"""Remove all books from db."""
Expand All @@ -69,4 +70,4 @@ def close(self):
self._db.close()

def path(self):
return self._db_path
return self._db_path
39 changes: 22 additions & 17 deletions src/summarize_gutenberg/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import typer
from rich import print, box
from rich.prompt import Prompt, IntPrompt, Confirm
from rich.prompt import Prompt, IntPrompt
from rich.table import Table
from pathlib import Path
from contextlib import contextmanager
Expand All @@ -14,11 +14,13 @@
FILE_DIR = Path("./files/")
SUMMARY_DIR = FILE_DIR / "summaries"


def dir_check():
"""Make sure the directories for saving files exist"""
FILE_DIR.mkdir(parents=True, exist_ok=True)
SUMMARY_DIR.mkdir(parents=True, exist_ok=True)


dir_check()

app = typer.Typer()
Expand All @@ -33,7 +35,8 @@ def get_default_books():
for book in books:
with books_db() as db:
db.add_book(Book.from_dict(books[book]))



@app.command()
def default():
"""
Expand All @@ -47,28 +50,30 @@ def default():
get_default_books()

table = Table(box=box.SQUARE_DOUBLE_HEAD, border_style="magenta")
table.add_column('No.')
table.add_column('[bold cyan]Title', max_width=75, no_wrap=False)
table.add_column('[bold magenta]Author')
table.add_column('[bold yellow]Fulltext URL')
table.add_column("No.")
table.add_column("[bold cyan]Title", max_width=75, no_wrap=False)
table.add_column("[bold magenta]Author")
table.add_column("[bold yellow]Fulltext URL")

with books_db() as db:
books = db.list_books()
for order_num, book in enumerate(books, start=1):
table.add_row(f'{str(order_num)}.', book.title, book.author, f"[yellow]{book.url}")
table.add_row(f"{str(order_num)}.", book.title, book.author, f"[yellow]{book.url}")
order_num += 1
print('\n')
print("\n")
print(table)
print('\n')
print("\n")

max_choice = len(books)
choice = Prompt.ask("Select a book by number")
while not choice.isdigit() or int(choice) < 1 or int(choice) > max_choice:
choice = Prompt.ask("[red]Please choose a number between 1 and 32")

selected_book = books[int(choice) - 1]

print(f"\nYou have chosen [bold cyan]{selected_book.title}[/bold cyan] by [bold magenta]{selected_book.author}[/bold magenta].")

print(
f"\nYou have chosen [bold cyan]{selected_book.title}[/bold cyan] by [bold magenta]{selected_book.author}[/bold magenta]."
)
filepath = FILE_DIR / Path(selected_book.filename)

if filepath.exists():
Expand All @@ -78,23 +83,24 @@ def default():
write_text_to_file(selected_book.url, filepath)
print(f"\nText of {selected_book.title} saved to {filepath}.")

choice = Prompt.ask("\nDo you want to [P]rint or [S]ave your summary?", choices=['p', 's'])
choice = Prompt.ask("\nDo you want to [P]rint or [S]ave your summary?", choices=["p", "s"])
chunks = IntPrompt.ask("How many lines per chunk?", default=400)

# if chunks < 50:
# print("[red bold]Warning[/red bold]: choosing a low value could take a lot of time and resources.")
# confirmation = Confirm.ask("Are you sure?")
if choice == 'p':

if choice == "p":
print_summary(filepath, chunks)
else:
target_filepath = SUMMARY_DIR / Path(selected_book.filename)
save_summary(filepath, target_filepath, chunks)
print(f'\nSummary saved to {target_filepath}.')
print(f"\nSummary saved to {target_filepath}.")

with books_db() as db:
db.delete_all()


def get_path():
db_path_env = os.getenv("BOOKS_DB_DIR", "")
if db_path_env:
Expand All @@ -103,6 +109,7 @@ def get_path():
db_path = Path(__file__).parent / "books_db"
return db_path


@contextmanager
def books_db():
db_path = get_path()
Expand All @@ -111,5 +118,3 @@ def books_db():
yield db
finally:
db.close()


4 changes: 1 addition & 3 deletions src/summarize_gutenberg/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@

class DB:
def __init__(self, db_path, db_file_prefix):
self._db = tinydb.TinyDB(
db_path / f"{db_file_prefix}.json", create_dirs=True
)
self._db = tinydb.TinyDB(db_path / f"{db_file_prefix}.json", create_dirs=True)

def create(self, item: dict) -> int:
id = self._db.insert(item)
Expand Down
19 changes: 11 additions & 8 deletions src/summarize_gutenberg/get_books.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,14 @@ def url_check(formats):

return url


def create_filename(title):
"""
Create a filename for the book from a shortened version of its title.
"""
res_list = []
colons = {':', ';'}
puncts = {',', ' ', '.', '—', '-', "'", '"'}
colons = {":", ";"}
puncts = {",", " ", ".", "—", "-", "'", '"'}

for char in title:
if char in colons:
Expand All @@ -90,10 +91,11 @@ def create_filename(title):
continue
else:
res_list.append(char.lower())

res_list.append('.txt')

return ''.join(res_list)
res_list.append(".txt")

return "".join(res_list)


def fetch_default_books():
"""
Expand Down Expand Up @@ -123,7 +125,7 @@ def fetch_default_books():


def process_books(books):
"""
"""
Create a dictionary of fetched books where the key is a sequential number and the value is a dictionary of book info.
"""
book_data = {}
Expand All @@ -148,9 +150,10 @@ def process_books(books):

return book_data


if __name__ == "__main__":
books = process_books(fetch_default_books())
books = process_books(fetch_default_books())

book_list = [book for book in books]
for book in book_list:
print(books[book]['filename'])
print(books[book]["filename"])
6 changes: 4 additions & 2 deletions src/summarize_gutenberg/get_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def is_valid_utf8(byte_sequence):
except UnicodeDecodeError:
return False


def strip_headers(text):
lines = text.splitlines()
sep = os.linesep
Expand Down Expand Up @@ -159,6 +160,7 @@ def strip_headers(text):

return str(sep.join(out), encoding="utf-8")


def write_text_to_file(url, file_path):
text_request = requests.get(url, stream=True)

Expand All @@ -168,7 +170,7 @@ def write_text_to_file(url, file_path):
text_content = text_request.content
cleaned_text = strip_headers(text_content)

with open(file_path, "w", encoding='utf-8') as file:
with open(file_path, "w", encoding="utf-8") as file:
file.write(cleaned_text)

return file_path
return file_path
31 changes: 16 additions & 15 deletions src/summarize_gutenberg/make_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,20 @@
tokenizer = AutoTokenizer.from_pretrained("pszemraj/pegasus-x-large-book-summary")
model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/pegasus-x-large-book-summary")


def read_in_chunks(filepath, chunk_size=800):
with open(filepath, 'r', encoding='utf-8') as file:
current_chunk = []
for line in file:
cleaned_line = line.strip()
current_chunk.append(cleaned_line)
if len(current_chunk) == chunk_size:
yield current_chunk
current_chunk = []

if current_chunk:
with open(filepath, "r", encoding="utf-8") as file:
current_chunk = []
for line in file:
cleaned_line = line.strip()
current_chunk.append(cleaned_line)
if len(current_chunk) == chunk_size:
yield current_chunk
current_chunk = []

if current_chunk:
yield current_chunk


def make_summary(chunk):
inputs = tokenizer.encode(chunk, return_tensors="pt", truncation=True)
Expand All @@ -25,13 +27,12 @@ def make_summary(chunk):
return summary



def save_summary(source, target, chunk_size):
with open(target,'w') as target:
with open(target, "w") as target:
for chunk in read_in_chunks(source, chunk_size=chunk_size):
target.write(make_summary(' '.join(chunk)))
target.write(make_summary(" ".join(chunk)))


def print_summary(source, chunk_size):
for chunk in read_in_chunks(source, chunk_size=chunk_size):
print(make_summary(' '.join(chunk)))

print(make_summary(" ".join(chunk)))
3 changes: 2 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from summarize_gutenberg.api import Book


@pytest.fixture()
def book_fixture():
"""
Expand All @@ -19,4 +20,4 @@ def book_fixture():
filename="yesterdaystomorrows.txt",
)

return book
return book
32 changes: 18 additions & 14 deletions tests/test_author_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,24 @@
from summarize_gutenberg.get_books import author_parse

authors = [
('Aristotle', 'Aristotle'), # single name
('Austen, Jane', 'Jane Austen'), # first & last
('Stevenson, Robert Louis', 'Robert Louis Stevenson'), # first last middle
('Chesterton, G. K. (Gilbert Keith)', 'G. K. Chesterton'), # parenthetical
('H. D. (Hilda Doolittle)', 'H. D.'), # irregular parenthetical
('Tolkien, J. R. R. (John Ronald Reuel)', 'J. R. R. Tolkien'), # parenthetical with three initials
('Von Arnim, Elizabeth', 'Elizabeth Von Arnim'), # von
('Sanchez, Nellie Van de Grift', 'Nellie Van de Grift Sanchez'), # van
('Martinez de la Torre, Rafael', 'Rafael Martinez de la Torre'), # de la
('Cervantes Saavedra, Miguel de', 'Miguel de Cervantes Saavedra'), # de
('Alger, Horatio, Jr.', 'Horatio Alger Jr.'), # jr
(None, '') # none
("Aristotle", "Aristotle"), # single name
("Austen, Jane", "Jane Austen"), # first & last
("Stevenson, Robert Louis", "Robert Louis Stevenson"), # first last middle
("Chesterton, G. K. (Gilbert Keith)", "G. K. Chesterton"), # parenthetical
("H. D. (Hilda Doolittle)", "H. D."), # irregular parenthetical
(
"Tolkien, J. R. R. (John Ronald Reuel)",
"J. R. R. Tolkien",
), # parenthetical with three initials
("Von Arnim, Elizabeth", "Elizabeth Von Arnim"), # von
("Sanchez, Nellie Van de Grift", "Nellie Van de Grift Sanchez"), # van
("Martinez de la Torre, Rafael", "Rafael Martinez de la Torre"), # de la
("Cervantes Saavedra, Miguel de", "Miguel de Cervantes Saavedra"), # de
("Alger, Horatio, Jr.", "Horatio Alger Jr."), # jr
(None, ""), # none
]

@pytest.mark.parametrize('input, expected', authors)

@pytest.mark.parametrize("input, expected", authors)
def test_author_parse(input, expected):
assert author_parse(input) == expected, f'Expected {expected}, but got {author_parse(input)}'
assert author_parse(input) == expected, f"Expected {expected}, but got {author_parse(input)}"
Loading