Skip to content

Commit 2e3dc06

Browse files
committed
chunker
1 parent 33b7343 commit 2e3dc06

File tree

7 files changed

+92
-16
lines changed

7 files changed

+92
-16
lines changed

Backend/NotesChunker.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,22 @@
1-
from fastapi import APIRouter
21
import tensorflow_hub as hub
3-
import tensorflow_text
42
from sklearn.cluster import KMeans
53
import numpy as np
6-
7-
8-
9-
app = APIRouter()
10-
11-
# Load the USE model
12-
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
4+
from tqdm import tqdm
135

146
# Preprocessing functions
def preprocess_text(text):
    """Split raw input on newlines; each line is treated as one sentence."""
    return text.split('\n')
1810

19-
# API route for extracting topic-wise chunks
20-
@app.post("/extract_chunks")
21-
def extract_chunks(text: str):
11+
def extract_chunks(text):
2212
# Preprocess the input text
2313
sentences = preprocess_text(text)
2414

15+
# Show progress bar while loading the model
16+
with tqdm(total=1, desc="Loading model") as pbar:
17+
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
18+
pbar.update(1)
19+
2520
# Generate sentence embeddings
2621
sentence_embeddings = embed(sentences)
2722

@@ -47,4 +42,9 @@ def extract_chunks(text: str):
4742
chunk_sentences = [sentences[i] for i in range(len(sentences)) if kmeans.labels_[i] == cluster_index]
4843
chunks.append({"topic": f"Topic {cluster_index+1}", "subsections": chunk_sentences})
4944

50-
return chunks
45+
return chunks
46+
47+
# Example usage
48+
text = "This is an example text. It contains multiple sentences.\nEach sentence represents a subsection."
49+
result = extract_chunks(text)
50+
print(result)
3 Bytes
Binary file not shown.

Backend/documentai.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import nltk
2+
from sklearn.feature_extraction.text import TfidfVectorizer
3+
from sklearn.decomposition import TruncatedSVD
4+
5+
# Step 1: Preprocessing
def preprocess_notes(notes):
    """Normalize each note for vectorization.

    Args:
        notes: iterable of note strings.

    Returns:
        A new list with each note lowercased.

    NOTE(review): only lowercasing is implemented; the punctuation removal
    mentioned in the original comment was never added — confirm whether it
    is still wanted before relying on it.
    """
    return [note.lower() for note in notes]
14+
15+
# Step 2: Document-Term Matrix
def create_document_term_matrix(preprocessed_notes):
    """Build and return a TF-IDF document-term matrix for the notes."""
    # The fitted vectorizer itself is not kept; only the matrix is returned.
    return TfidfVectorizer().fit_transform(preprocessed_notes)
20+
21+
# Step 3: Apply LSA
def apply_lsa(X, number_of_topics):
    """Project the document-term matrix onto `number_of_topics` LSA components."""
    svd = TruncatedSVD(n_components=number_of_topics)
    return svd.fit_transform(X)
26+
27+
# Step 4: Topic Extraction
def extract_topics(lsa_representation, notes):
    """Group notes by their dominant LSA topic.

    Args:
        lsa_representation: 2-D array-like where row i holds the topic
            weights for notes[i] (anything whose rows support .argmax()).
        notes: sequence of note strings, parallel to the rows above.

    Returns:
        Dict mapping topic index -> list of notes assigned to that topic.
    """
    topic_wise_notes = {}
    for i, note in enumerate(notes):
        # Dominant topic = index of the largest weight in this note's row.
        topic = lsa_representation[i].argmax()
        # setdefault replaces the membership-test-then-insert pattern.
        topic_wise_notes.setdefault(topic, []).append(note)
    return topic_wise_notes
36+
37+
# Main code
def main():
    """Run the full LSA topic-extraction pipeline on a sample list of notes."""
    # Input: List of notes
    your_notes_list = [
        "Note 1",
        "Note 2",
        "Note 3",
        # Add more notes as needed
    ]

    # Set the number of topics for LSA
    number_of_topics = 3

    # Step 1: Preprocessing
    preprocessed_notes = preprocess_notes(your_notes_list)

    # Step 2: Document-Term Matrix
    X = create_document_term_matrix(preprocessed_notes)

    # Step 3: Apply LSA
    lsa_representation = apply_lsa(X, number_of_topics)

    # Step 4: Topic Extraction
    topic_wise_notes = extract_topics(lsa_representation, your_notes_list)

    # Print the topic-wise notes
    for topic, notes in topic_wise_notes.items():
        print(f"Topic {topic}:")
        for note in notes:
            print(note)
        print()


# Guard the entry point so importing this module no longer runs the
# pipeline as a side effect (the original called main() unconditionally).
if __name__ == "__main__":
    main()

__pycache__/app.cpython-310.pyc

-66 Bytes
Binary file not shown.

app.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77

88
#from Backend.Notes_Analyser import router as api4_router
99
#from Backend.Narrator import router as api5_router
10-
from Backend.NotesChunker import app as chunker
11-
from Backend.NotesToText import router as notestotxt
10+
from Backend.NotesChunker import router as chunker
11+
#from Backend.NotesToText import router as notestotxt
1212

1313
# import other API routers as needed
1414

@@ -33,7 +33,7 @@
3333
#app.include_router(sorter)
3434
#app.include_router(api4_router)
3535
app.include_router(chunker)
36-
app.include_router(notestotxt)
36+
#app.include_router(notestotxt)
3737

3838
# include other API routers as needed
3939

model

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
<!doctype html><html><head><title>TensorFlow Hub</title><meta name="google-site-verification" content="qqy8ICUGzfuvtOqpeIPRRHh105pRztoKQMm4wgowXGg"/><link rel="icon" href="//www.gstatic.com/aihub/tfhub_logo_3.png"><link rel="stylesheet" href="https://www.gstatic.com/_/tfhubdev/_/ss/k=tfhubdev.h.oe6hlFNL_vk.L.X.O/d=0/rs=AJFuRJ-fCeC1XBTBUgsFAruZh0T1mqQs2w"><!-- Integrate Glue's Carousel --><link href="//www.gstatic.com/glue/v21_0/glue.min.css" rel="stylesheet"><script src="//www.gstatic.com/glue/v21_0/glue-detect.min.js"></script><base href="/"></head><body><app-root>Loading...</app-root><script id="base-js" src="https://www.gstatic.com/_/tfhubdev/_/js/k=tfhubdev.h.en_US.bgI9DrEVhiQ.O/d=1/rs=AJFuRJ_J_qAVaLGqSi5q1oSOLnDm1Ze6ug/m=b" async></script><script>
2+
window.ga=window.ga||function(){(ga.q=ga.q||[]).push(arguments)};ga.l=+new Date;
3+
ga('create', 'UA-121310548-2', 'auto'); window.analyticsId = 'UA-121310548-2';</script><script async src="https://www.google-analytics.com/analytics.js"></script><!-- Integrate Glue's Carousel --><script src="https://www.gstatic.com/external_hosted/hammerjs/v2_0_2/hammer.min.js"></script><script src="//www.gstatic.com/glue/v21_0/glue-vanilla.min.js"></script></body></html>

model.pb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
<!doctype html><html><head><title>TensorFlow Hub</title><meta name="google-site-verification" content="qqy8ICUGzfuvtOqpeIPRRHh105pRztoKQMm4wgowXGg"/><link rel="icon" href="//www.gstatic.com/aihub/tfhub_logo_3.png"><link rel="stylesheet" href="https://www.gstatic.com/_/tfhubdev/_/ss/k=tfhubdev.h.oe6hlFNL_vk.L.X.O/d=0/rs=AJFuRJ-fCeC1XBTBUgsFAruZh0T1mqQs2w"><!-- Integrate Glue's Carousel --><link href="//www.gstatic.com/glue/v21_0/glue.min.css" rel="stylesheet"><script src="//www.gstatic.com/glue/v21_0/glue-detect.min.js"></script><base href="/"></head><body><app-root>Loading...</app-root><script id="base-js" src="https://www.gstatic.com/_/tfhubdev/_/js/k=tfhubdev.h.en_US.bgI9DrEVhiQ.O/d=1/rs=AJFuRJ_J_qAVaLGqSi5q1oSOLnDm1Ze6ug/m=b" async></script><script>
2+
window.ga=window.ga||function(){(ga.q=ga.q||[]).push(arguments)};ga.l=+new Date;
3+
ga('create', 'UA-121310548-2', 'auto'); window.analyticsId = 'UA-121310548-2';</script><script async src="https://www.google-analytics.com/analytics.js"></script><!-- Integrate Glue's Carousel --><script src="https://www.gstatic.com/external_hosted/hammerjs/v2_0_2/hammer.min.js"></script><script src="//www.gstatic.com/glue/v21_0/glue-vanilla.min.js"></script></body></html>

0 commit comments

Comments
 (0)