|
| 1 | +""" |
| 2 | +Advanced Extractive Text Summarization Model |
| 3 | +Issue #100 for king04aman/All-In-One-Python-Projects |
| 4 | +""" |
| 5 | +import nltk |
| 6 | +import spacy |
| 7 | +from sklearn.feature_extraction.text import TfidfVectorizer |
| 8 | +from sklearn.cluster import KMeans |
| 9 | +import numpy as np |
| 10 | + |
# Tokenizer data for nltk.sent_tokenize. NLTK >= 3.8.2 loads the 'punkt_tab'
# resource instead of the pickled 'punkt' models, so both are fetched to keep
# sentence splitting working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
# Small English spaCy pipeline; its named-entity output is used as a
# sentence feature in score_sentences.
nlp = spacy.load('en_core_web_sm')
| 13 | + |
def extract_sentences(text):
    """Split *text* into a list of sentence strings with NLTK's tokenizer."""
    sentence_list = nltk.sent_tokenize(text)
    return sentence_list
| 16 | + |
def score_sentences(sentences):
    """Build a per-sentence feature matrix.

    Args:
        sentences: Sequence of sentence strings.

    Returns:
        A NumPy array with one row per sentence and four columns:
        summed TF-IDF weight, character length, relative position
        (``index / len(sentences)``), and the number of spaCy named
        entities. An empty input yields an empty ``(0, 4)`` array.
    """
    total = len(sentences)
    if total == 0:
        # TfidfVectorizer raises on an empty corpus; there is nothing to score.
        return np.empty((0, 4))
    tfidf = TfidfVectorizer().fit_transform(sentences)
    # np.asarray(...).ravel() flattens the row sums whether the sparse sum
    # comes back as an np.matrix or as a plain ndarray.
    scores = np.asarray(tfidf.sum(axis=1)).ravel()
    # nlp.pipe processes the sentences as a batch and yields docs in order,
    # avoiding one full pipeline call per sentence.
    docs = nlp.pipe(sentences)
    features = [
        [scores[i], len(sent), i / total, len(doc.ents)]
        for i, (sent, doc) in enumerate(zip(sentences, docs))
    ]
    return np.array(features)
| 28 | + |
def cluster_sentences(features, n_clusters=3):
    """Assign each feature row to a K-Means cluster.

    Args:
        features: 2-D array-like with one row per sentence.
        n_clusters: Requested number of clusters. It is capped at the number
            of rows, because KMeans rejects fewer samples than clusters.

    Returns:
        Array of integer cluster labels, one per row of *features*. An empty
        *features* yields an empty label array.
    """
    n_samples = len(features)
    if n_samples == 0:
        # Nothing to cluster; KMeans would raise on an empty input.
        return np.array([], dtype=int)
    effective_k = min(n_clusters, n_samples)
    # Fixed random_state keeps the clustering reproducible between runs.
    kmeans = KMeans(n_clusters=effective_k, random_state=42)
    return kmeans.fit_predict(features)
| 33 | + |
def summarize(text, n_clusters=3):
    """Return an extractive summary of *text*.

    The text is split into sentences, each sentence is turned into a feature
    row, the rows are clustered, and from every non-empty cluster the sentence
    with the highest summed TF-IDF weight (feature column 0) is kept.

    Args:
        text: The document to summarize.
        n_clusters: Maximum number of summary sentences; capped at the number
            of sentences found in *text*.

    Returns:
        The selected sentences joined by newlines, ordered by cluster index.
        Empty or whitespace-only input yields an empty string.
    """
    if not text or not text.strip():
        return ""
    sentences = extract_sentences(text)
    if not sentences:
        return ""
    # KMeans needs at least as many samples as clusters, so short texts
    # must not request more clusters than they have sentences.
    n_clusters = min(n_clusters, len(sentences))
    features = score_sentences(sentences)
    labels = cluster_sentences(features, n_clusters)
    summary = []
    for cluster in range(n_clusters):
        members = np.flatnonzero(labels == cluster)
        if members.size:
            best = members[np.argmax(features[members, 0])]
            summary.append(sentences[best])
    return "\n".join(summary)
| 45 | + |
if __name__ == "__main__":
    # Demo: summarize a short built-in paragraph and print the result.
    sample_text = """
    Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through language. NLP techniques are used to analyze text, extract information, and generate summaries. Extractive summarization selects key sentences from the original text to create a concise summary. Advanced models use features like TF-IDF, sentence length, position, and named entities to score sentences. Clustering helps group related sentences and highlight critical points from different themes. This approach is useful for summarizing reports, research papers, and news articles.
    """
    demo_summary = summarize(sample_text)
    print("Summary:\n", demo_summary)
0 commit comments