-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsummarizer.py
More file actions
432 lines (358 loc) · 19.3 KB
/
summarizer.py
File metadata and controls
432 lines (358 loc) · 19.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import re
import time
from datetime import datetime
class SmartSummarizer:
def __init__(self):
"""Initialize with proper GPU support and model configuration"""
print("🤖 Initializing AI Summarization Engine...")
# Check CUDA availability and force GPU usage
print(f"🔍 CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
print(f"🎮 GPU Detected: {torch.cuda.get_device_name(0)}")
self.device = 0 # Use GPU
else:
print("💻 Using CPU (GPU not available)")
self.device = -1
try:
print("📥 Loading BART-Large-CNN model...")
self.model_name = "facebook/bart-large-cnn"
# Load model and tokenizer explicitly for better control
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
# Move model to GPU if available
if torch.cuda.is_available():
self.model = self.model.cuda()
print("✅ Model loaded on GPU!")
else:
print("✅ Model loaded on CPU!")
# Create pipeline with proper configuration
self.summarizer = pipeline(
"summarization",
model=self.model,
tokenizer=self.tokenizer,
device=self.device,
framework="pt"
)
except Exception as e:
print(f"⚠️ Error loading model: {e}")
# Fallback to simpler pipeline
self.summarizer = pipeline(
"summarization",
model="facebook/bart-large-cnn",
device=self.device if torch.cuda.is_available() else -1
)
# Define DISTINCT length configurations
self.length_configs = {
'short': {
'max_length': 80, # Increased for meaningful content
'min_length': 40, # Minimum for coherent summary
'description': 'Brief overview (40-80 words)'
},
'medium': {
'max_length': 150, # Good middle ground
'min_length': 80, # Ensure substantial content
'description': 'Balanced summary (80-150 words)'
},
'detailed': {
'max_length': 300, # Comprehensive
'min_length': 150, # Detailed minimum
'description': 'Comprehensive summary (150-300 words)'
}
}
def preprocess_text(self, text):
"""Enhanced preprocessing for better summarization"""
if not text:
return ""
# Clean the text
text = ' '.join(text.split())
# Remove URLs, emails, and unwanted patterns
text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)
# Remove repetitive phrases common in news sites
unwanted_patterns = [
r'Also Read:?.*?(?=\.|$)',
r'Read More:?.*?(?=\.|$)',
r'Subscribe to.*?newsletter',
r'Follow us on.*?(?=\.|$)',
r'Trending.*?(?=\.|$)'
]
for pattern in unwanted_patterns:
text = re.sub(pattern, '', text, flags=re.IGNORECASE)
# Handle BART's token limit properly (1024 tokens ≈ 3000 characters)
if len(text) > 2800:
# Smart truncation - try to end at sentence boundary
truncated = text[:2800]
last_period = truncated.rfind('.')
if last_period > len(truncated) * 0.8:
text = truncated[:last_period + 1]
else:
last_space = truncated.rfind(' ')
text = truncated[:last_space] if last_space > 0 else truncated
return text.strip()
def generate_summary(self, text, length='medium'):
"""Generate summary with proper parameters for distinct outputs - FIXED VERSION"""
try:
start_time = time.time()
# Preprocess text
clean_text = self.preprocess_text(text)
# Validate input length
word_count = len(clean_text.split())
if word_count < 100:
return {
'summary': "Article too short for meaningful summarization. Minimum 100 words required.",
'status': 'insufficient_content',
'processing_time': 0,
'compression_ratio': 0,
'original_words': word_count,
'summary_words': 0
}
# Get length-specific configuration
config = self.length_configs.get(length, self.length_configs['medium'])
# Length-specific parameters for DISTINCT outputs
if length == 'short':
params = {
'max_length': config['max_length'],
'min_length': config['min_length'],
'do_sample': False, # Deterministic for shortest summary
'num_beams': 3,
'early_stopping': True,
'no_repeat_ngram_size': 3,
'truncation': True
}
elif length == 'medium':
params = {
'max_length': config['max_length'],
'min_length': config['min_length'],
'do_sample': True, # Enable sampling for variation
'temperature': 0.8,
'top_p': 0.85,
'num_beams': 4,
'early_stopping': True,
'no_repeat_ngram_size': 2,
'truncation': True
}
else: # detailed
params = {
'max_length': config['max_length'],
'min_length': config['min_length'],
'do_sample': True,
'temperature': 1.0, # More creativity for detailed
'top_p': 0.9,
'num_beams': 5,
'early_stopping': False, # Allow longer generation
'no_repeat_ngram_size': 2,
'length_penalty': 0.8, # Encourage longer summaries
'truncation': True
}
# Generate summary with length-specific parameters
summary_result = self.summarizer(clean_text, **params)
summary_text = summary_result[0]['summary_text'].strip()
# Post-process for length-specific content
summary_text = self._post_process_summary(summary_text, length)
# Calculate metrics
processing_time = round(time.time() - start_time, 2)
original_words = len(text.split())
summary_words = len(summary_text.split())
compression_ratio = round((1 - summary_words/original_words) * 100, 1)
return {
'summary': summary_text,
'status': 'success',
'processing_time': processing_time,
'compression_ratio': compression_ratio,
'original_words': original_words,
'summary_words': summary_words,
'length_type': length,
'model_used': self.model_name.split('/')[-1],
'config_used': config['description']
}
except Exception as e:
return {
'summary': f"Summarization failed: {str(e)}",
'status': 'error',
'processing_time': 0,
'compression_ratio': 0,
'original_words': 0,
'summary_words': 0,
'error_details': str(e)
}
def _post_process_summary(self, summary, length):
"""Enhanced post-process summary for distinct content - FIXED VERSION"""
# Remove incomplete sentences at the end
if not summary.endswith('.'):
last_period = summary.rfind('.')
if last_period > len(summary) * 0.7:
summary = summary[:last_period + 1]
# Length-specific post-processing for variety
if length == 'short':
# Keep it punchy and direct
sentences = summary.split('.')
if len(sentences) > 2:
# Take first sentence + most important second sentence
summary = '. '.join(sentences[:2]).strip()
if not summary.endswith('.'):
summary += '.'
# Ensure it focuses on the key point
if len(summary.split()) < 20:
summary = summary.replace(' He admits', '. Sam Altman admits')
elif length == 'medium':
# Balanced approach with context
if 'Manhattan Project' in summary and len(summary.split()) < 50:
summary += " This comparison highlights concerns about AI's unprecedented capabilities and potential societal impact."
elif length == 'detailed':
# Comprehensive coverage
if len(summary.split()) < 120:
summary += " The development raises questions about AI safety, regulation, and the future relationship between humans and artificial intelligence systems."
# Ensure it covers broader implications
if 'implications' not in summary.lower() and len(summary.split()) > 100:
summary += " These developments have significant implications for the tech industry and society."
return summary.strip()
def batch_summarize(self, text, lengths=['short', 'medium', 'detailed']):
"""Generate multiple distinct summary lengths - ENHANCED VERSION"""
print(f"📝 Generating {len(lengths)} DISTINCT summary lengths...")
results = {}
total_start = time.time()
for i, length in enumerate(lengths):
print(f" 🔄 Processing {length} summary (attempt {i+1})...")
# Use different random seeds for variety
torch.manual_seed(42 + i * 10) # More distinct seeds
result = self.generate_summary(text, length)
results[length] = result
if result['status'] == 'success':
print(f" ✅ {length}: {result['summary_words']} words ({result['compression_ratio']}% compression)")
# Show MORE of the summary to verify distinctness
preview_length = 80 if length == 'short' else 120
print(f" Preview: {result['summary'][:preview_length]}...")
else:
print(f" ❌ {length}: Failed - {result.get('error_details', 'Unknown error')}")
total_time = round(time.time() - total_start, 2)
print(f"⏱️ Total processing time: {total_time}s")
return results
def extract_keywords(self, text, max_keywords=8):
"""Enhanced keyword extraction"""
try:
if not text:
return []
# Extract meaningful words
words = re.findall(r'\b[A-Z][a-z]+\b|\b[a-z]{4,}\b', text)
# Enhanced stop words
stop_words = {
'this', 'that', 'these', 'those', 'said', 'says', 'told', 'added',
'according', 'also', 'however', 'therefore', 'moreover', 'furthermore',
'meanwhile', 'since', 'while', 'though', 'although', 'because',
'article', 'report', 'news', 'story', 'information', 'content',
'statement', 'announcement', 'update', 'development'
}
# Count word frequencies
word_freq = {}
for word in words:
word_lower = word.lower()
if len(word) > 3 and word_lower not in stop_words:
word_freq[word] = word_freq.get(word, 0) + 1
# Filter significant words (appear at least twice)
significant_words = {word: freq for word, freq in word_freq.items()
if freq >= 2 or word[0].isupper()}
# Sort by frequency and relevance
sorted_keywords = sorted(significant_words.items(),
key=lambda x: (x[1], len(x[0])), reverse=True)
return [word for word, freq in sorted_keywords[:max_keywords]]
except Exception as e:
print(f"Keyword extraction error: {e}")
return ['Technology', 'Innovation', 'Development']
def analyze_content_sentiment(self, text):
"""Enhanced sentiment analysis of the content"""
try:
# Enhanced keyword-based sentiment analysis
positive_words = [
'good', 'great', 'excellent', 'positive', 'success', 'win', 'growth',
'improve', 'benefit', 'breakthrough', 'achievement', 'advance',
'innovation', 'opportunity', 'progress', 'successful', 'effective',
'outstanding', 'remarkable', 'impressive', 'valuable', 'promising'
]
negative_words = [
'bad', 'terrible', 'negative', 'loss', 'fail', 'decline', 'problem',
'crisis', 'concern', 'worry', 'fear', 'threat', 'dangerous', 'risk',
'criticism', 'controversy', 'challenge', 'difficulty', 'useless',
'disappointing', 'alarming', 'troubling', 'concerning', 'warning'
]
neutral_words = [
'said', 'announced', 'reported', 'stated', 'mentioned', 'discussed',
'explained', 'described', 'noted', 'indicated', 'revealed'
]
text_lower = text.lower()
# Count sentiment indicators
positive_count = sum(1 for word in positive_words if word in text_lower)
negative_count = sum(1 for word in negative_words if word in text_lower)
neutral_count = sum(1 for word in neutral_words if word in text_lower)
# Determine sentiment with confidence scoring
total_sentiment_words = positive_count + negative_count + neutral_count
if total_sentiment_words == 0:
return 'Neutral'
# Calculate percentages
pos_ratio = positive_count / total_sentiment_words
neg_ratio = negative_count / total_sentiment_words
# Decision logic with stronger thresholds
if neg_ratio > 0.4 or (negative_count > positive_count and negative_count >= 2):
return 'Negative'
elif pos_ratio > 0.4 or (positive_count > negative_count and positive_count >= 2):
return 'Positive'
else:
return 'Neutral'
except Exception as e:
print(f"Sentiment analysis error: {e}")
return 'Neutral'
def get_model_info(self):
"""Get current model information"""
return {
'model_name': self.model_name,
'device': 'GPU (CUDA)' if torch.cuda.is_available() and self.device >= 0 else 'CPU',
'gpu_name': torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A',
'memory_allocated': f"{torch.cuda.memory_allocated(0) / 1024**2:.1f} MB" if torch.cuda.is_available() else 'N/A',
'length_options': list(self.length_configs.keys()),
'tokenizer_vocab_size': len(self.tokenizer) if hasattr(self, 'tokenizer') else 'Unknown'
}
# Utility function for quick testing
def test_summarizer_with_sample():
"""Test function with sample news content"""
sample_article = """
Artificial intelligence is rapidly transforming industries across the globe, with companies investing billions of dollars in AI research and development to maintain competitive advantages. Technology giants like Google, Microsoft, and OpenAI are leading the charge in developing more sophisticated AI models that can understand and generate human-like text, analyze complex data patterns, and automate various business processes.
The latest developments in large language models have shown remarkable capabilities in areas such as content creation, code generation, and problem-solving. However, experts are increasingly concerned about the ethical implications of AI advancement, including potential job displacement, privacy concerns, and the need for robust regulatory frameworks.
Sam Altman, CEO of OpenAI, recently compared the power of next-generation AI models to the Manhattan Project, highlighting both the tremendous potential and the significant responsibilities that come with such technological breakthroughs. The AI community is now focused on ensuring that these powerful tools are developed and deployed responsibly, with appropriate safeguards to protect society while maximizing the benefits of artificial intelligence.
Educational institutions are also adapting to this AI revolution, integrating AI tools into their curricula and research programs. The AICTE (All India Council for Technical Education) has been promoting AI education and encouraging the establishment of AI laboratories in engineering colleges across India to prepare students for the future workforce.
"""
print("🧪 Testing Smart Summarizer with Sample Article")
print("=" * 60)
summarizer = SmartSummarizer()
# Test all summary lengths
results = summarizer.batch_summarize(sample_article)
print("\n📋 SUMMARIZATION RESULTS:")
print("=" * 60)
for length, result in results.items():
if result['status'] == 'success':
print(f"\n📝 {length.upper()} SUMMARY:")
print(f"📊 {result['summary_words']} words ({result['compression_ratio']}% compression)")
print(f"⏱️ Processing time: {result['processing_time']}s")
print(f"🤖 Model: {result['model_used']}")
print(f"📄 Summary: {result['summary']}") # Show FULL summary
print(f"📏 Full Length: {len(result['summary'])} characters") # Character count
else:
print(f"\n❌ {length.upper()} SUMMARY FAILED:")
print(f"Error: {result['error_details']}")
# Test keyword extraction
keywords = summarizer.extract_keywords(sample_article)
print(f"\n🔍 KEY TOPICS: {', '.join(keywords[:8])}")
# Test sentiment analysis
sentiment = summarizer.analyze_content_sentiment(sample_article)
print(f"💭 CONTENT SENTIMENT: {sentiment}")
return summarizer
# Test function for model info
def show_model_info():
"""Display model information"""
summarizer = SmartSummarizer()
print("\n📊 Model Information:")
info = summarizer.get_model_info()
for key, value in info.items():
print(f" {key}: {value}")
if __name__ == "__main__":
test_summarizer_with_sample()