
Commit f115a06

add the pipelines
1 parent 67568c5 commit f115a06

34 files changed: +13144 additions, -354 deletions

README.md

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@
 <br />
 <div align="center">
   <a href="https://github.com/CodexEsto/textpipe">
-    <img src="images/logo.png" alt="Logo" width="120" height="140">
+    <img src="assets/textpipeRB.png" alt="Logo" width="230" height="150">
   </a>

 <h3 align="center">textpipe</h3>

assets/textpipe.jpg

46.8 KB

assets/textpipe.png

7.66 KB

assets/textpipe.svg

Lines changed: 122 additions & 0 deletions

assets/textpipeRB.png

21.5 KB

data/news-article-categories.csv

Lines changed: 12071 additions & 0 deletions
Large diffs are not rendered by default.
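The dataset ships with the commit but isn't rendered here. A minimal sketch of how it could feed the pipeline from test.py below, assuming hypothetical column names ("category" and "body") since the diff doesn't show the CSV header:

# Sketch: fit the pipeline on the bundled dataset. The column names
# ("category", "body") are assumptions -- the diff does not render the
# CSV header, so adjust them to match the actual file.
import pandas as pd

from textpipe.config import Config
from textpipe.pipeline import SuggestionPipeline

df = pd.read_csv("data/news-article-categories.csv")
texts = df["body"].dropna().tolist()  # hypothetical text column

pipeline = SuggestionPipeline(config=Config.get())
pipeline.fit(texts)
print(pipeline.suggest("renewable energy policy", k=3))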

test.py

Lines changed: 69 additions & 0 deletions

from textpipe.data.model_io import (
    save_model,
    load_model,
    save_vectorizer,
    load_vectorizer,
)
from textpipe.core.recommender import ContentRecommender
from textpipe.pipeline import SuggestionPipeline
from textpipe.config import Config

# Load the configuration
config = Config.get()

# Sample text data
sample_texts = [
    # Education
    "Online learning platforms are transforming education.",
    "Teachers use AI tools to personalize lessons.",
    "Distance learning increased during the pandemic.",
    # Politics
    "The government passed a new environmental bill.",
    "Elections will be held next year amid rising tensions.",
    "A new policy was announced by the health minister.",
    # Technology
    "Quantum computing promises exponential speedup.",
    "New AI models are revolutionizing software development.",
    "Cybersecurity threats are growing in the tech world.",
    # Environment
    "Climate change is causing rising sea levels.",
    "Renewable energy is crucial for sustainability.",
    "Deforestation threatens biodiversity globally.",
    # Business
    "Stock markets surged after the merger announcement.",
    "Startups are attracting record venture capital funding.",
    "Remote work is reshaping corporate culture.",
    # Sports
    "The team won the championship after a tough season.",
    "The Olympic Games are scheduled for next summer.",
    "Athletes are training hard for the world cup."
]


# Initialize the pipeline using the loaded config
pipeline = SuggestionPipeline(config=config)

# Fit the pipeline with the sample data
pipeline.fit(sample_texts)

# Save model and vectorizer
# save_model(pipeline.recommender, "models/recommender.pkl")
# save_vectorizer(pipeline.vectorizer, "models/vectorizer.pkl")

# Load saved model and vectorizer
# loaded_recommender = load_model("models/recommender.pkl")
# loaded_vectorizer = load_vectorizer("models/vectorizer.pkl")

# Create a new pipeline with the loaded objects
# new_pipeline = SuggestionPipeline(config=config)
# new_pipeline.recommender = loaded_recommender
# new_pipeline.vectorizer = loaded_vectorizer

# Query for testing
query = "How is AI changing education?"
recommendations = pipeline.suggest(query, k=3)

# Display recommendations
print(f"Recommendations for the query '{query}':")
for rec in recommendations:
    print(rec)
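The persistence flow in test.py is left commented out. A minimal sketch of the full round trip, assuming the same model_io API as those commented lines; creating models/ up front is an addition of this sketch, not something the commit does:

# Sketch: the save/load round trip that test.py leaves commented out.
# Assumes the model_io API shown above; the makedirs call is an
# assumption added here so the pickle paths exist.
import os

from textpipe.config import Config
from textpipe.pipeline import SuggestionPipeline
from textpipe.data.model_io import (
    save_model, load_model, save_vectorizer, load_vectorizer,
)

config = Config.get()
pipeline = SuggestionPipeline(config=config)
pipeline.fit([
    "Online learning platforms are transforming education.",
    "Teachers use AI tools to personalize lessons.",
])

os.makedirs("models", exist_ok=True)
save_model(pipeline.recommender, "models/recommender.pkl")
save_vectorizer(pipeline.vectorizer, "models/vectorizer.pkl")

# Restore into a fresh pipeline without refitting.
new_pipeline = SuggestionPipeline(config=config)
new_pipeline.recommender = load_model("models/recommender.pkl")
new_pipeline.vectorizer = load_vectorizer("models/vectorizer.pkl")
print(new_pipeline.suggest("How is AI changing education?", k=3))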

tests/config/test_nltk.py

Lines changed: 7 additions & 7 deletions (whitespace only: trailing spaces stripped and a final newline added; the test body remains commented out)

@@ -14,10 +14,10 @@
# """
# # Mock user home to temporary path
# monkeypatch.setattr(os.path, 'expanduser', lambda _: str(tmp_path))

# # Clear NLTK's internal paths
# nltk.data.path.clear()

# # Remove any existing test data
# nltk_dir = tmp_path / "nltk_data"
# if nltk_dir.exists():

@@ -29,21 +29,21 @@
# Covers both modern and legacy resource access
# """
# configure_nltk()

# # Verify modern resources
# assert nltk.data.find('tokenizers/punkt'), "Modern punkt resource missing"
# assert nltk.data.find('taggers/averaged_perceptron_tagger'), "POS tagger missing"

# # Verify legacy structure
# nltk_dir = os.path.expanduser("~/nltk_data")

# # Check main symlink
# legacy_root = os.path.join(nltk_dir, 'tokenizers/punkt_tab')
# assert os.path.islink(legacy_root), "Main legacy symlink not created"
# assert os.path.realpath(legacy_root) == os.path.join(nltk_dir, 'tokenizers/punkt')

# # Check language-specific symlink
# legacy_lang = os.path.join(legacy_root, 'english')
# if os.path.exists(legacy_lang):  # Some NLTK versions don't have language subdirs
#     assert os.path.islink(legacy_lang), "Language symlink missing"
#     assert os.path.realpath(legacy_lang) == os.path.join(nltk_dir, 'tokenizers/punkt/english')

tests/data/test_cleaner.py

Lines changed: 28 additions & 28 deletions

The entire test module is commented out in this commit (every line gains a leading "# "); there is no other change. The disabled content was:

# tests/data/test_cleaner.py

import pytest
from textpipe.data.cleaner import clean_text, remove_stopwords


def test_clean_text():
    """Test basic text cleaning functionality."""
    raw_text = "Hello, world! How's it going?"
    cleaned_text = clean_text(raw_text)
    assert cleaned_text == "hello world how's it going", "Basic cleaning failed"


def test_remove_stopwords():
    """Test stopword removal with case insensitivity."""
    text = "This is a sample text"
    stopwords = {"is", "the", "a", "in", "to", "and", "this"}
    cleaned_text = remove_stopwords(text, stopwords)
    assert cleaned_text == "sample text", "Stopword removal failed"


@pytest.mark.parametrize(
    "raw_text, expected_cleaned_text",
    [
        ("Hello!!!", "hello"),
        ("Python is awesome.", "python is awesome"),
        ("  Space and punctuation!  ", "space and punctuation"),
    ],
)
def test_clean_text_various_cases(raw_text, expected_cleaned_text):
    """Test edge cases and various input formats."""
    cleaned_text = clean_text(raw_text)
    assert cleaned_text == expected_cleaned_text, (
        f"Expected '{expected_cleaned_text}', got '{cleaned_text}'"
    )
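For reference, a sketch of cleaner functions consistent with these now-disabled tests; this is not necessarily textpipe.data.cleaner's actual code, just one version that satisfies every assertion in the file:

# Sketch: implementations consistent with the commented-out tests above.
# Not necessarily textpipe's actual cleaner -- one version that passes
# every assertion in the disabled test module.
import re

def clean_text(text: str) -> str:
    text = text.lower()
    # Keep word characters, whitespace, and apostrophes ("how's" survives).
    text = re.sub(r"[^\w\s']", " ", text)
    # Collapse runs of whitespace and trim the ends.
    return re.sub(r"\s+", " ", text).strip()

def remove_stopwords(text: str, stopwords: set[str]) -> str:
    # Case-insensitive membership test ("This" matches "this").
    return " ".join(w for w in text.split() if w.lower() not in stopwords)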

0 commit comments
