Commit 57cadf8
⚡️ Speed up function
### 📄 111% (1.11x) speedup for ***`check_for_nltk_package` in
`unstructured/nlp/tokenize.py`***
⏱️ Runtime : **`57.7 milliseconds`** **→** **`27.3 milliseconds`** (best
of `101` runs)
### 📝 Explanation and details
Here’s an optimized version of your program. The main improvements are:
- Eliminates the unnecessary list and loop for constructing `paths`;
instead, uses a generator expression so memory is not allocated for an
intermediate list.
- Uses `os.path.join` only if needed, otherwise leaves the original
path.
- Caches the result by using a local variable within the function
instead of constructing the list first.
- Overall reduced allocations & faster iteration.
- Avoid creating and storing a full list with potentially many paths,
instead lazily generate them as needed by `nltk.find`.
This is as fast as possible, given the external dependencies (nltk’s own
`find()` algorithm).
✅ **Correctness verification report:**
| Test | Status |
| --------------------------- | ----------------- |
| ⚙️ Existing Unit Tests | 🔘 **None Found** |
| 🌀 Generated Regression Tests | ✅ **796 Passed** |
| ⏪ Replay Tests | ✅ **8 Passed** |
| 🔎 Concolic Coverage Tests | 🔘 **None Found** |
|📊 Tests Coverage | 100.0% |
<details>
<summary>🌀 Generated Regression Tests and Runtime</summary>
```python
from __future__ import annotations
import os
import shutil
import tempfile
import nltk
# imports
import pytest # used for our unit tests
from unstructured.nlp.tokenize import check_for_nltk_package
# unit tests
# -------------------
# Basic Test Cases
# -------------------
def test_existing_corpus():
# Test with a standard corpus that is usually present if nltk_data is installed
# 'punkt' is a common tokenizer model
codeflash_output = check_for_nltk_package('punkt', 'tokenizers') # 117μs -> 76.8μs (53.7% faster)
# If 'punkt' is present, should return True
# If not present, should return False
# We check both to allow for environments where punkt is not installed
def test_nonexistent_package():
# Test with a package that does not exist
codeflash_output = check_for_nltk_package('nonexistent_package_xyz', 'corpora') # 100μs -> 59.6μs (68.8% faster)
def test_existing_wordnet_corpus():
# Test with a common corpus
codeflash_output = check_for_nltk_package('wordnet', 'corpora') # 97.5μs -> 55.7μs (75.2% faster)
def test_existing_stopwords():
# Test with another common corpus
codeflash_output = check_for_nltk_package('stopwords', 'corpora') # 96.0μs -> 55.3μs (73.6% faster)
# -------------------
# Edge Test Cases
# -------------------
def test_empty_package_name():
# Empty package name should not be found
codeflash_output = check_for_nltk_package('', 'corpora') # 99.5μs -> 57.4μs (73.3% faster)
def test_empty_package_category():
# Empty category should not be found
codeflash_output = check_for_nltk_package('punkt', '') # 98.4μs -> 56.2μs (75.2% faster)
def test_empty_both():
# Both empty should not be found
codeflash_output = check_for_nltk_package('', '') # 18.1μs -> 19.3μs (5.86% slower)
def test_special_characters_in_name():
# Special characters in package name should not be found
codeflash_output = check_for_nltk_package('!@#$%^&*()', 'corpora') # 119μs -> 72.4μs (65.1% faster)
def test_special_characters_in_category():
# Special characters in category should not be found
codeflash_output = check_for_nltk_package('punkt', '!!!') # 96.8μs -> 56.3μs (71.9% faster)
def test_case_sensitivity():
# NLTK is case-sensitive, so wrong case should not be found
codeflash_output = check_for_nltk_package('PUNKT', 'tokenizers') # 96.5μs -> 55.9μs (72.6% faster)
def test_path_without_nltk_data():
# Simulate a path without 'nltk_data' at the end
# Create a temporary directory structure
with tempfile.TemporaryDirectory() as tmpdir:
# Create a fake nltk_data/tokenizers/punkt directory
nltk_data_dir = os.path.join(tmpdir, 'nltk_data', 'tokenizers')
os.makedirs(nltk_data_dir)
# Place a dummy file for 'punkt'
with open(os.path.join(nltk_data_dir, 'punkt'), 'w') as f:
f.write('dummy')
# Temporarily override nltk.data.path
orig_paths = list(nltk.data.path)
nltk.data.path.insert(0, tmpdir)
try:
# Should find the package now
codeflash_output = check_for_nltk_package('punkt', 'tokenizers')
finally:
nltk.data.path = orig_paths
def test_path_with_nltk_data():
# Simulate a path that already ends with 'nltk_data'
with tempfile.TemporaryDirectory() as tmpdir:
nltk_data_dir = os.path.join(tmpdir, 'nltk_data')
tokenizers_dir = os.path.join(nltk_data_dir, 'tokenizers')
os.makedirs(tokenizers_dir)
with open(os.path.join(tokenizers_dir, 'punkt'), 'w') as f:
f.write('dummy')
orig_paths = list(nltk.data.path)
nltk.data.path.insert(0, nltk_data_dir)
try:
codeflash_output = check_for_nltk_package('punkt', 'tokenizers')
finally:
nltk.data.path = orig_paths
def test_oserror_on_invalid_path(monkeypatch):
# Simulate an OSError by passing in a path that cannot be accessed
# We'll monkeypatch nltk.data.path to a directory that doesn't exist
orig_paths = list(nltk.data.path)
nltk.data.path.insert(0, '/nonexistent_dir_xyz_123')
try:
# Should not raise, but return False
codeflash_output = check_for_nltk_package('punkt', 'tokenizers')
finally:
nltk.data.path = orig_paths
def test_unicode_package_name():
# Unicode in package name should not be found
codeflash_output = check_for_nltk_package('punkté', 'tokenizers') # 108μs -> 64.8μs (66.7% faster)
def test_unicode_category_name():
# Unicode in category name should not be found
codeflash_output = check_for_nltk_package('punkt', 'tokenizersé') # 102μs -> 59.0μs (73.0% faster)
# -------------------
# Large Scale Test Cases
# -------------------
def test_large_number_of_paths():
# Simulate a large number of nltk.data.path entries
orig_paths = list(nltk.data.path)
with tempfile.TemporaryDirectory() as tmpdir:
# Create many fake paths, only one contains the package
fake_paths = []
for i in range(100):
fake_dir = os.path.join(tmpdir, f"fake_{i}")
os.makedirs(fake_dir)
fake_paths.append(fake_dir)
# Add the real one at the end
real_dir = os.path.join(tmpdir, 'real_nltk_data', 'tokenizers')
os.makedirs(real_dir)
with open(os.path.join(real_dir, 'punkt'), 'w') as f:
f.write('dummy')
nltk.data.path[:] = fake_paths + [os.path.join(tmpdir, 'real_nltk_data')]
# Should find the package
codeflash_output = check_for_nltk_package('punkt', 'tokenizers')
nltk.data.path = orig_paths
def test_large_number_of_missing_packages():
# Test that all missing packages are not found efficiently
for i in range(100):
codeflash_output = check_for_nltk_package(f'nonexistent_pkg_{i}', 'corpora')
def test_large_number_of_categories():
# Test many different categories, all missing
for i in range(100):
codeflash_output = check_for_nltk_package('punkt', f'category_{i}')
def test_many_paths_with_some_invalid():
# Mix valid and invalid paths
orig_paths = list(nltk.data.path)
with tempfile.TemporaryDirectory() as tmpdir:
valid_dir = os.path.join(tmpdir, 'nltk_data', 'tokenizers')
os.makedirs(valid_dir)
with open(os.path.join(valid_dir, 'punkt'), 'w') as f:
f.write('dummy')
fake_paths = [f'/nonexistent_{i}' for i in range(50)]
nltk.data.path[:] = fake_paths + [os.path.join(tmpdir, 'nltk_data')]
codeflash_output = check_for_nltk_package('punkt', 'tokenizers')
nltk.data.path = orig_paths
def test_performance_many_checks():
# Performance: check the same valid package many times
with tempfile.TemporaryDirectory() as tmpdir:
nltk_data_dir = os.path.join(tmpdir, 'nltk_data', 'tokenizers')
os.makedirs(nltk_data_dir)
with open(os.path.join(nltk_data_dir, 'punkt'), 'w') as f:
f.write('dummy')
orig_paths = list(nltk.data.path)
nltk.data.path.insert(0, os.path.join(tmpdir, 'nltk_data'))
try:
for _ in range(100):
codeflash_output = check_for_nltk_package('punkt', 'tokenizers')
finally:
nltk.data.path = orig_paths
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
from __future__ import annotations
import os
import nltk
# imports
import pytest # used for our unit tests
from unstructured.nlp.tokenize import check_for_nltk_package
# unit tests
# ----------- BASIC TEST CASES -----------
def test_existing_corpus_package():
# Test with a commonly available corpus package, e.g., 'punkt'
# Should return True if 'punkt' is installed
codeflash_output = check_for_nltk_package('punkt', 'tokenizers'); result = codeflash_output # 110μs -> 66.0μs (68.2% faster)
def test_nonexistent_package_returns_false():
# Test with a clearly non-existent package
codeflash_output = check_for_nltk_package('not_a_real_package', 'corpora') # 100μs -> 59.0μs (70.2% faster)
def test_existing_grammar_package():
# Test with a grammar package that may exist
codeflash_output = check_for_nltk_package('sample_grammar', 'grammars'); result = codeflash_output # 98.2μs -> 56.2μs (74.8% faster)
def test_existing_corpus_category():
# Test with a corpus that is often installed by default
codeflash_output = check_for_nltk_package('words', 'corpora'); result = codeflash_output # 96.9μs -> 55.1μs (75.8% faster)
def test_existing_stemmer_package():
# Test for a stemmer package
codeflash_output = check_for_nltk_package('porter.pickle', 'stemmers'); result = codeflash_output # 98.0μs -> 55.3μs (77.2% faster)
# ----------- EDGE TEST CASES -----------
def test_empty_package_name():
# Test with empty package name
codeflash_output = check_for_nltk_package('', 'corpora') # 99.0μs -> 57.0μs (73.9% faster)
def test_empty_category_name():
# Test with empty category name
codeflash_output = check_for_nltk_package('punkt', '') # 96.7μs -> 54.9μs (76.1% faster)
def test_both_empty():
# Test with both package and category names empty
codeflash_output = check_for_nltk_package('', '') # 18.1μs -> 19.4μs (6.87% slower)
def test_package_name_with_special_characters():
# Test with special characters in package name
codeflash_output = check_for_nltk_package('!@#', 'corpora') # 101μs -> 58.5μs (73.4% faster)
def test_category_name_with_special_characters():
# Test with special characters in category name
codeflash_output = check_for_nltk_package('punkt', '!@#') # 97.8μs -> 55.7μs (75.4% faster)
def test_package_name_with_path_traversal():
# Test with directory traversal in package name
codeflash_output = check_for_nltk_package('../punkt', 'tokenizers') # 63.7μs -> 44.7μs (42.5% faster)
def test_category_name_with_path_traversal():
# Test with directory traversal in category name
codeflash_output = check_for_nltk_package('punkt', '../tokenizers') # 178μs -> 75.5μs (137% faster)
def test_case_sensitivity():
# NLTK is case-sensitive: 'Punkt' should not be found if only 'punkt' exists
codeflash_output = check_for_nltk_package('punkt', 'tokenizers'); result_lower = codeflash_output # 95.6μs -> 54.0μs (77.0% faster)
codeflash_output = check_for_nltk_package('Punkt', 'tokenizers'); result_upper = codeflash_output # 81.4μs -> 41.5μs (96.2% faster)
# If lower is True, upper should be False
if result_lower:
pass
def test_leading_trailing_spaces():
# Leading/trailing spaces should not resolve to a valid package
codeflash_output = check_for_nltk_package(' punkt ', 'tokenizers') # 96.2μs -> 54.0μs (78.2% faster)
codeflash_output = check_for_nltk_package('punkt', ' tokenizers ') # 82.0μs -> 42.2μs (94.3% faster)
def test_numeric_package_and_category():
# Numeric names are very unlikely to exist
codeflash_output = check_for_nltk_package('12345', '67890') # 93.6μs -> 53.1μs (76.4% faster)
def test_package_name_with_unicode():
# Test with unicode characters in package name
codeflash_output = check_for_nltk_package('😀', 'corpora') # 110μs -> 66.9μs (64.6% faster)
def test_category_name_with_unicode():
# Test with unicode characters in category name
codeflash_output = check_for_nltk_package('punkt', '😀') # 103μs -> 60.1μs (72.3% faster)
def test_package_and_category_with_long_names():
# Very long names should not exist and should not cause errors
long_name = 'a' * 255
codeflash_output = check_for_nltk_package(long_name, long_name) # 127μs -> 79.0μs (61.1% faster)
def test_package_and_category_with_slashes():
# Slashes in names should not resolve to valid packages
codeflash_output = check_for_nltk_package('punkt/other', 'tokenizers') # 125μs -> 62.9μs (99.4% faster)
codeflash_output = check_for_nltk_package('punkt', 'tokenizers/other') # 108μs -> 47.8μs (127% faster)
# ----------- LARGE SCALE TEST CASES -----------
def test_large_number_of_nonexistent_packages():
# Test performance/scalability with many non-existent packages
for i in range(100):
name = f"not_a_real_package_{i}"
codeflash_output = check_for_nltk_package(name, 'corpora')
def test_large_number_of_nonexistent_categories():
# Test performance/scalability with many non-existent categories
for i in range(100):
cat = f"not_a_real_category_{i}"
codeflash_output = check_for_nltk_package('punkt', cat)
def test_large_number_of_random_combinations():
# Test a large number of random package/category combinations
for i in range(100):
pkg = f"pkg_{i}"
cat = f"cat_{i}"
codeflash_output = check_for_nltk_package(pkg, cat)
def test_large_scale_existing_and_nonexisting():
# Mix of likely existing and non-existing packages
likely_existing = ['punkt', 'words', 'stopwords', 'averaged_perceptron_tagger']
for pkg in likely_existing:
codeflash_output = check_for_nltk_package(pkg, 'corpora'); result = codeflash_output # 74.8μs -> 34.1μs (119% faster)
# Now add a batch of non-existing ones
for i in range(50):
codeflash_output = check_for_nltk_package(f"noexist_{i}", 'corpora')
def test_large_scale_edge_cases():
# Edge-like names in large scale
for i in range(50):
weird_name = f"../noexist_{i}"
codeflash_output = check_for_nltk_package(weird_name, 'corpora')
codeflash_output = check_for_nltk_package('punkt', weird_name)
# ----------- DETERMINISM AND TYPE TESTS -----------
def test_return_type_is_bool():
# The function should always return a bool, regardless of input
inputs = [
('punkt', 'tokenizers'),
('not_a_real_package', 'corpora'),
('', ''),
('😀', '😀'),
('../punkt', 'tokenizers'),
('punkt', '../tokenizers'),
]
for pkg, cat in inputs:
pass
def test_function_is_deterministic():
# The function should return the same result for the same input
pkg, cat = 'punkt', 'tokenizers'
codeflash_output = check_for_nltk_package(pkg, cat); result1 = codeflash_output # 105μs -> 57.4μs (83.5% faster)
codeflash_output = check_for_nltk_package(pkg, cat); result2 = codeflash_output # 81.0μs -> 41.0μs (97.6% faster)
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
```
</details>
To edit these changes, run `git checkout
codeflash/optimize-check_for_nltk_package-mcftixl5` and push.
[](https://codeflash.ai)
---------
Signed-off-by: Saurabh Misra <[email protected]>
Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com>

⚡️ Speed up function `check_for_nltk_package` by 111% (#4081)

1 parent cc635c9 · commit 57cadf8
2 files changed
+6
-6
lines changed| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
2 | 2 | | |
3 | 3 | | |
4 | 4 | | |
| 5 | + | |
5 | 6 | | |
6 | 7 | | |
7 | 8 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
14 | 14 | | |
15 | 15 | | |
16 | 16 | | |
17 | | - | |
18 | | - | |
19 | | - | |
20 | | - | |
21 | | - | |
| 17 | + | |
| 18 | + | |
| 19 | + | |
| 20 | + | |
22 | 21 | | |
23 | 22 | | |
24 | | - | |
| 23 | + | |
25 | 24 | | |
26 | 25 | | |
27 | 26 | | |
| |||
0 commit comments