Skip to content

Commit 2898ba5

Browse files
authored
Merge pull request #74 from brootware/dev
Dev
2 parents 75ac936 + 5b66cd9 commit 2898ba5

File tree

6 files changed

+78
-118
lines changed

6 files changed

+78
-118
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
<a href="https://twitter.com/brootware"><img src="https://img.shields.io/twitter/follow/brootware?style=social" alt="Twitter Follow"></a>
1515
<img alt="PyPI - Python Version" src="https://img.shields.io/pypi/pyversions/pyredactkit"> <img alt="PyPI" src="https://img.shields.io/pypi/v/pyredactkit">
1616
<a href="https://sonarcloud.io/summary/new_code?id=brootware_PyRedactKit"><img src="https://sonarcloud.io/api/project_badges/measure?project=brootware_PyRedactKit&metric=alert_status" alt="reliability rating"></a>
17-
<img alt="GitHub Workflow Status" src="https://img.shields.io/github/workflow/status/brootware/pyredactkit/CI?label=CI&branch=dev">
17+
<img alt="GitHub Workflow Status" src="https://img.shields.io/github/workflow/status/brootware/pyredactkit/CI?label=CI&branch=main">
1818
</p>
1919

2020
## Features

poetry.lock

Lines changed: 14 additions & 78 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
44

55
[tool.poetry]
66
name = "pyredactkit"
7-
version = "0.4.0"
7+
version = "1.0.0"
88
description = "Python cli tool to redact sensitive data"
99
authors = ["brootware <[email protected]>"]
1010
license = "GPL-3.0-or-later"
@@ -25,8 +25,6 @@ classifiers = [
2525

2626
[tool.poetry.dependencies]
2727
python = "^3.7"
28-
nltk = "^3.7"
29-
numpy = "<1.22.0"
3028

3129
[tool.poetry.dev-dependencies]
3230
pytest = "^7.1.2"
@@ -41,8 +39,6 @@ py = "^1.11.0"
4139
pyparsing = "^3.0.8"
4240
tomli = "^2.0.1"
4341
tqdm = "^4.64.0"
44-
nltk = "^3.7"
45-
numpy = "<1.22.0"
4642
rich = "^12.4.0"
4743
mypy = "^0.961"
4844
flake8 = "^4.0.1"

pyredactkit/identifiers.py

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11

22
""" Data identifier class implementation """
3-
import nltk
43

54

65
class Identifier:
@@ -45,27 +44,3 @@ class Identifier:
4544
def __init__(self) -> None:
4645
return None
4746

48-
def names(self, data: str) -> list:
49-
""" Identify names and return them from the supplied data
50-
Args:
51-
data (str): data in alpha-numeric format
52-
53-
Returns:
54-
name_list (array): array of names identified from the supplied data
55-
"""
56-
name = ""
57-
name_list = []
58-
words = nltk.word_tokenize(data)
59-
part_of_speech_tagsets = nltk.pos_tag(words)
60-
named_ent = nltk.ne_chunk(part_of_speech_tagsets, binary=False)
61-
62-
for subtree in named_ent.subtrees():
63-
if subtree.label() == 'PERSON':
64-
l = []
65-
for leaf in subtree.leaves():
66-
l.append(leaf[0])
67-
name = ' '.join(l)
68-
if name not in name_list:
69-
name_list.append(name)
70-
71-
return name_list

tests/test_runner.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import pytest
2+
from pyredactkit import runner as Runner
3+
4+
5+
@pytest.fixture
6+
def mocker_text_file(mocker):
7+
content = "Message to write on file to be written"
8+
mocked_open = mocker.mock_open(read_data=content)
9+
builtin_open = "builtins.open"
10+
mocker.patch(builtin_open, mocked_open)
11+
12+
13+
def test_is_it_file(mocker_text_file, tmp_path):
14+
assert Runner.is_it_file('This is a test string') is False, "is_it_file function should return False for this string"
15+
16+
17+
def test_recursive_file_search(mocker_text_file, tmp_path):
18+
assert Runner.recursive_file_search('This is a test string', 'txt', True) == set(), "recursive_file_search function should return an empty set"
19+
20+
21+
def test_api_identify_sensitive_data(mocker_text_file, tmp_path):
22+
test_string = """this is my IP: 102.23.5.1
23+
My router is : 10.10.10.1
24+
71.159.188.33
25+
81.141.167.45
26+
165.65.59.139
27+
64.248.67.225
28+
https://tech.gov.sg
29+
My email is [email protected]
30+
this is my IP: 102.23.5.1
31+
My router is: 10.10.10.1
32+
71.159.188.33
33+
81.141.167.45
34+
165.65.59.139
35+
64.248.67.225
36+
Base64 data
37+
QVBJX1RPS0VO
38+
UzNjcjN0UGFzc3dvcmQ=
39+
U3VwM3JTM2NyZXRQQHNzd29yZA==
40+
Singapore NRIC
41+
G0022121F
42+
F2121200F
43+
G1021022E
44+
S1022221L
45+
G1222221C
46+
S0000212Q
47+
F2120212E
48+
S0021001P
49+
"""
50+
test_data = ['102.23.5.1', '10.10.10.1', '71.159.188.33', '81.141.167.45', '165.65.59.139', '64.248.67.225', 'https://tech.gov.sg', '[email protected]', 'mail.com', '102.23.5.1', '10.10.10.1', '71.159.188.33', '81.141.167.45', '165.65.59.139', '64.248.67.225', 'G0022121F', 'F2121200F', 'G1021022E', 'S1022221L', 'G1222221C', 'S0000212Q', 'F2120212E', 'S0021001P']
51+
52+
assert Runner.api_identify_sensitive_data(test_string) == test_data, "api_identify_sensitive_data function should return a list of sensitive data"
53+
assert Runner.api_identify_sensitive_data('This is a test string') == [], "api_identify_sensitive_data function should return an empty list"

tools/install_nltk_popular.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
#!/usr/bin/env python
2-
import nltk
3-
import ssl
2+
# import nltk
3+
# import ssl
44

5-
try:
6-
_create_unverified_https_context = ssl._create_unverified_context
7-
except AttributeError:
8-
pass
9-
else:
10-
ssl._create_default_https_context = _create_unverified_https_context
5+
# try:
6+
# _create_unverified_https_context = ssl._create_unverified_context
7+
# except AttributeError:
8+
# pass
9+
# else:
10+
# ssl._create_default_https_context = _create_unverified_https_context
1111

12-
nltk.download('popular')
12+
# nltk.download('popular')

0 commit comments

Comments
 (0)