Skip to content

Commit 2b34b69

Browse files
committed
improvements, documentation changes
1 parent b612757 commit 2b34b69

File tree

7 files changed

+141
-43
lines changed

7 files changed

+141
-43
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
.idea
44
node_modules
55
*.iml
6+
*.ipr
7+
*.iws
68
build
79
.gradle
810
gradle*

README.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,45 @@
11
# Homoglyphs
2+
3+
## Java Quick Start
4+
5+
Include the Homoglyph library in your project by downloading it from Maven Central:
6+
7+
```xml
8+
<dependency>
9+
<groupId>net.codebox</groupId>
10+
<artifactId>homoglyph</artifactId>
11+
<version>1.0.2</version>
12+
</dependency>
13+
```
14+
15+
Then use the `HomoglyphBuilder` class to build a `Homoglyph` object, and call its `search()` method with the text you want
16+
to search, and the word(s) you want to search for:
17+
18+
```java
19+
String textToSearch = "Get free ϲrEd1ᴛ";
20+
String[] bannedWords = new String[]{"credit"};
21+
Homoglyph homoglyph = HomoglyphBuilder.build();
22+
List<SearchResult> results = homoglyph.search(textToSearch, bannedWords);
23+
```
24+
25+
## JavaScript Quick Start
26+
27+
Include the Homoglyph library in your project by downloading it from NPM:
28+
29+
```
30+
npm install homoglyph-search
31+
```
32+
33+
Then call the module's `search()` function with the text you want to search, and the word(s) you want to search for:
34+
35+
```javascript
36+
var homoglyphSearch = require('homoglyph-search');
37+
var bannedWords = ['credit'];
38+
var textToSearch = 'Get free ϲrEd1ᴛ';
39+
var results = homoglyphSearch.search(textToSearch, bannedWords);
40+
```
41+
42+
## Background
243
Homoglyphs are characters with different meanings that look similar or identical to each other, such as the digit '0' and the capital letter 'O'.
344

445
Homoglyphs within a single alphabet tend to be rare for obvious reasons. These days, however, the internet runs on Unicode which means that it is possible to mix the letters from many [different languages](http://www.unicode.org/cldr/charts/latest/supplemental/languages_and_scripts.html) together in one place, massively increasing the number of homoglyphs.

build.gradle

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
group = "net.codebox"
2-
version = "1.0.0"
2+
version = "1.0.2"
33

44
apply plugin: 'java'
55
apply plugin: 'idea'
66
apply plugin: 'maven'
77
apply plugin: 'signing'
88

9+
sourceSets.main.resources {
10+
srcDirs = ["raw_data"]; include "char_codes.txt"
11+
}
12+
913
repositories {
1014
mavenCentral()
1115
}

src/main/java/net/codebox/homoglyph/Homoglyph.java

Lines changed: 5 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
11
package net.codebox.homoglyph;
22

3-
import java.io.BufferedReader;
4-
import java.io.FileNotFoundException;
5-
import java.io.FileReader;
6-
import java.io.IOException;
73
import java.util.*;
84

95
/**
@@ -64,7 +60,6 @@ public List<SearchResult> search(final String text, final String... targetWords)
6460
return search(text, Arrays.asList(targetWords));
6561
}
6662

67-
6863
private Collection<SearchResult> checkForWord(final CodePoints text, final CodePoints targetWord) {
6964
final Collection<SearchResult> results = new ArrayList<SearchResult>();
7065

@@ -104,6 +99,11 @@ public SearchResult(final int index, final String match, final String word){
10499
public int index;
105100
public String match;
106101
public String word;
102+
103+
@Override
104+
public String toString() {
105+
return String.format("'%s' at position %s matches '%s'", match, index, word);
106+
}
107107
}
108108

109109
public static class CodePoints{
@@ -165,27 +165,4 @@ public Set<Integer> lookup(final int cp){
165165
}
166166
}
167167

168-
public static List<Set<Integer>> parseCharCodesFile(final String path) throws IOException {
169-
final List<Set<Integer>> homoglyphs = new ArrayList<Set<Integer>>();
170-
final BufferedReader reader = new BufferedReader(new FileReader(path));
171-
172-
String line;
173-
while((line = reader.readLine()) != null){
174-
line = line.trim();
175-
if (line.startsWith("#") || line.length() == 0){
176-
continue;
177-
}
178-
final Set<Integer> set = new HashSet<Integer>();
179-
for (String charCode : line.split(",")) {
180-
try {
181-
set.add(Integer.parseInt(charCode, 16));
182-
} catch (NumberFormatException ex){
183-
// ignore badly formatted lines
184-
}
185-
}
186-
homoglyphs.add(set);
187-
}
188-
189-
return homoglyphs;
190-
}
191168
}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
package net.codebox.homoglyph;
2+
3+
import java.io.*;
4+
import java.nio.file.Files;
5+
import java.nio.file.Paths;
6+
import java.util.*;
7+
import java.util.regex.Pattern;
8+
import java.util.stream.Collectors;
9+
10+
/**
11+
* Helper class providing methods that supply populated Homoglyph objects.
12+
*
13+
* @author Rob Dawson
14+
*/
15+
public class HomoglyphBuilder {
16+
private static final String CHAR_CODES_FILE = "/char_codes.txt";
17+
18+
/**
19+
* Parses the bundled char_codes.txt file, and uses it to construct a populated Homoglyph object.
20+
*
21+
* @return a Homoglyph object populated using the contents of the char_codes.txt file
22+
*
23+
* @throws MissingResourceException if the char_codes.txt file is missing
24+
* @throws IOException if the char_codes.txt exists but cannot be read
25+
*/
26+
public static Homoglyph build() throws IOException {
27+
final InputStream is = HomoglyphBuilder.class.getResourceAsStream(CHAR_CODES_FILE);
28+
if (is == null){
29+
throw new MissingResourceException("Unable to read " + CHAR_CODES_FILE,
30+
HomoglyphBuilder.class.getName(), CHAR_CODES_FILE);
31+
}
32+
return build(new InputStreamReader(is));
33+
}
34+
35+
/**
36+
* Parses the specified file and uses it to construct a populated Homoglyph object.
37+
*
38+
* @return a Homoglyph object populated using the contents of the specified file
39+
*
40+
* @throws IOException if the specified file cannot be read
41+
*/
42+
public static Homoglyph build(final String path) throws IOException {
43+
return build(new FileReader(path));
44+
}
45+
46+
/**
47+
* Consumes the supplied Reader and uses it to construct a populated Homoglyph object.
48+
*
49+
* @return a Homoglyph object populated using the data returned by the Reader object
50+
*
51+
* @throws IOException if the specified Reader cannot be read
52+
*/
53+
public static Homoglyph build(final Reader reader) throws IOException {
54+
final List<Set<Integer>> homoglyphs = new ArrayList<Set<Integer>>();
55+
final BufferedReader bufferedReader = new BufferedReader(reader);
56+
57+
String line;
58+
while((line = bufferedReader.readLine()) != null){
59+
line = line.trim();
60+
if (line.startsWith("#") || line.length() == 0){
61+
continue;
62+
}
63+
final Set<Integer> set = new HashSet<Integer>();
64+
for (String charCode : line.split(",")) {
65+
try {
66+
set.add(Integer.parseInt(charCode, 16));
67+
} catch (NumberFormatException ex){
68+
// ignore badly formatted lines
69+
}
70+
}
71+
homoglyphs.add(set);
72+
}
73+
74+
return new Homoglyph(homoglyphs);
75+
}
76+
77+
}

src/test/java/net/codebox/homoglyph/HomoglyphDataTest.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
11
package net.codebox.homoglyph;
22

3-
import org.junit.Assert;
43
import org.junit.Before;
54
import org.junit.Test;
65

76
import java.io.IOException;
87
import java.util.Arrays;
9-
import java.util.HashSet;
108
import java.util.List;
119
import java.util.Set;
1210

@@ -46,7 +44,7 @@ public void testMixedCase(){
4644
}
4745

4846
private void check(String text, String targetWord){
49-
List<Homoglyph.SearchResult> r = homoglyph.search(text, Arrays.asList(targetWord));
47+
List<SearchResult> r = homoglyph.search(text, Arrays.asList(targetWord));
5048
assertEquals(1, r.size());
5149
assertEquals(targetWord, r.get(0).word);
5250
}

src/test/java/net/codebox/homoglyph/HomoglyphLogicTest.java

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
package net.codebox.homoglyph;
22

3-
import org.junit.Assert;
43
import org.junit.Before;
54
import org.junit.Test;
65

@@ -34,69 +33,69 @@ public void setup() throws IOException {
3433

3534
@Test
3635
public void whenTextDoesNotContainAnyTargetWords_thenNoMatchesFound(){
37-
List<Homoglyph.SearchResult> r = homoglyph.search("Nothing to see here", "TARGET");
36+
List<SearchResult> r = homoglyph.search("Nothing to see here", "TARGET");
3837
assertEquals(0, r.size());
3938
}
4039

4140
@Test
4241
public void whenTextIdenticalToTargetWord_thenMatchFound(){
43-
List<Homoglyph.SearchResult> r = homoglyph.search("SOIL", "SOIL");
42+
List<SearchResult> r = homoglyph.search("SOIL", "SOIL");
4443
assertEquals(1, r.size());
4544
checkResult(r.get(0), 0, "SOIL", "SOIL");
4645
}
4746

4847
@Test
4948
public void whenTextContainsTargetWord_thenMatchFound(){
50-
List<Homoglyph.SearchResult> r = homoglyph.search("I have SOIL in my garden", "SOIL");
49+
List<SearchResult> r = homoglyph.search("I have SOIL in my garden", "SOIL");
5150
assertEquals(1, r.size());
5251
checkResult(r.get(0), 7, "SOIL", "SOIL");
5352
}
5453

5554
@Test
5655
public void whenTextContainsOneOfTheTargetWords_thenMatchFound(){
57-
List<Homoglyph.SearchResult> r = homoglyph.search("I have SOIL in my garden", "CHEESE", "SOIL", "FALCONS");
56+
List<SearchResult> r = homoglyph.search("I have SOIL in my garden", "CHEESE", "SOIL", "FALCONS");
5857
assertEquals(1, r.size());
5958
checkResult(r.get(0), 7, "SOIL", "SOIL");
6059
}
6160

6261
@Test
6362
public void whenTargetWordContainsHomoglyphs_thenMatchFound(){
64-
List<Homoglyph.SearchResult> r = homoglyph.search("I have 501L in my garden", Arrays.asList("CHEESE", "SOIL", "FALCONS"));
63+
List<SearchResult> r = homoglyph.search("I have 501L in my garden", Arrays.asList("CHEESE", "SOIL", "FALCONS"));
6564
assertEquals(1, r.size());
6665
checkResult(r.get(0), 7, "SOIL", "501L");
6766
}
6867

6968
@Test
7069
public void whenTargetWordIsAtStartOfText_thenMatchFound(){
71-
List<Homoglyph.SearchResult> r = homoglyph.search("FALC0N5 fly", Arrays.asList("CHEESE", "SOIL", "FALCONS"));
70+
List<SearchResult> r = homoglyph.search("FALC0N5 fly", Arrays.asList("CHEESE", "SOIL", "FALCONS"));
7271
assertEquals(1, r.size());
7372
checkResult(r.get(0), 0, "FALCONS", "FALC0N5");
7473
}
7574

7675
@Test
7776
public void whenTargetWordIsAtEndOfText_thenMatchFound(){
78-
List<Homoglyph.SearchResult> r = homoglyph.search("I like FALC0N5", Arrays.asList("CHEESE", "SOIL", "FALCONS"));
77+
List<SearchResult> r = homoglyph.search("I like FALC0N5", Arrays.asList("CHEESE", "SOIL", "FALCONS"));
7978
assertEquals(1, r.size());
8079
checkResult(r.get(0), 7, "FALCONS", "FALC0N5");
8180
}
8281

8382
@Test
8483
public void whenTargetWordHasDifferentCaseInText_thenMatchFound(){
85-
List<Homoglyph.SearchResult> r = homoglyph.search("I like fALc0N5 fly", "Falcons");
84+
List<SearchResult> r = homoglyph.search("I like fALc0N5 fly", "Falcons");
8685
assertEquals(1, r.size());
8786
checkResult(r.get(0), 7, "Falcons", "fALc0N5");
8887
}
8988

9089
@Test
9190
public void whenTargetWordContainsMultipleMatchesWithDifferentHomoglyphs_thenMatchFound(){
92-
List<Homoglyph.SearchResult> r = homoglyph.search("I have 501L and FALC0N5 in my garden, I prefer the SO|L", Arrays.asList("CHEESE", "SOIL", "FALCONS"));
91+
List<SearchResult> r = homoglyph.search("I have 501L and FALC0N5 in my garden, I prefer the SO|L", Arrays.asList("CHEESE", "SOIL", "FALCONS"));
9392
assertEquals(3, r.size());
9493
checkResult(r.get(0), 7, "SOIL", "501L");
9594
checkResult(r.get(1), 51, "SOIL", "SO|L");
9695
checkResult(r.get(2), 16, "FALCONS", "FALC0N5");
9796
}
9897

99-
private void checkResult(Homoglyph.SearchResult result, int expectedIndex, String expectedWord, String expectedMatch){
98+
private void checkResult(SearchResult result, int expectedIndex, String expectedWord, String expectedMatch){
10099
assertEquals(expectedIndex, result.index);
101100
assertEquals(expectedWord, result.word);
102101
assertEquals(expectedMatch, result.match);

0 commit comments

Comments
 (0)