Skip to content

Commit 164548f

Browse files
committed
feat: exclude IP addresses from link extraction
1 parent 27d7c4d commit 164548f

File tree

1 file changed

+27
-5
lines changed

1 file changed

+27
-5
lines changed

rook/src/git/link_extractor.rs

Lines changed: 27 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,8 @@ use std::collections::HashSet;
44

55
use crate::RepoManager;
66

7-
const REGEX_URL: &str = r"https?://(www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)";
7+
const REGEX_DOMAIN: &str = r"https?://(www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)";
8+
const REGEX_IP_ADDRESS: &str = r"https?://(localhost|(?:\d{1,3}\.){3}\d{1,3})(?::\d+)?";
89

910
#[derive(Debug, Clone, Serialize, Deserialize)]
1011
/// Represents a hyperlink found in a repository, along with its location.
@@ -65,11 +66,16 @@ pub fn extract_links_from_repo_url(
6566
}
6667

6768
fn find_link_in_content(content: &str, file_path: String) -> HashSet<LinkInfo> {
68-
let url_regex = Regex::new(REGEX_URL).unwrap();
69+
let domain_regex = Regex::new(REGEX_DOMAIN).unwrap();
70+
let ip_address_regex = Regex::new(REGEX_IP_ADDRESS).unwrap();
6971
let mut result = HashSet::new();
7072

7173
for (line_num, line) in content.lines().enumerate() {
72-
for mat in url_regex.find_iter(line) {
74+
for mat in domain_regex.find_iter(line) {
75+
if ip_address_regex.is_match(mat.as_str()) {
76+
continue;
77+
}
78+
7379
let url = mat
7480
.as_str()
7581
.trim_end_matches(&[')', '>', '.', ',', ';'][..])
@@ -124,6 +130,22 @@ mod tests {
124130
}
125131
}
126132

133+
#[test]
134+
fn test_skip_ip_addresses() {
135+
let content = r#"
136+
http://192.168.1.1
137+
http://192.168.1.1/path
138+
http://192.168.1.1/path?param=value
139+
this is localhost ip address http://127.0.0.1
140+
front server http://localhost:3000
141+
backend server http://localhost:8080
142+
"#;
143+
144+
let file_path = "test.txt".to_string();
145+
let links = find_link_in_content(content, file_path);
146+
assert!(links.is_empty(), "Expected no links");
147+
}
148+
127149
#[test]
128150
fn test_link_info_uniqueness() {
129151
let mut links = HashSet::new();
@@ -193,10 +215,10 @@ mod tests {
193215

194216
assert!(!result.is_empty(), "No links found in the repository");
195217

196-
let url_regex = Regex::new(REGEX_URL).unwrap();
218+
let domain_regex = Regex::new(REGEX_DOMAIN).unwrap();
197219
for link in &result {
198220
assert!(
199-
url_regex.is_match(&link.url),
221+
domain_regex.is_match(&link.url),
200222
"Invalid URL found: {} at {}:{}",
201223
link.url,
202224
link.file_path,

0 commit comments

Comments
 (0)