Skip to content

Commit b94dec9

Browse files
swarnim-deepsource, raghav-deepsource, prajwal-deepsource
authored
fix: escaping in languages & meaningful src lines (#7)
* fix: escaping in languages & meaningful src lines * add: a lot more tests * add: more tests * add: yet more tests * add: moooore tests * chore: cleanup the tests * chore: cleanup package description * chore: code cleanup and more tests * fix: add description to Dracula Signed-off-by: Swarnim Arun <[email protected]> * fix: add description to pydracula Signed-off-by: Swarnim Arun <[email protected]> * Apply suggestions from code review Signed-off-by: raghav-deepsource <[email protected]> * fix: repeat dsl * fix: c tests as per updated lang parsing rules * fix: as suggested * add: support for more types of python `str`s * fix: let `tests.rs` showup in gitdiff * fix: as suggested Co-authored-by: Prajwal S N <[email protected]> Co-authored-by: raghav-deepsource <[email protected]> Signed-off-by: Swarnim Arun <[email protected]> * add: more tests for python and, - fix line-ending on EOF bug with pre-exact - add golden files for testing cleaned_src for python --------- Signed-off-by: Swarnim Arun <[email protected]> Signed-off-by: raghav-deepsource <[email protected]> Co-authored-by: raghav-deepsource <[email protected]> Co-authored-by: Prajwal S N <[email protected]>
1 parent 2a80b51 commit b94dec9

File tree

16 files changed

+1229
-188
lines changed

16 files changed

+1229
-188
lines changed

Cargo.lock

Lines changed: 342 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,21 +4,32 @@ members = [
44
"pydracula"
55
]
66

7-
[package]
8-
name = "dracula"
7+
[workspace.package]
8+
edition = "2021"
99
description = "🧛 Count-ing lines, AH AH AHH!"
10-
version = "0.1.2"
10+
version = "0.1.3"
1111
authors = ["Swarnim Arun <[email protected]>"]
12-
edition = "2021"
1312
license-file = "LICENSE"
1413
documentation = "https://docs.rs/dracula"
1514
homepage = "https://github.com/deepsourcelabs/dracula"
1615
repository = "https://github.com/deepsourcelabs/dracula"
1716
keywords = ["parsing", "line-count"]
1817
categories = ["command-line-utilities", "text-processing"]
1918

19+
20+
[package]
21+
name = "dracula"
22+
version.workspace = true
23+
authors.workspace = true
24+
edition.workspace = true
25+
2026
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
2127

28+
[dev-dependencies]
29+
ra_ap_syntax = "0.0.149"
30+
pretty_assertions = "1.3.0"
31+
letr = "0.2.1"
32+
2233
[dependencies.log]
2334
version = "0.4"
2435
optional = true

cdracula/Cargo.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
[package]
22
name = "cdracula"
3-
description = "🧛 Count-ing lines, AH AH AHH!"
4-
version = "0.1.2"
5-
authors = ["Swarnim Arun <[email protected]>"]
6-
edition = "2021"
3+
version.workspace = true
4+
description.workspace = true
5+
authors.workspace = true
6+
edition.workspace = true
77

88
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
99
[lib]

cdracula/src/util_macros.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ macro_rules! languages_supported {
2121
$(
2222
if idx == $num {
2323
return Some(
24-
dracula::count::get_cleaned_source_code::<dracula::langs::$name>(src)
24+
dracula::count::get_cleaned_source_code::<dracula::langs::$name>(src).unwrap_or_else(|| src.to_string())
2525
);
2626
}
2727
)+

cdracula/tests/test_capi.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ int main() {
139139
) + "\0")
140140
.into(),
141141
);
142-
assert_eq!(get_meaningful_line_count(src.as_ptr(), C_LANG, 0), 4);
142+
assert_eq!(get_meaningful_line_count(src.as_ptr(), C_LANG, 0), 3);
143143
}
144144
}
145145

@@ -164,7 +164,7 @@ int main() {
164164
let mut len = 0u64;
165165
let ptr = meaningful_lines(src.as_ptr(), C_LANG, 0, &mut len as *mut u64);
166166
let v = Vec::from_raw_parts(ptr, len as _, len as _);
167-
assert_eq!(&v, &[2, 4, 7, 8]);
167+
assert_eq!(&v, &[2, 4, 7]);
168168
}
169169
}
170170

@@ -193,7 +193,7 @@ int main() {
193193
r#"int main() {
194194
return 0;
195195
int x = 10;
196-
}"#,)
196+
"#,)
197197
);
198198
}
199199
}

pydracula/Cargo.toml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
[package]
22
name = "pydracula"
3-
version = "0.1.2"
4-
authors = ["Swarnim Arun <[email protected]>"]
5-
edition = "2018"
3+
version.workspace = true
4+
authors.workspace = true
5+
description.workspace = true
6+
edition.workspace = true
67

78
[lib]
89
name = "pydracula"

pydracula/src/lib.rs

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,27 @@ impl Lang {
2222
Lang::Java => get_meaningful_line_indices::<Java>(src).flatten().collect(),
2323
}
2424
}
25+
#[rustfmt::skip]
2526
fn get_cleaned_source_code(&self, src: &str) -> String {
2627
use dracula::count::*;
2728
use dracula::langs::*;
2829
match self {
29-
Lang::Python => get_cleaned_source_code::<Python>(src),
30-
Lang::Rust => get_cleaned_source_code::<Rust>(src),
31-
Lang::C => get_cleaned_source_code::<C>(src),
32-
Lang::Java => get_cleaned_source_code::<Java>(src),
30+
Lang::Python => {
31+
get_cleaned_source_code::<Python>(src)
32+
.unwrap_or_else(|| src.to_string())
33+
}
34+
Lang::Rust => {
35+
get_cleaned_source_code::<Rust>(src)
36+
.unwrap_or_else(|| src.to_string())
37+
},
38+
Lang::C => {
39+
get_cleaned_source_code::<C>(src)
40+
.unwrap_or_else(|| src.to_string())
41+
},
42+
Lang::Java => {
43+
get_cleaned_source_code::<Java>(src)
44+
.unwrap_or_else(|| src.to_string())
45+
},
3346
}
3447
}
3548
fn get_count_of_meaningful_lines(&self, src: &str) -> usize {

src/count.rs

Lines changed: 53 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,24 @@ pub struct ParseLineMeaningfulIndexIter<'a, L: Language> {
1515
line_span: Span,
1616
parse_span: Span,
1717
line_index: usize,
18+
max_lines: usize,
1819
last_parsed_output: Option<ParseOutput<'a>>,
20+
failed: bool,
1921
}
2022

2123
impl<'a, L: Language> Iterator for ParseLineMeaningfulIndexIter<'a, L> {
2224
type Item = Option<usize>;
2325

2426
fn next(&mut self) -> Option<Self::Item> {
27+
// `Self::Item` is `Option<usize>` because a line may not be meaningful; such a line yields `None`.
28+
if self.failed {
29+
let li = self.line_index;
30+
if li > self.max_lines {
31+
return None;
32+
} else {
33+
return Some(Some(li));
34+
}
35+
}
2536
if self.line_span.end >= self.src.len() {
2637
return None;
2738
}
@@ -70,6 +81,8 @@ impl<'a, L: Language> Iterator for ParseLineMeaningfulIndexIter<'a, L> {
7081
}
7182
}
7283

84+
/// Builds the iterator [`ParseLineMeaningfulIndexIter`] to run over lines of src while
85+
/// determining which of those lines are meaningful.
7386
pub fn get_meaningful_line_indices<L: Language + 'static>(
7487
src: &str,
7588
) -> ParseLineMeaningfulIndexIter<L> {
@@ -80,14 +93,20 @@ pub fn get_meaningful_line_indices<L: Language + 'static>(
8093
parse_span: Span::default(),
8194
line_index: 0,
8295
last_parsed_output: None,
96+
failed: false,
97+
max_lines: src.lines().count(),
8398
}
8499
}
85100

86-
pub fn get_cleaned_source_code<L: Language>(src: &str) -> String {
101+
/// Uses the [`Parser`] to extract the meaningful parts of the source; returns `None` if the parser yields invalid output.
102+
pub fn get_cleaned_source_code<L: Language>(src: &str) -> Option<String> {
87103
let parsed = L::get_parser(src);
88104
let mut meaningful_src = String::default();
89105
let mut stack = vec![];
90106
for p in parsed {
107+
if matches!(p, ParseOutput::Invalid(..)) {
108+
return None;
109+
}
91110
if matches!(p, ParseOutput::EOL(_) | ParseOutput::EOF) {
92111
let meaningful_src_len = meaningful_src.len();
93112
for po in stack.iter() {
@@ -97,61 +116,50 @@ pub fn get_cleaned_source_code<L: Language>(src: &str) -> String {
97116
}
98117
}
99118
}
100-
if matches!(p, ParseOutput::EOL(_))
101-
&& meaningful_src_len != meaningful_src.len()
102-
{
119+
if matches!(p, ParseOutput::EOL(_)) && meaningful_src_len != meaningful_src.len() {
103120
meaningful_src.push('\n');
104121
}
105122
stack.clear();
106123
} else {
107124
stack.push(p);
108125
}
109126
}
110-
meaningful_src
127+
Some(meaningful_src)
111128
}
112129

113-
pub fn get_count_of_meaningful_lines<L: Language>(src: &str) -> usize {
114-
let parsed = L::get_parser(src);
115-
let mut line_count: usize = 0;
116-
let mut stack = vec![];
117-
for p in parsed {
118-
if matches!(p, ParseOutput::EOL(_) | ParseOutput::EOF) {
119-
if stack.iter().any(L::is_meaningful) {
120-
line_count += 1;
121-
}
122-
// We clear the stack once we reach the end of a line.
123-
stack.clear();
124-
} else {
125-
// we accumulate tokens we see as meaningful tokens for the language.
126-
stack.push(p);
127-
}
128-
}
129-
line_count
130+
/// Uses the [`get_meaningful_line_indices`] function to build an iterator
131+
/// and count all meaningful lines
132+
pub fn get_count_of_meaningful_lines<L: Language + 'static>(src: &str) -> usize {
133+
get_meaningful_line_indices::<L>(src).flatten().count()
130134
}
131135

132-
#[test]
133-
fn test_halting_get_count_of_meaningful_lines() {
134-
get_count_of_meaningful_lines::<crate::langs::C>("");
135-
get_count_of_meaningful_lines::<crate::langs::Rust>("");
136-
get_count_of_meaningful_lines::<crate::langs::Python>("");
137-
get_count_of_meaningful_lines::<crate::langs::Java>("");
138-
}
136+
/// No halting tests for [`get_count_of_meaningful_lines`] as it uses
137+
/// the iterator provided by [`get_meaningful_line_indices`]
138+
#[cfg(test)]
139+
mod halting_tests_count_api {
140+
use super::*;
139141

140-
#[test]
141-
fn test_halting_get_cleaned_source_code() {
142-
get_cleaned_source_code::<crate::langs::C>("");
143-
get_cleaned_source_code::<crate::langs::Rust>("");
144-
get_cleaned_source_code::<crate::langs::Python>("");
145-
get_cleaned_source_code::<crate::langs::Java>("");
146-
}
142+
#[test]
143+
fn test_halting_get_cleaned_source_code() {
144+
get_cleaned_source_code::<crate::langs::C>("\nint main() {}\n");
145+
get_cleaned_source_code::<crate::langs::Rust>("\nfn main() {}\n");
146+
get_cleaned_source_code::<crate::langs::Python>("\ndef main():\n\tpass");
147+
get_cleaned_source_code::<crate::langs::Java>("\nvoid main() {}\n");
148+
}
147149

148-
#[test]
149-
fn test_halting_get_meaningful_line_indices() {
150-
get_meaningful_line_indices::<crate::langs::C>("
151-
int main() {}
152-
").flatten().for_each(|_| ());
153-
get_meaningful_line_indices::<crate::langs::Rust>("").flatten().for_each(|_| ());
154-
get_meaningful_line_indices::<crate::langs::Python>("").flatten().for_each(|_| ());
155-
get_meaningful_line_indices::<crate::langs::Java>("").flatten().for_each(|_| ());
150+
#[test]
151+
fn test_halting_get_meaningful_line_indices() {
152+
get_meaningful_line_indices::<crate::langs::C>("\nint main() {}\n")
153+
.flatten()
154+
.for_each(|_| ());
155+
get_meaningful_line_indices::<crate::langs::Rust>("")
156+
.flatten()
157+
.for_each(|_| ());
158+
get_meaningful_line_indices::<crate::langs::Python>("")
159+
.flatten()
160+
.for_each(|_| ());
161+
get_meaningful_line_indices::<crate::langs::Java>("")
162+
.flatten()
163+
.for_each(|_| ());
164+
}
156165
}
157-

src/fixtures/more_python_tests.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# pylint:disable=pointless-string-statement, redundant-u-string-prefix
2+
"""Test for backslash escapes in byte vs unicode strings"""
3+
4+
# Would be valid in Unicode, but probably not what you want otherwise
5+
BAD_UNICODE = b'\u0042' # [anomalous-unicode-escape-in-string]
6+
BAD_LONG_UNICODE = b'\U00000042' # [anomalous-unicode-escape-in-string]
7+
# +1:[anomalous-unicode-escape-in-string]
8+
BAD_NAMED_UNICODE = b'\N{GREEK SMALL LETTER ALPHA}'
9+
10+
GOOD_UNICODE = u'\u0042'
11+
GOOD_LONG_UNICODE = u'\U00000042'
12+
GOOD_NAMED_UNICODE = u'\N{GREEK SMALL LETTER ALPHA}'
13+
14+
15+
# Valid raw strings
16+
RAW_BACKSLASHES = r'raw'
17+
18+
# In a comment you can have whatever you want: \ \\ \n \m
19+
# even things that look like bad strings: "C:\Program Files"
20+
21+
22+
"""Test for anomalous backslash escapes in strings"""
23+
24+
BAD_ESCAPE = '\z' # [anomalous-backslash-in-string]
25+
BAD_ESCAPE_NOT_FIRST = 'abc\z' # [anomalous-backslash-in-string]
26+
BAD_ESCAPE_WITH_PREFIX = b'abc\z' # [anomalous-backslash-in-string]
27+
BAD_ESCAPE_WITH_BACKSLASH = b'a\
28+
\z' # [anomalous-backslash-in-string]
29+
# +3:[anomalous-backslash-in-string]
30+
BAD_ESCAPE_BLOCK = b'''
31+
abc
32+
\z
33+
'''
34+
BAD_ESCAPE_PARENS = (b'abc'
35+
b'\z') # [anomalous-backslash-in-string]
36+
GOOD_ESCAPE = '\b'
37+
38+
# Valid raw strings
39+
BAD_ESCAPE_BUT_RAW = r'\z'
40+
41+
# In a comment you can have whatever you want: \z
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
BAD_UNICODE =
2+
BAD_LONG_UNICODE =
3+
BAD_NAMED_UNICODE =
4+
GOOD_UNICODE =
5+
GOOD_LONG_UNICODE =
6+
GOOD_NAMED_UNICODE =
7+
RAW_BACKSLASHES =
8+
BAD_ESCAPE =
9+
BAD_ESCAPE_NOT_FIRST =
10+
BAD_ESCAPE_WITH_PREFIX =
11+
BAD_ESCAPE_WITH_BACKSLASH =
12+
BAD_ESCAPE_BLOCK =
13+
BAD_ESCAPE_PARENS = (
14+
)
15+
GOOD_ESCAPE =
16+
BAD_ESCAPE_BUT_RAW =

0 commit comments

Comments
 (0)