Skip to content

Commit 96b5bb2

Browse files
committed
Break lines in text files with long lines
* some large JS map and minified JS files can exhibit some weird behavior as they can be huge (yet they contain copyrights and licenses) * these file types are detected and various heuristics are applied to avoid having very long lines (a line is the unit of work for license and copyright detection) * this addresses issues reported in #958 Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 2377ee8 commit 96b5bb2

File tree

5 files changed

+166
-134
lines changed

5 files changed

+166
-134
lines changed

src/textcode/analysis.py

Lines changed: 86 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,15 @@
2626
from __future__ import print_function
2727
from __future__ import unicode_literals
2828

29+
import codecs
30+
import json
31+
import os
32+
import re
2933
import unicodedata
3034

3135
import chardet
3236

37+
from commoncode.system import on_linux
3338
from textcode import pdf
3439
from textcode import markup
3540
from textcode import strings
@@ -41,6 +46,26 @@
4146
All internal processing assumes unicode in and out.
4247
"""
4348

49+
# Tracing flag: tracing is enabled when the SCANCODE_DEBUG_TEXT_ANALYSIS
# environment variable is set to any non-empty value.
TRACE = False or os.environ.get('SCANCODE_DEBUG_TEXT_ANALYSIS', False)


def logger_debug(*args):
    """
    Do nothing: this is the default no-op debug logger used when TRACE is not
    enabled.
    """
    pass


if TRACE:
    import logging
    import sys

    logger = logging.getLogger(__name__)
    logging.basicConfig(stream=sys.stdout)
    logger.setLevel(logging.DEBUG)

    def logger_debug(*args):
        """
        Log `args` at the DEBUG level as a single space-separated message.
        Unicode arguments are logged as-is and any other argument is logged
        as its repr().
        """
        # A conditional expression is used rather than the `x and a or b`
        # idiom, which would wrongly fall through to repr() for an empty
        # unicode string.
        return logger.debug(' '.join(a if isinstance(a, unicode) else repr(a) for a in args))
68+
4469

4570
def text_lines(location, demarkup=False):
4671
"""
@@ -84,18 +109,33 @@ def text_lines(location, demarkup=False):
84109
# try again later with as plain text
85110
pass
86111

87-
# TODO: handle minified JS and single JSON such as map files
88-
89112
# TODO: handle Office-like documents, RTF, etc
90113
# if T.is_doc:
91114
# return unicode_text_lines_from_doc(location)
92115

116+
if T.is_js_map:
117+
try:
118+
return js_map_sources_lines(location)
119+
except:
120+
# try again later with as plain text
121+
pass
122+
93123
if T.is_text:
94-
return unicode_text_lines(location)
124+
lines = unicode_text_lines(location)
125+
# text with very long lines such minified JS, JS map files or large JSON
126+
locale = b'locale' if on_linux else u'locale'
127+
package_json = b'package.json' if on_linux else u'package.json'
128+
129+
if (not location.endswith(package_json)
130+
and (T.is_text_with_long_lines or T.is_compact_js
131+
or T.filetype_file == 'data' or locale in location)):
132+
133+
lines = break_unicode_text_lines(lines)
134+
return lines
95135

96136
# DO NOT introspect media, archives and compressed files
97-
# if not T.contains_text:
98-
# return iter([])
137+
# if not T.contains_text:
138+
# return iter([])
99139

100140
if T.is_binary:
101141
# fall back to binary
@@ -128,6 +168,47 @@ def unicode_text_lines_from_pdf(location):
128168
yield as_unicode(line)
129169

130170

171+
def break_unicode_text_lines(lines, split=u'([",\'])', max_len=200, chunk_len=30):
    """
    Yield text lines from the `lines` iterable of unicode strings, breaking
    long lines on `split`.

    A line of at most `max_len` characters is yielded unchanged. A longer line
    is split with the `split` regular expression and the resulting pieces are
    reassembled and yielded in groups of `chunk_len` pieces. With the default
    `split` pattern, the separators are captured, so they are kept in the
    output and joining all the chunks of a line gives back the original line.

    Note that `chunk_len` counts pieces, not characters: a long line that
    contains no `split` match is yielded as a single unchanged chunk.
    """
    splitter = re.compile(split).split
    for line in lines:
        if len(line) <= max_len:
            yield line
            continue

        # split then reassemble in more reasonable chunks
        pieces = splitter(line)
        # `range` is used rather than `xrange` so that this works on both
        # Python 2 and Python 3.
        for start in range(0, len(pieces), chunk_len):
            yield u''.join(pieces[start:start + chunk_len])
185+
186+
187+
def js_map_sources_lines(location):
    """
    Yield unicode text lines from the js.map or css.map file at `location`.

    The file is loaded as UTF-8 encoded JSON and only the text found under its
    "sourcesContent" key is returned, one line at a time. Nothing is yielded
    if "sourcesContent" is missing or null. Null or empty entries in that
    list are skipped.

    Spec is at:
    https://docs.google.com/document/d/1U1RGAehQwRypUTovF1KRlpiOFze0b-_2gc6fAH0KY0k/edit
    The format is:
        {
            "version" : 3,
            "file": "out.js",
            "sourceRoot": "",
            "sources": ["foo.js", "bar.js"],
            "sourcesContent": [null, null],
            "names": ["src", "maps", "are", "fun"],
            "mappings": "A,AAAB;;ABCDE;"
        }
    We care only about the presence of these tags for detection: version, sources, sourcesContent.

    Since this is a generator, any error from opening, decoding or parsing the
    file is raised when the returned iterator is first consumed, not when this
    function is called.
    """
    # Load everything first so the file is closed before any line is yielded.
    with codecs.open(location, 'rb', encoding='utf-8') as jsm:
        content = json.load(jsm)

    # "sourcesContent" may be absent or null: treat both as no sources.
    sources = content.get('sourcesContent') or []
    for entry in sources:
        if not entry:
            # A null entry has no text to split (see the example above) and an
            # empty entry has no lines.
            continue
        for line in entry.splitlines():
            yield line
210+
211+
131212
def as_unicode(line):
132213
"""
133214
Return a unicode text line from a text line.

src/textcode/js.py

Lines changed: 0 additions & 114 deletions
This file was deleted.

src/textcode/markup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def demarkup(location):
103103
from textcode.analysis import unicode_text_lines
104104

105105
for line in unicode_text_lines(location):
106-
yield (demarkup_text(line))
106+
yield demarkup_text(line)
107107

108108

109109
def demarkup_text(text):

0 commit comments

Comments
 (0)