|
26 | 26 | from __future__ import print_function |
27 | 27 | from __future__ import unicode_literals |
28 | 28 |
|
| 29 | +import codecs |
| 30 | +import json |
| 31 | +import os |
| 32 | +import re |
29 | 33 | import unicodedata |
30 | 34 |
|
31 | 35 | import chardet |
32 | 36 |
|
| 37 | +from commoncode.system import on_linux |
33 | 38 | from textcode import pdf |
34 | 39 | from textcode import markup |
35 | 40 | from textcode import strings |
|
41 | 46 | All internal processing assumes unicode in and out. |
42 | 47 | """ |
43 | 48 |
|
# Tracing flag: enable by setting the SCANCODE_DEBUG_TEXT_ANALYSIS environment variable.
TRACE = False or os.environ.get('SCANCODE_DEBUG_TEXT_ANALYSIS', False)


def logger_debug(*args):
    """
    No-op tracing stub; replaced by a real debug logger below when TRACE is on.
    """
    pass


if TRACE:
    import logging
    import sys

    logger = logging.getLogger(__name__)
    logging.basicConfig(stream=sys.stdout)
    logger.setLevel(logging.DEBUG)

    def logger_debug(*args):
        # Join args into one space-separated message, repr-ing anything that
        # is not already unicode. Use a conditional expression, not the
        # `cond and a or b` idiom which would repr an empty unicode string.
        return logger.debug(' '.join(a if isinstance(a, unicode) else repr(a) for a in args))
44 | 69 |
|
45 | 70 | def text_lines(location, demarkup=False): |
46 | 71 | """ |
@@ -84,18 +109,33 @@ def text_lines(location, demarkup=False): |
84 | 109 | # try again later with as plain text |
85 | 110 | pass |
86 | 111 |
|
87 | | - # TODO: handle minified JS and single JSON such as map files |
88 | | - |
89 | 112 | # TODO: handle Office-like documents, RTF, etc |
90 | 113 | # if T.is_doc: |
91 | 114 | # return unicode_text_lines_from_doc(location) |
92 | 115 |
|
| 116 | + if T.is_js_map: |
| 117 | + try: |
| 118 | + return js_map_sources_lines(location) |
| 119 | + except: |
| 120 | + # try again later with as plain text |
| 121 | + pass |
| 122 | + |
93 | 123 | if T.is_text: |
94 | | - return unicode_text_lines(location) |
| 124 | + lines = unicode_text_lines(location) |
| 125 | + # text with very long lines such minified JS, JS map files or large JSON |
| 126 | + locale = b'locale' if on_linux else u'locale' |
| 127 | + package_json = b'package.json' if on_linux else u'package.json' |
| 128 | + |
| 129 | + if (not location.endswith(package_json) |
| 130 | + and (T.is_text_with_long_lines or T.is_compact_js |
| 131 | + or T.filetype_file == 'data' or locale in location)): |
| 132 | + |
| 133 | + lines = break_unicode_text_lines(lines) |
| 134 | + return lines |
95 | 135 |
|
96 | 136 | # DO NOT introspect media, archives and compressed files |
97 | | -# if not T.contains_text: |
98 | | -# return iter([]) |
| 137 | + # if not T.contains_text: |
| 138 | + # return iter([]) |
99 | 139 |
|
100 | 140 | if T.is_binary: |
101 | 141 | # fall back to binary |
@@ -128,6 +168,47 @@ def unicode_text_lines_from_pdf(location): |
128 | 168 | yield as_unicode(line) |
129 | 169 |
|
130 | 170 |
|
def break_unicode_text_lines(lines, split=u'([",\'])', max_len=200, chunk_len=30):
    """
    Yield unicode text lines from the `lines` iterable, breaking lines longer
    than `max_len` characters into smaller chunks of at most `chunk_len`
    fragments each. Fragments are obtained by splitting on the `split` regex
    pattern; because the pattern uses a capturing group, the delimiters are
    kept and the concatenation of all yielded chunks equals the original line.
    Lines of `max_len` characters or fewer are yielded unchanged.
    """
    splitter = re.compile(split).split
    for line in lines:
        if len(line) > max_len:
            # split then reassemble in more reasonably-sized chunks
            segments = splitter(line)
            # plain range (not Python-2-only xrange): the materialized list
            # is small here and this is portable to Python 3
            for start in range(0, len(segments), chunk_len):
                yield u''.join(segments[start:start + chunk_len])
        else:
            yield line
| 186 | + |
def js_map_sources_lines(location):
    """
    Yield unicode text lines for the original sources embedded in the js.map
    or css.map JSON file at `location`.
    Spec is at:
    https://docs.google.com/document/d/1U1RGAehQwRypUTovF1KRlpiOFze0b-_2gc6fAH0KY0k/edit
    The format is:
    {
        "version" : 3,
        "file": "out.js",
        "sourceRoot": "",
        "sources": ["foo.js", "bar.js"],
        "sourcesContent": [null, null],
        "names": ["src", "maps", "are", "fun"],
        "mappings": "A,AAAB;;ABCDE;"
    }
    We care only about the "sourcesContent" values. Raises (lazily, on first
    iteration) on invalid JSON or a missing "sourcesContent" key: callers
    catch this and fall back to plain text processing.
    """
    with codecs.open(location, 'rb', encoding='utf-8') as jsm:
        content = json.load(jsm)
    for entry in content['sourcesContent']:
        # per the spec, entries are null for sources that were not embedded:
        # skip these instead of crashing on None.splitlines()
        if entry:
            for line in entry.splitlines():
                yield line
| 211 | + |
131 | 212 | def as_unicode(line): |
132 | 213 | """ |
133 | 214 | Return a unicode text line from a text line. |
|
0 commit comments