Skip to content

Commit e1e3568

Browse files
authored
Merge pull request github#17807 from github/tausbn/python-fix-string-encoding-dataset-check-failure
Python: Fix string encoding dataset check failure
2 parents 197642c + ae4a4bb commit e1e3568

File tree

3 files changed

+38
-1
lines changed

3 files changed

+38
-1
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
"\uD800"
2+
"?"
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/bin/bash

# Regression test: build a Python CodeQL database from repo_dir/ and run a
# dataset check on the result. Any failing command aborts the script.

# Strict mode; see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
set -Eeuo pipefail

# Trace every command as it is executed.
set -x

# CodeQL CLI to use; falls back to `codeql` when CODEQL is unset or empty.
CODEQL="${CODEQL:-codeql}"

# Work from the directory holding this script so that the relative paths
# `db` and `repo_dir/` below resolve next to it.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)"
cd "${script_dir}"

# Discard any database left behind by an earlier run.
rm -rf db

# $CODEQL is intentionally left unquoted, exactly as callers may rely on today.
$CODEQL database create db --language python --source-root repo_dir/

$CODEQL dataset check db/db-python

echo 'Test successfully completed.'

python/extractor/semmle/python/passes/objects.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,23 @@
4343

4444
LITERALS = (ast.Num, ast.Str)
4545

46+
# A variant of the 'replace' error handler that replaces unencodable characters with U+FFFD
47+
# rather than '?'. Without this, a string like '\uD800' (which is not encodable) would get mapped
48+
# to '?', and potentially clash with the regular string '?' if it appeared elsewhere in the source
49+
# code. Used in 'get_label_for_object' below. Based on code from https://peps.python.org/pep-0293/
50+
def fffd_replace(exc):
    '''Codec error handler registered under the name "fffdreplace".

    Behaves like the built-in 'replace' handler, except that the replacement
    text is the six-character ASCII sequence backslash-u-f-f-f-d instead
    of '?'.

    For encode and translate errors, one copy of the replacement text is
    emitted per offending character; for decode errors a single copy is
    emitted. In every case processing resumes at `exc.end`.

    NOTE(review): the replacement is the escaped text, not the code point
    U+FFFD itself. It is kept as-is on purpose: CPython's UTF-8 encoder
    appears to reject non-ASCII `str` replacements returned by an error
    handler -- confirm before changing it to the real character.

    Returns a `(replacement, resume_position)` tuple, as required by
    `codecs.register_error`.

    Raises TypeError if `exc` is not a UnicodeEncodeError,
    UnicodeDecodeError or UnicodeTranslateError.
    '''
    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return ((exc.end - exc.start) * u"\\ufffd", exc.end)
    if isinstance(exc, UnicodeDecodeError):
        return (u"\\ufffd", exc.end)
    # `exc` is an instance, so the class name must be read from its type;
    # instances have no `__name__` attribute of their own.
    raise TypeError("can't handle %s" % type(exc).__name__)


import codecs
codecs.register_error("fffdreplace", fffd_replace)
62+
4663
class _CObject(object):
4764
'''Utility class to wrap arbitrary C objects.
4865
Treat all objects as unique. Rely on naming in the
@@ -239,7 +256,7 @@ def get_label_for_object(self, obj, default_label, obj_type):
239256
else:
240257
prefix = u"C_bytes$"
241258
if t is str:
242-
obj = obj.encode("utf8", errors='replace')
259+
obj = obj.encode("utf8", errors='fffdreplace')
243260
return prefix + hashlib.sha1(obj).hexdigest()
244261
if t is bytes:
245262
return prefix + hashlib.sha1(obj).hexdigest()

0 commit comments

Comments
 (0)