Skip to content

Commit 49a010c

Browse files
authored
Ability to insert file contents as text, in addition to blob (#321)
1 parent 9258f4b commit 49a010c

File tree

3 files changed

+101
-16
lines changed

3 files changed

+101
-16
lines changed

docs/cli.rst

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -331,7 +331,7 @@ The CSV data that was piped into the script is available in the ``stdin`` table,
331331
\-\-schema, \-\-analyze, \-\-dump and \-\-save
332332
----------------------------------------------
333333

334-
To see the in-memory datbase schema that would be used for a file or for multiple files, use ``--schema``::
334+
To see the in-memory database schema that would be used for a file or for multiple files, use ``--schema``::
335335

336336
% sqlite-utils memory dogs.csv --schema
337337
CREATE TABLE [dogs] (
@@ -909,12 +909,10 @@ The command will fail if you reference columns that do not exist on the table. T
909909

910910
.. _cli_insert_files:
911911

912-
Inserting binary data from files
913-
================================
914-
915-
SQLite ``BLOB`` columns can be used to store binary content. It can be useful to insert the contents of files into a SQLite table.
912+
Inserting data from files
913+
=========================
916914

917-
The ``insert-files`` command can be used to insert the content of files, along with their metadata.
915+
The ``insert-files`` command can be used to insert the content of files, along with their metadata, into a SQLite table.
918916

919917
Here's an example that inserts all of the GIF files in the current directory into a ``gifs.db`` database, placing the file contents in an ``images`` table::
920918

@@ -932,6 +930,8 @@ By default this command will create a table with the following schema::
932930
[size] INTEGER
933931
);
934932

933+
Content will be treated as binary by default and stored in a ``BLOB`` column. You can use the ``--text`` option to store that content in a ``TEXT`` column instead.
934+
935935
You can customize the schema using one or more ``-c`` options. For a table schema that includes just the path, MD5 hash and last modification time of the file, you would use this::
936936

937937
$ sqlite-utils insert-files gifs.db images *.gif -c path -c md5 -c mtime --pk=path
@@ -944,6 +944,8 @@ This will result in the following schema::
944944
[mtime] FLOAT
945945
);
946946

947+
Note that there's no ``content`` column here at all - if you specify custom columns using ``-c`` you need to include ``-c content`` to create that column.
948+
947949
You can change the name of one of these columns using a ``-c colname:coldef`` parameter. To rename the ``mtime`` column to ``last_modified`` you would use this::
948950

949951
$ sqlite-utils insert-files gifs.db images *.gif \
@@ -967,6 +969,8 @@ The full list of column definitions you can use is as follows:
967969
The permission bits of the file, as an integer - you may want to convert this to octal
968970
``content``
969971
The binary file contents, which will be stored as a BLOB
972+
``content_text``
973+
The text file contents, which will be stored as TEXT
970974
``mtime``
971975
The modification time of the file, as floating point seconds since the Unix epoch
972976
``ctime``
@@ -988,7 +992,7 @@ You can insert data piped from standard input like this::
988992

989993
The ``-`` argument indicates data should be read from standard input. The string passed using the ``--name`` option will be used for the file name and path values.
990994

991-
When inserting data from standard input only the following column definitions are supported: ``name``, ``path``, ``content``, ``sha256``, ``md5`` and ``size``.
995+
When inserting data from standard input only the following column definitions are supported: ``name``, ``path``, ``content``, ``content_text``, ``sha256``, ``md5`` and ``size``.
992996

993997
.. _cli_convert:
994998

sqlite_utils/cli.py

Lines changed: 40 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1811,6 +1811,11 @@ def extract(
18111811
@click.option("--replace", is_flag=True, help="Replace files with matching primary key")
18121812
@click.option("--upsert", is_flag=True, help="Upsert files with matching primary key")
18131813
@click.option("--name", type=str, help="File name to use")
1814+
@click.option("--text", is_flag=True, help="Store file content as TEXT, not BLOB")
1815+
@click.option(
1816+
"--encoding",
1817+
help="Character encoding for input, defaults to utf-8",
1818+
)
18141819
@click.option("-s", "--silent", is_flag=True, help="Don't show a progress bar")
18151820
@load_extension_option
18161821
def insert_files(
@@ -1823,6 +1828,8 @@ def insert_files(
18231828
replace,
18241829
upsert,
18251830
name,
1831+
text,
1832+
encoding,
18261833
silent,
18271834
load_extension,
18281835
):
@@ -1842,7 +1849,10 @@ def insert_files(
18421849
--pk name
18431850
"""
18441851
if not column:
1845-
column = ["path:path", "content:content", "size:size"]
1852+
if text:
1853+
column = ["path:path", "content_text:content_text", "size:size"]
1854+
else:
1855+
column = ["path:path", "content:content", "size:size"]
18461856
if not pk:
18471857
pk = "path"
18481858

@@ -1866,14 +1876,26 @@ def yield_paths_and_relative_paths():
18661876
def to_insert():
18671877
for path, relative_path in bar:
18681878
row = {}
1869-
lookups = FILE_COLUMNS
1879+
# content_text is special case as it considers 'encoding'
1880+
1881+
def _content_text(p):
1882+
resolved = p.resolve()
1883+
try:
1884+
return resolved.read_text(encoding=encoding)
1885+
except UnicodeDecodeError as e:
1886+
raise UnicodeDecodeErrorForPath(e, resolved)
1887+
1888+
lookups = dict(FILE_COLUMNS, content_text=_content_text)
18701889
if path == "-":
18711890
stdin_data = sys.stdin.buffer.read()
18721891
# We only support a subset of columns for this case
18731892
lookups = {
18741893
"name": lambda p: name or "-",
18751894
"path": lambda p: name or "-",
18761895
"content": lambda p: stdin_data,
1896+
"content_text": lambda p: stdin_data.decode(
1897+
encoding or "utf-8"
1898+
),
18771899
"sha256": lambda p: hashlib.sha256(stdin_data).hexdigest(),
18781900
"md5": lambda p: hashlib.md5(stdin_data).hexdigest(),
18791901
"size": lambda p: len(stdin_data),
@@ -1899,9 +1921,16 @@ def to_insert():
18991921

19001922
db = sqlite_utils.Database(path)
19011923
_load_extensions(db, load_extension)
1902-
with db.conn:
1903-
db[table].insert_all(
1904-
to_insert(), pk=pk, alter=alter, replace=replace, upsert=upsert
1924+
try:
1925+
with db.conn:
1926+
db[table].insert_all(
1927+
to_insert(), pk=pk, alter=alter, replace=replace, upsert=upsert
1928+
)
1929+
except UnicodeDecodeErrorForPath as e:
1930+
raise click.ClickException(
1931+
UNICODE_ERROR.format(
1932+
"Could not read file '{}' as text\n\n{}".format(e.path, e.exception)
1933+
)
19051934
)
19061935

19071936

@@ -2149,6 +2178,12 @@ def _render_common(title, values):
21492178
return "\n".join(lines)
21502179

21512180

2181+
class UnicodeDecodeErrorForPath(Exception):
2182+
def __init__(self, exception, path):
2183+
self.exception = exception
2184+
self.path = path
2185+
2186+
21522187
FILE_COLUMNS = {
21532188
"name": lambda p: p.name,
21542189
"path": lambda p: str(p),

tests/test_insert_files.py

Lines changed: 50 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import os
44
import pathlib
55
import pytest
6+
import sys
67

78

89
@pytest.mark.parametrize("silent", (False, True))
@@ -23,6 +24,7 @@ def test_insert_files(silent):
2324
"md5",
2425
"mode",
2526
"content",
27+
"content_text",
2628
"mtime",
2729
"ctime",
2830
"mtime_int",
@@ -52,6 +54,7 @@ def test_insert_files(silent):
5254
)
5355
assert {
5456
"content": b"This is file one",
57+
"content_text": "This is file one",
5558
"md5": "556dfb57fce9ca301f914e2273adf354",
5659
"name": "one.txt",
5760
"path": "one.txt",
@@ -60,6 +63,7 @@ def test_insert_files(silent):
6063
}.items() <= one.items()
6164
assert {
6265
"content": b"Two is shorter",
66+
"content_text": "Two is shorter",
6367
"md5": "f86f067b083af1911043eb215e74ac70",
6468
"name": "two.txt",
6569
"path": "two.txt",
@@ -68,6 +72,7 @@ def test_insert_files(silent):
6872
}.items() <= two.items()
6973
assert {
7074
"content": b"Three is nested",
75+
"content_text": "Three is nested",
7176
"md5": "12580f341781f5a5b589164d3cd39523",
7277
"name": "three.txt",
7378
"path": os.path.join("nested", "three.txt"),
@@ -84,24 +89,65 @@ def test_insert_files(silent):
8489
"mtime_iso": str,
8590
"mode": int,
8691
"fullpath": str,
92+
"content": bytes,
93+
"content_text": str,
8794
}
8895
for colname, expected_type in expected_types.items():
8996
for row in (one, two, three):
9097
assert isinstance(row[colname], expected_type)
9198

9299

93-
def test_insert_files_stdin():
100+
@pytest.mark.parametrize(
101+
"use_text,encoding,input,expected",
102+
(
103+
(False, None, "hello world", b"hello world"),
104+
(True, None, "hello world", "hello world"),
105+
(False, None, b"S\xe3o Paulo", b"S\xe3o Paulo"),
106+
(True, "latin-1", b"S\xe3o Paulo", "S\xe3o Paulo"),
107+
),
108+
)
109+
def test_insert_files_stdin(use_text, encoding, input, expected):
94110
runner = CliRunner()
95111
with runner.isolated_filesystem():
96112
tmpdir = pathlib.Path(".")
97113
db_path = str(tmpdir / "files.db")
114+
args = ["insert-files", db_path, "files", "-", "--name", "stdin-name"]
115+
if use_text:
116+
args += ["--text"]
117+
if encoding is not None:
118+
args += ["--encoding", encoding]
98119
result = runner.invoke(
99120
cli.cli,
100-
["insert-files", db_path, "files", "-", "--name", "stdin-name"],
121+
args,
101122
catch_exceptions=False,
102-
input="hello world",
123+
input=input,
103124
)
104125
assert result.exit_code == 0, result.stdout
105126
db = Database(db_path)
106127
row = list(db["files"].rows)[0]
107-
assert {"path": "stdin-name", "content": b"hello world", "size": 11} == row
128+
key = "content"
129+
if use_text:
130+
key = "content_text"
131+
assert {"path": "stdin-name", key: expected}.items() <= row.items()
132+
133+
134+
@pytest.mark.skipif(
135+
sys.platform.startswith("win"),
136+
reason="Windows has a different way of handling default encodings",
137+
)
138+
def test_insert_files_bad_text_encoding_error():
139+
runner = CliRunner()
140+
with runner.isolated_filesystem():
141+
tmpdir = pathlib.Path(".")
142+
latin = tmpdir / "latin.txt"
143+
latin.write_bytes(b"S\xe3o Paulo")
144+
db_path = str(tmpdir / "files.db")
145+
result = runner.invoke(
146+
cli.cli,
147+
["insert-files", db_path, "files", str(latin), "--text"],
148+
catch_exceptions=False,
149+
)
150+
assert result.exit_code == 1, result.output
151+
assert result.output.strip().startswith(
152+
"Error: Could not read file '{}' as text".format(str(latin.resolve()))
153+
)

0 commit comments

Comments
 (0)