Skip to content

Commit 0be0b48

Browse files
authored
more robust handling of invalid reverse input lines (#55)
1 parent b8399a9 commit 0be0b48

File tree

6 files changed

+89
-18
lines changed

6 files changed

+89
-18
lines changed

opencage/batch.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,12 @@
66
import backoff
77
import certifi
88
import random
9+
import re
910

1011
from tqdm import tqdm
1112
from urllib.parse import urlencode
1213
from contextlib import suppress
13-
from opencage.geocoder import OpenCageGeocode, OpenCageGeocodeError
14+
from opencage.geocoder import OpenCageGeocode, OpenCageGeocodeError, _query_for_reverse_geocoding
1415

1516
class OpenCageBatchGeocoder():
1617
def __init__(self, options):
@@ -38,9 +39,11 @@ async def geocode(self, input, output):
3839

3940
queue = asyncio.Queue(maxsize=self.options.limit)
4041

41-
await self.read_input(input, queue)
42+
read_warnings = await self.read_input(input, queue)
4243

4344
if self.options.dry_run:
45+
if not read_warnings:
46+
print('All good.')
4447
return
4548

4649
if self.options.headers:
@@ -78,19 +81,28 @@ async def test_request(self):
7881
return { 'error': exc }
7982

8083
async def read_input(self, input, queue):
84+
any_warnings = False
8185
for index, row in enumerate(input):
8286
line_number = index + 1
8387

8488
if len(row) == 0:
85-
raise Exception(f"Empty line in input file at line number {line_number}, aborting")
89+
self.log(f"Line {line_number} - Empty line")
90+
any_warnings = True
91+
row = ['']
8692

8793
item = await self.read_one_line(row, line_number)
94+
if item['warnings'] is True:
95+
any_warnings = True
8896
await queue.put(item)
8997

9098
if queue.full():
9199
break
92100

101+
return any_warnings
102+
93103
async def read_one_line(self, row, row_id):
104+
warnings = False
105+
94106
if self.options.command == 'reverse':
95107
input_columns = [1, 2]
96108
elif self.options.input_columns:
@@ -105,14 +117,26 @@ async def read_one_line(self, row, row_id):
105117
# input_columns option uses 1-based indexing
106118
address.append(row[column - 1])
107119
except IndexError:
108-
self.log(f"Missing input column {column} in {row}")
120+
self.log(f"Line {row_id} - Missing input column {column} in {row}")
121+
warnings = True
109122
else:
110123
address = row
111124

112-
if self.options.command == 'reverse' and len(address) != 2:
113-
self.log(f"Expected two comma-separated values for reverse geocoding, got {address}")
125+
if self.options.command == 'reverse':
114126

115-
return { 'row_id': row_id, 'address': ','.join(address), 'original_columns': row }
127+
if len(address) != 2:
128+
self.log(f"Line {row_id} - Expected two comma-separated values for reverse geocoding, got {address}")
129+
else:
130+
# _query_for_reverse_geocoding attempts to convert its arguments into numbers. We would rather have it fail
131+
# now than during the actual geocoding
132+
try:
133+
_query_for_reverse_geocoding(address[0], address[1])
134+
except:
135+
self.log(f"Line {row_id} - Does not look like latitude and longitude: '{address[0]}' and '{address[1]}'")
136+
warnings = True
137+
address = []
138+
139+
return { 'row_id': row_id, 'address': ','.join(address), 'original_columns': row, 'warnings': warnings }
116140

117141
async def worker(self, output, queue, progress):
118142
while True:
@@ -147,8 +171,9 @@ async def _geocode_one_address():
147171

148172
try:
149173
if self.options.command == 'reverse':
150-
lon, lat = address.split(',')
151-
geocoding_results = await geocoder.reverse_geocode_async(lon, lat, **params)
174+
if ',' in address:
175+
lon, lat = address.split(',')
176+
geocoding_results = await geocoder.reverse_geocode_async(lon, lat, **params)
152177
else:
153178
geocoding_results = await geocoder.geocode_async(address, **params)
154179
except OpenCageGeocodeError as exc:
@@ -205,6 +230,7 @@ async def write_one_geocoding_result(self, output, row_id, address, geocoding_re
205230
output.writerow(row)
206231
self.write_counter = self.write_counter + 1
207232

233+
208234
def log(self, message):
209235
if not self.options.quiet:
210236
sys.stderr.write(f"{message}\n")

opencage/command_line.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ def parse_args(args):
5353
print(f"Error: The output file '{options.output}' already exists. You can add --overwrite to your command.", file=sys.stderr)
5454
sys.exit(1)
5555

56+
if 0 in options.input_columns:
57+
print(f"Error: A column 0 in --input-columns does not exist. The lowest possible number is 1.", file=sys.stderr)
58+
sys.exit(1)
59+
5660
return options
5761

5862

pytest.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
[pytest]
2+
pythonpath = .
23
asyncio_default_fixture_loop_scope = session

test/cli/test_cli_args.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,10 @@ def test_invalid_command(capfd):
3535
capfd
3636
)
3737

38-
def test_invalid_command(capfd):
38+
def test_version_number(capfd):
3939
with pytest.raises(SystemExit):
4040
parse_args(['--version'])
41-
out, err = capfd.readouterr()
41+
out, _ = capfd.readouterr()
4242

4343
assert __version__ in out
4444

@@ -79,6 +79,20 @@ def test_argument_range(capfd):
7979
capfd
8080
)
8181

82+
def test_zero_based_list(capfd):
83+
assert_parse_args_error(
84+
[
85+
"forward",
86+
"--api-key", "oc_gc_12345678901234567890123456789012",
87+
"--input", "test/fixtures/input.txt",
88+
"--output", "test/fixtures/output.csv",
89+
"--input-columns", "0,1,2"
90+
],
91+
'The lowest possible number is 1',
92+
capfd
93+
)
94+
95+
8296
def test_full_argument_list():
8397
args = parse_args([
8498
"reverse",
@@ -130,7 +144,8 @@ def test_defaults():
130144
assert args.limit == 0
131145
assert args.headers is False
132146
assert args.input_columns == [1]
133-
assert args.add_columns == ["lat", "lng", "_type", "_category", "country_code", "country", "state", "county", "_normalized_city", "postcode", "road", "house_number", "confidence", "formatted"]
147+
assert args.add_columns == ["lat", "lng", "_type", "_category", "country_code", "country", "state",
148+
"county", "_normalized_city", "postcode", "road", "house_number", "confidence", "formatted"]
134149
assert args.workers == 1
135150
assert args.timeout == 10
136151
assert args.retries == 10

test/cli/test_cli_run.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import pathlib
2-
import pytest
32
import os
4-
import sys
3+
import pytest
54

65
from opencage.command_line import main
76

@@ -85,14 +84,32 @@ def test_input_errors(capfd):
8584
"--api-key", TEST_APIKEY_200,
8685
"--input", "test/fixtures/cli/reverse_with_errors.csv",
8786
"--output", "test/fixtures/cli/output.csv",
87+
"--add-columns", "country_code,postcode",
8888
"--no-progress"
8989
])
9090

9191
_, err = capfd.readouterr()
92-
assert 'Missing input column 2 in' in err
93-
assert 'Expected two comma-separated values' in err
92+
# assert err == ''
93+
assert err.count("\n") == 6
94+
assert "Line 1 - Missing input column 2 in ['50.101010']" in err
95+
assert "Line 1 - Expected two comma-separated values for reverse geocoding, got ['50.101010']" in err
96+
assert "Line 3 - Empty line" in err
97+
assert "Line 3 - Missing input column 2 in ['']" in err
98+
assert "Line 3 - Expected two comma-separated values for reverse geocoding, got ['']" in err
99+
assert "Line 4 - Does not look like latitude and longitude: 'a' and 'b'" in err
94100

95-
def test_empty_result(capfd):
101+
assert_output(
102+
path="test/fixtures/cli/output.csv",
103+
length=4,
104+
lines=[
105+
'50.101010,,',
106+
'-100,60.1,de,48153',
107+
',,',
108+
'a,b,,'
109+
]
110+
)
111+
112+
def test_empty_result():
96113
# 'NOWHERE-INTERESTING' is guaranteed to return no result
97114
# https://opencagedata.com/api#testingkeys
98115
main([
@@ -138,6 +155,11 @@ def test_dryrun(capfd):
138155

139156
assert not os.path.isfile("test/fixtures/cli/output.csv")
140157

158+
out, _ = capfd.readouterr()
159+
assert out.count("\n") == 1
160+
assert "All good." in out
161+
162+
141163
def test_invalid_domain(capfd):
142164
main([
143165
"forward",
Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,4 @@
1-
50.101010
1+
50.101010
2+
-100,60.1
3+
4+
a,b

0 commit comments

Comments
 (0)