diff --git a/frictionless/analyzer/analyzer.py b/frictionless/analyzer/analyzer.py
index f015e70cd8..61eeb5aeea 100644
--- a/frictionless/analyzer/analyzer.py
+++ b/frictionless/analyzer/analyzer.py
@@ -34,7 +34,8 @@ def analyze_table_resource(
     # Iterate rows
     columns_data: Dict[str, List[Any]] = {}
     numeric = ["integer", "numeric", "number"]
-    with resource:
+    # Use a copy of the resource to avoid side effects (see #1622)
+    with resource.to_copy() as resource:
         for row in resource.row_stream:
             null_columns = 0
             for field_name in row:
diff --git a/frictionless/formats/csv/parser.py b/frictionless/formats/csv/parser.py
index 898b4cc18a..79d16fcec1 100644
--- a/frictionless/formats/csv/parser.py
+++ b/frictionless/formats/csv/parser.py
@@ -63,7 +63,8 @@ def write_row_stream(self, source: TableResource):
             "wt", delete=False, encoding=self.resource.encoding, newline=""
         ) as file:
             writer = csv.writer(file, **options)  # type: ignore
-            with source:
+            # Use a copy of the source to avoid side effects (see #1622)
+            with source.to_copy() as source:
                 if self.resource.dialect.header:
                     writer.writerow(source.schema.field_names)
                 for row in source.row_stream:
diff --git a/frictionless/formats/excel/parsers/xls.py b/frictionless/formats/excel/parsers/xls.py
index 48de1cde18..7818e0f80e 100644
--- a/frictionless/formats/excel/parsers/xls.py
+++ b/frictionless/formats/excel/parsers/xls.py
@@ -109,7 +109,8 @@ def write_row_stream(self, source: TableResource):
         if isinstance(title, int):
             title = f"Sheet {control.sheet}"
         sheet = book.add_sheet(title)
-        with source:
+        # Write from a copy of the source to avoid side effects (see #1622)
+        with source.to_copy() as source:
             if self.resource.dialect.header:
                 for field_index, name in enumerate(source.schema.field_names):
                     sheet.write(0, field_index, name)
diff --git a/frictionless/formats/excel/parsers/xlsx.py b/frictionless/formats/excel/parsers/xlsx.py
index b402fdc6d5..f9810070d3 100644
--- a/frictionless/formats/excel/parsers/xlsx.py
+++ b/frictionless/formats/excel/parsers/xlsx.py
@@ -148,7 +148,8 @@ def write_row_stream(self, source: TableResource):
         if isinstance(title, int):
             title = f"Sheet {control.sheet}"
         sheet = book.create_sheet(title)
-        with source:
+        # Write from a copy of the source to avoid side effects (see #1622)
+        with source.to_copy() as source:
             if self.resource.dialect.header:
                 sheet.append(source.schema.field_names)
             for row in source.row_stream:
diff --git a/frictionless/formats/gsheets/parser.py b/frictionless/formats/gsheets/parser.py
index 7cf81b2777..523118132f 100644
--- a/frictionless/formats/gsheets/parser.py
+++ b/frictionless/formats/gsheets/parser.py
@@ -53,7 +53,8 @@ def write_row_stream(self, source: TableResource):
         sh = gc.open_by_key(key)
         wks = sh.worksheet_by_id(gid) if gid else sh[0]  # type: ignore
         data: List[Any] = []
-        with source:
+        # Use a copy of the source to avoid side effects (see #1622)
+        with source.to_copy() as source:
             data.append(source.schema.field_names)
             for row in source.row_stream:
                 data.append(row.to_list())
diff --git a/frictionless/formats/html/parser.py b/frictionless/formats/html/parser.py
index a304685ef3..a3d0a5934c 100644
--- a/frictionless/formats/html/parser.py
+++ b/frictionless/formats/html/parser.py
@@ -57,7 +57,8 @@ def read_cell_stream_create(self) -> types.ICellStream:
     # It will give us an ability to support HtmlDialect
     def write_row_stream(self, source: TableResource):
        html = "<html><body><table>\n"
-        with source:
+        # Use a copy of the source to avoid side effects (see #1622)
+        with source.to_copy() as source:
            html += "<tr>"
            for name in source.schema.field_names:
                html += f"<td>{name}</td>"
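
All of the parser changes in this patch follow the same pattern: code that consumes a caller-supplied resource now iterates over a private copy instead of re-entering the original. A minimal sketch of the failure mode being fixed (hypothetical output path; `TableResource` is used as in the tests below):

    from frictionless.resources import TableResource

    source = TableResource(path="data/table.csv")
    with source:
        # Before this change, write() re-entered `with source:` internally,
        # closing the caller's file handle when the inner context exited (#1622)
        source.write(TableResource(path="table-copy.csv"))

    # With the fix, parsers write from source.to_copy(), so the caller's
    # `with` block and the parser's iteration never share open/close state.
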
diff --git a/frictionless/formats/inline/parser.py b/frictionless/formats/inline/parser.py
index 48d61c8d11..e22cc04ea1 100644
--- a/frictionless/formats/inline/parser.py
+++ b/frictionless/formats/inline/parser.py
@@ -91,7 +91,8 @@ def read_cell_stream_create(self):  # type: ignore
     def write_row_stream(self, source: TableResource):
         data: List[Any] = []
         control = InlineControl.from_dialect(self.resource.dialect)
-        with source:
+        # Write from a copy of the source to avoid side effects (see #1622)
+        with source.to_copy() as source:
             if self.resource.dialect.header and not control.keyed:
                 data.append(source.schema.field_names)
             for row in source.row_stream:
diff --git a/frictionless/formats/json/parsers/json.py b/frictionless/formats/json/parsers/json.py
index 42f22f4fc1..d88e83d903 100644
--- a/frictionless/formats/json/parsers/json.py
+++ b/frictionless/formats/json/parsers/json.py
@@ -54,7 +54,8 @@ def read_cell_stream_create(self) -> types.ICellStream:
     def write_row_stream(self, source: TableResource):
         data: List[Any] = []
         control = JsonControl.from_dialect(self.resource.dialect)
-        with source:
+        # Write from a copy of the source to avoid side effects (see #1622)
+        with source.to_copy() as source:
             if self.resource.dialect.header and not control.keyed:
                 data.append(source.schema.field_names)
             for row in source.row_stream:
diff --git a/frictionless/formats/json/parsers/jsonl.py b/frictionless/formats/json/parsers/jsonl.py
index 62e35e6081..3ed2cc0035 100644
--- a/frictionless/formats/json/parsers/jsonl.py
+++ b/frictionless/formats/json/parsers/jsonl.py
@@ -46,7 +46,8 @@ def write_row_stream(self, source: TableResource):
         control = JsonControl.from_dialect(self.resource.dialect)
         with tempfile.NamedTemporaryFile(delete=False) as file:
             writer = platform.jsonlines.Writer(file)
-            with source:
+            # Write from a copy of the source to avoid side effects (see #1622)
+            with source.to_copy() as source:
                 if self.resource.dialect.header and not control.keyed:
                     writer.write(source.schema.field_names)
                 for row in source.row_stream:
diff --git a/frictionless/formats/ods/parser.py b/frictionless/formats/ods/parser.py
index 96dcaaac63..bd878cf009 100644
--- a/frictionless/formats/ods/parser.py
+++ b/frictionless/formats/ods/parser.py
@@ -82,15 +82,16 @@ def write_row_stream(self, source: TableResource):
         file.close()
         book = platform.ezodf.newdoc(doctype="ods", filename=file.name)
         title = f"Sheet {control.sheet}"
-        # Get size
-        with source:
+        # Get size. Use a copy of the source to avoid side effects (see #1622)
+        with source.to_copy() as source:
             row_size = 1
             col_size = len(source.schema.fields)
             for _ in source.row_stream:
                 row_size += 1
         book.sheets += platform.ezodf.Sheet(title, size=(row_size, col_size))
         sheet = book.sheets[title]
-        with source:
+        # Write from a copy of the source to avoid side effects (see #1622)
+        with source.to_copy() as source:
             if self.resource.dialect.header:
                 for field_index, name in enumerate(source.schema.field_names):
                     sheet[(0, field_index)].set_value(name)
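
The ods writer above makes two passes over the same source (one to size the sheet, one to fill it), which only works because each pass opens a fresh copy. The same idiom is available to user code; a sketch under the same assumptions as the tests below:

    from frictionless.resources import TableResource

    source = TableResource(path="data/table.csv")

    # First pass: count rows on an independent copy
    with source.to_copy() as first_pass:
        row_count = sum(1 for _ in first_pass.row_stream)

    # Second pass: read the data on another independent copy
    with source.to_copy() as second_pass:
        rows = [row.to_list() for row in second_pass.row_stream]
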
diff --git a/frictionless/formats/pandas/parser.py b/frictionless/formats/pandas/parser.py
index ab7b3389a0..28c2bd4f6c 100644
--- a/frictionless/formats/pandas/parser.py
+++ b/frictionless/formats/pandas/parser.py
@@ -128,7 +128,8 @@ def write_row_stream(self, source: TableResource):
         data_rows: List[Tuple[Any]] = []
         index_rows: List[Tuple[Any]] = []
         fixed_types = {}
-        with source:
+        # Use a copy of the source to avoid side effects (see #1622)
+        with source.to_copy() as source:
             for row in source.row_stream:
                 data_values: List[Any] = []
                 index_values: List[Any] = []
diff --git a/frictionless/formats/qsv/adapter.py b/frictionless/formats/qsv/adapter.py
index eae77976f6..2b18a7b371 100644
--- a/frictionless/formats/qsv/adapter.py
+++ b/frictionless/formats/qsv/adapter.py
@@ -27,7 +27,8 @@ def read_schema(self, resource: Resource) -> Schema:
         command = [self.qsv_path, "stats", "--infer-dates", "--dates-whitelist", "all"]
         process = sp.Popen(command, stdout=sp.PIPE, stdin=sp.PIPE)
         # TODO: Use FileResource here (or future resource.stream_bytes())
-        with resource:
+        # Use a copy of the resource to avoid side effects (see #1622)
+        with resource.to_copy() as resource:
             while True:
                 chunk = resource.read_bytes(size=BLOCK_SIZE)
                 if not chunk:
diff --git a/frictionless/formats/spss/parser.py b/frictionless/formats/spss/parser.py
index 0b706bdf9f..9a40054fd0 100644
--- a/frictionless/formats/spss/parser.py
+++ b/frictionless/formats/spss/parser.py
@@ -99,7 +99,8 @@ def write_row_stream(self, source: TableResource):
 
         # Write rows
         with sav.SavWriter(self.resource.normpath, ioUtf8=True, **spss_schema) as writer:  # type: ignore
-            with source:
+            # Use a copy of the source to avoid side effects (see #1622)
+            with source.to_copy() as source:
                 for row in source.row_stream:  # type: ignore
                     cells: List[Any] = []
                     for field in source.schema.fields:  # type: ignore
@@ -130,7 +131,8 @@ def __write_convert_schema(self, source: TableResource):
             "varTypes": {},
             "formats": {},
         }
-        with source:
+        # Use a copy of the source to avoid side effects (see #1622)
+        with source.to_copy() as source:
             # Add fields
             sizes: Dict[str, int] = {}
             mapping = self.__write_convert_type()
diff --git a/frictionless/formats/sql/adapter.py b/frictionless/formats/sql/adapter.py
index 5f49b7b4b5..554798a358 100644
--- a/frictionless/formats/sql/adapter.py
+++ b/frictionless/formats/sql/adapter.py
@@ -109,7 +109,8 @@ def write_package(self, package: Package):
         for table in self.metadata.sorted_tables:
             if package.has_table_resource(table.name):
                 resource = package.get_table_resource(table.name)
-                with resource:
+                # Use a copy of the resource to avoid side effects (see #1622)
+                with resource.to_copy() as resource:
                     self.write_row_stream(resource.row_stream, table_name=table.name)
         return models.PublishResult(
             url=self.engine.url.render_as_string(hide_password=True),
diff --git a/frictionless/formats/sql/parser.py b/frictionless/formats/sql/parser.py
index d9475e53fc..3e1d68883e 100644
--- a/frictionless/formats/sql/parser.py
+++ b/frictionless/formats/sql/parser.py
@@ -51,6 +51,7 @@ def write_row_stream(self, source: TableResource):
         adapter = SqlAdapter(engine, control=control)
         if not adapter:
             raise FrictionlessException(f"Not supported source: {self.resource.normpath}")
-        with source:
+        # Write from a copy to prevent side effects (see #1622)
+        with source.to_copy() as source:
             adapter.write_schema(source.schema, table_name=control.table)
             adapter.write_row_stream(source.row_stream, table_name=control.table)
diff --git a/frictionless/formats/yaml/parser.py b/frictionless/formats/yaml/parser.py
index 7d2e3016c5..6d8da920ae 100644
--- a/frictionless/formats/yaml/parser.py
+++ b/frictionless/formats/yaml/parser.py
@@ -52,7 +52,8 @@ def read_cell_stream_create(self) -> types.ICellStream:
     def write_row_stream(self, source: TableResource):
         data: List[Any] = []
         control = YamlControl.from_dialect(self.resource.dialect)
-        with source:
+        # Write from a copy of the source to avoid side effects (see #1622)
+        with source.to_copy() as source:
             if self.resource.dialect.header and not control.keyed:
                 data.append(source.schema.field_names)
             for row in source.row_stream:
diff --git a/frictionless/indexer/indexer.py b/frictionless/indexer/indexer.py
index e689315d41..8277987ba8 100644
--- a/frictionless/indexer/indexer.py
+++ b/frictionless/indexer/indexer.py
@@ -45,20 +45,24 @@ def __attrs_post_init__(self):
 
     def index(self) -> Optional[Report]:
         self.prepare_resource()
-        with self.resource:
-            # Index is resouce-based operation not supporting FKs
-            if self.resource.schema.foreign_keys:
-                self.resource.schema.foreign_keys = []
-            self.create_table()
-            while True:
-                try:
-                    return self.populate_table()
-                except Exception:
-                    if self.fast and self.use_fallback:
-                        self.fast = False
-                        continue
-                    self.delete_table()
-                    raise
+
+        # Infer resource if needed
+        if self.resource.closed:
+            self.resource.infer()
+
+        # Index is a resource-based operation not supporting FKs
+        if self.resource.schema.foreign_keys:
+            self.resource.schema.foreign_keys = []
+        self.create_table()
+        while True:
+            try:
+                return self.populate_table()
+            except Exception:
+                if self.fast and self.use_fallback:
+                    self.fast = False
+                    continue
+                self.delete_table()
+                raise
 
     def prepare_resource(self):
         if self.qsv_path:
@@ -108,10 +112,12 @@ def populate_table_fast_sqlite(self):
         sql_command = f".import '|cat -' \"{self.table_name}\""
         command = ["sqlite3", "-csv", self.adapter.engine.url.database, sql_command]
         process = subprocess.Popen(command, stdin=PIPE, stdout=PIPE)
-        for line_number, line in enumerate(self.resource.byte_stream, start=1):
-            if line_number > 1:
-                process.stdin.write(line)  # type: ignore
-            self.report_progress(f"{self.resource.stats.bytes} bytes")
+        # Iterate over a copy of the resource to avoid side effects (see #1622)
+        with self.resource.to_copy() as resource:
+            for line_number, line in enumerate(resource.byte_stream, start=1):
+                if line_number > 1:
+                    process.stdin.write(line)  # type: ignore
+                self.report_progress(f"{self.resource.stats.bytes} bytes")
         process.stdin.close()  # type: ignore
         process.wait()
 
@@ -119,14 +125,16 @@ def populate_table_fast_postgresql(self):
         database_url = self.adapter.engine.url.render_as_string(hide_password=False)
         with platform.psycopg.connect(database_url) as connection:
             with connection.cursor() as cursor:
-                query = 'COPY "%s" FROM STDIN CSV HEADER' % self.table_name
-                with cursor.copy(query) as copy:  # type: ignore
-                    while True:
-                        chunk = self.resource.read_bytes(size=settings.BLOCK_SIZE)
-                        if not chunk:
-                            break
-                        copy.write(chunk)
-                        self.report_progress(f"{self.resource.stats.bytes} bytes")
+                # Iterate over a copy of the resource to avoid side effects (see #1622)
+                with self.resource.to_copy() as resource:
+                    query = 'COPY "%s" FROM STDIN CSV HEADER' % self.table_name
+                    with cursor.copy(query) as copy:  # type: ignore
+                        while True:
+                            chunk = resource.read_bytes(size=settings.BLOCK_SIZE)
+                            if not chunk:
+                                break
+                            copy.write(chunk)
+                            self.report_progress(f"{self.resource.stats.bytes} bytes")
 
     def delete_table(self):
         self.adapter.delete_resource(self.table_name)
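
With the indexer no longer wrapping `self.resource` in a `with` block, indexing an already-open resource leaves it open for the caller; a hedged sketch mirroring the indexer test below (the sqlite URL is a placeholder):

    from frictionless.resources import TableResource

    with TableResource(path="data/table.csv") as resource:
        resource.index("sqlite:///index.db", name="table")
        # The caller's resource is still usable after indexing
        assert resource.closed is False
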
self.report_progress(f"{self.resource.stats.bytes} bytes") + # Iterate over a copy of the resouce to avoid side effects (see #1622) + with self.resource.to_copy() as resource: + query = 'COPY "%s" FROM STDIN CSV HEADER' % self.table_name + with cursor.copy(query) as copy: # type: ignore + while True: + chunk = resource.read_bytes(size=settings.BLOCK_SIZE) + if not chunk: + break + copy.write(chunk) + self.report_progress(f"{self.resource.stats.bytes} bytes") def delete_table(self): self.adapter.delete_resource(self.table_name) diff --git a/frictionless/resource/resource.py b/frictionless/resource/resource.py index 3d93d85c95..b8f7d11fae 100644 --- a/frictionless/resource/resource.py +++ b/frictionless/resource/resource.py @@ -238,6 +238,7 @@ def __attrs_post_init__(self): # Internal self.__loader: Optional[Loader] = None self.__buffer: Optional[types.IBuffer] = None + self.__context_manager_entered: bool = False # Detect resource system.detect_resource(self) @@ -257,11 +258,58 @@ def __attrs_post_init__(self): # TODO: shall we guarantee here that it's at the beginning for the file? # TODO: maybe it's possible to do type narrowing here? def __enter__(self): - if self.closed: - self.open() + """ + Enters a context manager for the resource. + We need to be careful with contexts because they open and close the Resource + (and thus any underlying files) and we don't want to close a file that is + being used somewhere higher up the call stack. + + e.g. if nested contexts were allowed then: + + with Resource("in.csv") as resource: + with resource: + # use resource + resource.write("out.csv") + + would result in errors because the second context would close the file + before the write happened. While the above code is obvious, similar + things can happen when composing steps in pipelines, calling petl code etc. + where the various functions may have no knowledge of each other. + See #1622 for more details. + + So we only allow a single context to be open at a time, and raise an + exception if nested context is attempted. For similar reasons, we + also raise an exception if a context is attempted on an open resource. + + The above code can be successfully written as: + + with Resource("in.csv") as resource: + with resource.to_copy() as resource2: + use resource2: + resource.write("out.csv") + + which keeps resource and resource2 as independent views on the same file. + + Note that if you absolutely need to use a resource in a manner where you + don't care if it is "opened" multiple times and closed once then you + can directly use `open()` and `close()` but you also become responsible + for ensuring the file is closed at the correct time. + """ + if self.__context_manager_entered: + note = "Resource has previously entered a context manager (`with` statement) and does not support nested contexts. To use in a nested context use `to_copy()` then use the copy in the `with`." + raise FrictionlessException(note) + if not self.closed: + note = "Resource is currently open, and cannot be used in a `with` statement (which would reopen the file). To use `with` on an open Resouece, use to_copy() then use the copy in the `with`." + raise FrictionlessException(note) + + self.__context_manager_entered = True + + self.open() return self def __exit__(self, type, value, traceback): # type: ignore + # Mark the context manager as closed so that sequential contexts are allowed. 
diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py
index 98a76f3473..0c2f00c38a 100644
--- a/frictionless/resources/table.py
+++ b/frictionless/resources/table.py
@@ -254,7 +254,8 @@ def __open_lookup(self):
                 self.__lookup[source_name][source_key] = set()
                 if not source_res:
                     continue
-                with source_res:
+                # Iterate on a copy to avoid side effects (see #1622)
+                with source_res.to_copy() as source_res:
                     for row in source_res.row_stream:  # type: ignore
                         cells = tuple(row.get(field_name) for field_name in source_key)  # type: ignore
                         if set(cells) == {None}:  # type: ignore
@@ -641,12 +642,15 @@ def from_petl(view: Any, **options: Any):
 
     def to_petl(self, normalize: bool = False):
         """Export resource as a PETL table"""
-        resource = self.to_copy()
+        # Store a copy of self to avoid side effects (see #1622)
+        self_copy = self.to_copy()
 
         # Define view
         class ResourceView(platform.petl.Table):  # type: ignore
             def __iter__(self):  # type: ignore
-                with resource:
+                # Iterate over a copy of the resource so that each instance of the iterator is independent (see #1622).
+                # If we didn't do this, different iterators over the same table would interfere with each other.
+                with self_copy.to_copy() as resource:
                     if normalize:
                         yield resource.schema.field_names
                         yield from (row.to_list() for row in resource.row_stream)
diff --git a/frictionless/steps/table/table_debug.py b/frictionless/steps/table/table_debug.py
index b5175bfd9b..1810785368 100644
--- a/frictionless/steps/table/table_debug.py
+++ b/frictionless/steps/table/table_debug.py
@@ -33,8 +33,9 @@ def transform_resource(self, resource: Resource):
 
         # Data
         def data():  # type: ignore
-            with current:
-                for row in current.row_stream:  # type: ignore
+            # Use a copy of the source to avoid side effects (see #1622)
+            with current.to_copy() as current_copy:
+                for row in current_copy.row_stream:  # type: ignore
                     self.function(row)  # type: ignore
                     yield row
 
diff --git a/frictionless/steps/table/table_normalize.py b/frictionless/steps/table/table_normalize.py
index 409d2a90ab..cd5bbb2fb5 100644
--- a/frictionless/steps/table/table_normalize.py
+++ b/frictionless/steps/table/table_normalize.py
@@ -24,11 +24,12 @@ class table_normalize(Step):
     # Transform
 
     def transform_resource(self, resource: Resource):
-        current = resource.to_copy()
+        resource_copy = resource.to_copy()
 
         # Data
         def data():  # type: ignore
-            with current:
+            # Yield from a copy to avoid side effects (see #1622)
+            with resource_copy.to_copy() as current:
                 yield current.header.to_list()  # type: ignore
                 for row in current.row_stream:  # type: ignore
                     yield row.to_list()  # type: ignore
diff --git a/frictionless/steps/table/table_validate.py b/frictionless/steps/table/table_validate.py
index 1d17bd1afd..dba4d2ff92 100644
--- a/frictionless/steps/table/table_validate.py
+++ b/frictionless/steps/table/table_validate.py
@@ -29,11 +29,14 @@ def transform_resource(self, resource: Resource):
 
         # Data
        def data():  # type: ignore
-            with current:
-                if not current.header.valid:  # type: ignore
-                    raise FrictionlessException(error=current.header.errors[0])  # type: ignore
-                yield current.header  # type: ignore
-                for row in current.row_stream:  # type: ignore
+            # Use a copy of the source to avoid side effects (see #1622)
+            with current.to_copy() as current_copy:  # type: ignore
+                if not current_copy.header.valid:  # type: ignore
+                    raise FrictionlessException(
+                        error=current_copy.header.errors[0]  # type: ignore
+                    )  # type: ignore
+                yield current_copy.header  # type: ignore
+                for row in current_copy.row_stream:  # type: ignore
                     if not row.valid:  # type: ignore
                         raise FrictionlessException(error=row.errors[0])  # type: ignore
                     yield row
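
Because every `__iter__` call above now opens its own copy, two iterators over one PETL view no longer share state; a brief sketch using the same data file as the tests:

    from frictionless.resources import TableResource

    table = TableResource("data/table.csv").to_petl()
    it1, it2 = iter(table), iter(table)
    assert next(it1) == ["id", "name"]  # it1 reads the header...
    assert next(it2) == ["id", "name"]  # ...while it2 independently starts over
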
diff --git a/frictionless/validator/validator.py b/frictionless/validator/validator.py
index 4657c9f758..3674fcc89f 100644
--- a/frictionless/validator/validator.py
+++ b/frictionless/validator/validator.py
@@ -94,10 +94,6 @@ def validate_resource(
         errors: List[Error] = []
         warnings: List[str] = []
 
-        # Prepare checklist
-        checklist = checklist or Checklist()
-        checks = checklist.connect(resource)
-
         # Validate metadata
         try:
             resource.to_descriptor(validate=True)
@@ -119,13 +115,20 @@ def validate_resource(
         try:
             resource.open()
         except FrictionlessException as exception:
-            resource.close()
             return Report.from_validation_task(
                 resource, time=timer.time, errors=exception.to_errors()
             )
+        finally:
+            # Always close the resource if we opened it to avoid side effects
+            resource.close()
+
+        # Validate row data
+        # Run the per-row validation against a copy of the resource to avoid side effects (see #1622)
+        with resource.to_copy() as resource_copy:
+            # Prepare checklist, and connect it to the resource copy
+            checklist = checklist or Checklist()
+            checks = checklist.connect(resource_copy)
 
-        # Validate data
-        with resource:
             # Validate start
             for index, check in enumerate(checks):
                 for error in check.validate_start():
@@ -135,20 +138,22 @@ def validate_resource(
                     errors.append(error)
 
             # Validate file
-            if not isinstance(resource, platform.frictionless_resources.TableResource):
-                if resource.hash is not None or resource.bytes is not None:
-                    helpers.pass_through(resource.byte_stream)
+            if not isinstance(
+                resource_copy, platform.frictionless_resources.TableResource
+            ):
+                if resource_copy.hash is not None or resource_copy.bytes is not None:
+                    helpers.pass_through(resource_copy.byte_stream)
 
             # Validate table
             else:
                 row_count = 0
-                labels = resource.labels
+                labels = resource_copy.labels
                 while True:
                     row_count += 1
 
                     # Emit row
                     try:
-                        row = next(resource.row_stream)  # type: ignore
+                        row = next(resource_copy.row_stream)  # type: ignore
                     except FrictionlessException as exception:
                         errors.append(exception.error)
                         continue
@@ -189,6 +194,11 @@ def validate_resource(
                     if checklist.match(error):
                         errors.append(error)
 
+        # Update the stats in the base resource with those from the copy.
+        # Note that this mutation of the base resource is an expected result of the validation,
+        # but depending on what other code does with the resource, the stats may be overwritten.
+        resource.stats = resource_copy.stats
+
         # Return report
         return Report.from_validation_task(
             resource, time=timer.time, labels=labels, errors=errors, warnings=warnings
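
After validation the caller's resource stays closed, and the only mutation it sees is the stats copied back from the validated copy; a hedged sketch of what a caller can rely on (assuming the standard two-row `data/table.csv` fixture):

    from frictionless.resources import TableResource

    resource = TableResource(path="data/table.csv")
    report = resource.validate()
    assert report.valid
    assert resource.closed is True
    # Stats reflect the validated copy; later operations on the resource may
    # overwrite them (assumes row validation populates stats.rows)
    assert resource.stats.rows == 2
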
diff --git a/tests/analyzer/test_resource.py b/tests/analyzer/test_resource.py
index 53da72cc12..f572afb9e7 100644
--- a/tests/analyzer/test_resource.py
+++ b/tests/analyzer/test_resource.py
@@ -241,3 +241,29 @@ def test_analyze_resource_detailed_with_invalid_data():
     assert analysis["rowsWithNullValues"] == 3
     assert analysis["notNullRows"] == 1
     assert analysis["variableTypes"] == {"integer": 3, "string": 1}
+
+
+def test_analyze_resource_is_independent_bug_1622():
+    # Test that we can analyze a resource without side effects
+    resource = TableResource(path="data/analysis-data.csv")
+    with resource:
+        analysis = resource.analyze()
+        assert list(analysis.keys()) == [
+            "variableTypes",
+            "notNullRows",
+            "rowsWithNullValues",
+            "fieldStats",
+            "averageRecordSizeInBytes",
+            "timeTaken",
+            "md5",
+            "sha256",
+            "bytes",
+            "fields",
+            "rows",
+        ]
+        assert round(analysis["averageRecordSizeInBytes"]) == 85
+        assert analysis["fields"] == 11
+        assert analysis["rows"] == 9
+        assert analysis["rowsWithNullValues"] == 2
+        assert analysis["notNullRows"] == 7
+        assert analysis["variableTypes"] == {}
diff --git a/tests/formats/csv/test_parser.py b/tests/formats/csv/test_parser.py
index 2bf4bca368..6978352174 100644
--- a/tests/formats/csv/test_parser.py
+++ b/tests/formats/csv/test_parser.py
@@ -344,3 +344,17 @@ def test_csv_parser_proper_quote_issue_493():
     resource.infer()
     assert resource.dialect.to_descriptor() == {}
     assert len(resource.schema.fields) == 126
+
+
+@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows")
+def test_csv_parser_write_independent_issue_1622(tmpdir):
+    source = TableResource(path="data/table.csv")
+    with source:
+        target = TableResource(path=str(tmpdir.join("table.csv")))
+        source.write(target)
+    with target:
+        assert target.header == ["id", "name"]
+        assert target.read_rows() == [
+            {"id": 1, "name": "english"},
+            {"id": 2, "name": "中国人"},
+        ]
diff --git a/tests/formats/excel/parsers/test_xls.py b/tests/formats/excel/parsers/test_xls.py
index 73e5a02213..9668a32ee1 100644
--- a/tests/formats/excel/parsers/test_xls.py
+++ b/tests/formats/excel/parsers/test_xls.py
@@ -169,3 +169,16 @@ def test_xls_parser_cast_int_to_string_1251():
         {"A": "001", "B": "b", "C": "1", "D": "a", "E": 1},
         {"A": "002", "B": "c", "C": "1", "D": "1", "E": 1},
     ]
+
+
+def test_xls_parser_write_independent_bug_1622(tmpdir):
+    source = TableResource(path="data/table.csv")
+    with source:
+        target = TableResource(path=str(tmpdir.join("table.xls")))
+        source.write(target)
+    with target:
+        assert target.header == ["id", "name"]
+        assert target.read_rows() == [
+            {"id": 1, "name": "english"},
+            {"id": 2, "name": "中国人"},
+        ]
diff --git a/tests/formats/excel/parsers/test_xlsx.py b/tests/formats/excel/parsers/test_xlsx.py
index 2deb051e7f..61f2b520ed 100644
--- a/tests/formats/excel/parsers/test_xlsx.py
+++ b/tests/formats/excel/parsers/test_xlsx.py
@@ -307,3 +307,16 @@ def test_xlsx_parser_cannot_read_resource_from_remote_package_issue_1504():
     resource = package.get_table_resource("excel")
     table = resource.read_table()
     assert len(table.rows) == 4
+
+
+def test_xlsx_parser_write_independent_1622(tmpdir):
+    source = TableResource(path="data/table.csv")
+    with source:
+        target = TableResource(path=str(tmpdir.join("table.xlsx")))
+        source.write(target)
+    with target:
+        assert target.header == ["id", "name"]
+        assert target.read_rows() == [
+ {"id": 1, "name": "english"}, + {"id": 2, "name": "中国人"}, + ] diff --git a/tests/formats/gsheets/test_parser.py b/tests/formats/gsheets/test_parser.py index 815167fb45..22f1dfbbff 100644 --- a/tests/formats/gsheets/test_parser.py +++ b/tests/formats/gsheets/test_parser.py @@ -52,10 +52,11 @@ def test_gsheets_parser_write(google_credentials_path): path = "https://docs.google.com/spreadsheets/d/1F2OiYmaf8e3x7jSc95_uNgfUyBlSXrcRg-4K_MFNZQI/edit" control = formats.GsheetsControl(credentials=google_credentials_path) source = TableResource(path="data/table.csv") - target = source.write(path=path, control=control) - with target: - assert target.header == ["id", "name"] - assert target.read_rows() == [ - {"id": 1, "name": "english"}, - {"id": 2, "name": "中国人"}, - ] + with source: + target = source.write(path=path, control=control) + with target: + assert target.header == ["id", "name"] + assert target.read_rows() == [ + {"id": 1, "name": "english"}, + {"id": 2, "name": "中国人"}, + ] diff --git a/tests/formats/html/test_parser.py b/tests/formats/html/test_parser.py index 225cd22180..382cf325f6 100644 --- a/tests/formats/html/test_parser.py +++ b/tests/formats/html/test_parser.py @@ -62,3 +62,17 @@ def test_html_parser_newline_in_cell_construction_file_issue_865(tmpdir): target = source.write(str(tmpdir.join("table.csv"))) target.infer(stats=True) assert target.stats.rows == 226 + + +@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows") +def test_html_parser_write_independent_bug_1622(tmpdir): + source = TableResource(path="data/table.csv") + with source: + target = TableResource(path=str(tmpdir.join("table.html"))) + source.write(target) + with target: + assert target.header == ["id", "name"] + assert target.read_rows() == [ + {"id": 1, "name": "english"}, + {"id": 2, "name": "中国人"}, + ] diff --git a/tests/formats/inline/test_parser.py b/tests/formats/inline/test_parser.py index 829c695b2a..a5a060b920 100644 --- a/tests/formats/inline/test_parser.py +++ b/tests/formats/inline/test_parser.py @@ -139,3 +139,15 @@ def test_inline_parser_write_skip_header(): with TableResource(path="data/table.csv") as resource: resource.write(target) assert target.data == [[1, "english"], [2, "中国人"]] + + +@pytest.mark.skip +def test_inline_parser_write_keyed_independent_bug_1622(tmpdir): + control = formats.InlineControl(keyed=True) + source = TableResource(path="data/table.csv") + with source: + target = source.write(format="inline", control=control) + assert target.data == [ + {"id": 1, "name": "english"}, + {"id": 2, "name": "中国人"}, + ] diff --git a/tests/formats/json/parsers/test_json.py b/tests/formats/json/parsers/test_json.py index 12a49af9a6..386b8df9b7 100644 --- a/tests/formats/json/parsers/test_json.py +++ b/tests/formats/json/parsers/test_json.py @@ -135,3 +135,20 @@ def test_json_parser_write_skip_header(tmpdir): with TableResource(path="data/table.csv") as resource: target = resource.write(target) assert target.read_data() == [[1, "english"], [2, "中国人"]] + + +# Bugs + + +def test_json_parser_write_independent_bug_1622(tmpdir): + source = TableResource(path="data/table.csv") + with source: + target = TableResource(path=str(tmpdir.join("table.json"))) + target = source.write(target) + assert target.normpath + with open(target.normpath) as file: + assert json.load(file) == [ + ["id", "name"], + [1, "english"], + [2, "中国人"], + ] diff --git a/tests/formats/json/parsers/test_jsonl.py b/tests/formats/json/parsers/test_jsonl.py index b29cb9339d..6c55799a38 100644 --- 
diff --git a/tests/formats/json/parsers/test_jsonl.py b/tests/formats/json/parsers/test_jsonl.py
index b29cb9339d..6c55799a38 100644
--- a/tests/formats/json/parsers/test_jsonl.py
+++ b/tests/formats/json/parsers/test_jsonl.py
@@ -59,3 +59,18 @@ def test_jsonl_parser_write_skip_header(tmpdir):
         {"field1": 1, "field2": "english"},
         {"field1": 2, "field2": "中国人"},
     ]
+
+
+# Bugs
+
+
+def test_jsonl_parser_write_independent_bug_1622(tmpdir):
+    source = TableResource(path="data/table.csv")
+    with source:
+        target = source.write(path=str(tmpdir.join("table.jsonl")))
+    with target:
+        assert target.header == ["id", "name"]
+        assert target.read_rows() == [
+            {"id": 1, "name": "english"},
+            {"id": 2, "name": "中国人"},
+        ]
diff --git a/tests/formats/ods/test_parser.py b/tests/formats/ods/test_parser.py
index 1ab6d564a8..c8a491aa3d 100644
--- a/tests/formats/ods/test_parser.py
+++ b/tests/formats/ods/test_parser.py
@@ -139,3 +139,19 @@ def test_ods_parser_write_skip_header(tmpdir):
         resource.write_table(target)
     table = target.read_table()
     assert table.header == ["field1", "field2"]
+
+
+# Bugs
+
+
+def test_ods_parser_write_independent_bug_1622(tmpdir):
+    source = TableResource(path="data/table.csv")
+    with source:
+        target = TableResource(path=str(tmpdir.join("table.ods")))
+        source.write(target)
+    with target:
+        assert target.header == ["id", "name"]
+        assert target.read_rows() == [
+            {"id": 1, "name": "english"},
+            {"id": 2, "name": "中国人"},
+        ]
diff --git a/tests/formats/pandas/test_parser.py b/tests/formats/pandas/test_parser.py
index cb60d791da..ce22960a13 100644
--- a/tests/formats/pandas/test_parser.py
+++ b/tests/formats/pandas/test_parser.py
@@ -324,3 +324,16 @@ def test_validate_package_with_in_code_resources_1245():
     datapackage.add_resource(resource)
     report = validate(datapackage)
     assert len(report.errors) == 0
+
+
+# Bugs
+
+
+def test_pandas_parser_write_independent_bug_1622():
+    source = TableResource(path="data/table.csv")
+    with source:
+        target = source.write(format="pandas")
+    assert target.data.to_dict("records") == [  # type: ignore
+        {"id": 1, "name": "english"},
+        {"id": 2, "name": "中国人"},
+    ]
diff --git a/tests/formats/parquet/test_parser.py b/tests/formats/parquet/test_parser.py
index 76b39efda0..142f257989 100644
--- a/tests/formats/parquet/test_parser.py
+++ b/tests/formats/parquet/test_parser.py
@@ -77,3 +77,20 @@ def test_parquet_parser_write_datetime_field_with_timezone(tmpdir):
             )
         }
     ]
+
+
+# Bugs
+
+
+def test_parquet_parser_write_independent_bug_1622(tmpdir):
+    source = TableResource(path="data/table.csv")
+    with source:
+        target = TableResource(path=str(tmpdir.join("table.parq")))
+        source.write(target)
+    with target:
+        assert target.format == "parq"
+        assert target.header == ["id", "name"]
+        assert target.read_rows() == [
+            {"id": 1, "name": "english"},
+            {"id": 2, "name": "中国人"},
+        ]
diff --git a/tests/formats/spss/test_parser.py b/tests/formats/spss/test_parser.py
index 4824056920..f79447cf96 100644
--- a/tests/formats/spss/test_parser.py
+++ b/tests/formats/spss/test_parser.py
@@ -128,3 +128,18 @@ def test_spss_parser_write_timezone(tmpdir):
             "time": time(18),
         },
     ]
+
+
+# Bugs
+
+
+def test_spss_parser_write_independent_bug_1622(tmpdir):
+    source = TableResource(path="data/table.csv")
+    with source:
+        target = source.write(str(tmpdir.join("table.sav")))
+    with target:
+        assert target.header == ["id", "name"]
+        assert target.read_rows() == [
+            {"id": 1, "name": "english"},
+            {"id": 2, "name": "中国人"},
+        ]
diff --git a/tests/formats/sql/databases/duckdb/test_parser.py b/tests/formats/sql/databases/duckdb/test_parser.py
index 1393408e90..ce0c268cb2 100644
--- a/tests/formats/sql/databases/duckdb/test_parser.py
+++ b/tests/formats/sql/databases/duckdb/test_parser.py
@@ -160,3 +160,16 @@ def test_sql_parser_describe_to_yaml_failing_issue_821(duckdb_url_data):
     resource = TableResource(path=duckdb_url_data, control=control)
     resource.infer()
     assert resource.to_yaml()
+
+
+def test_sql_parser_write_independent_issue_1622(duckdb_url_data):
+    source = TableResource(path="data/table.csv")
+    with source:
+        control = formats.SqlControl(table="name", order_by="id")
+        target = source.write(path=duckdb_url_data, control=control)
+    with target:
+        assert target.header == ["id", "name"]
+        assert target.read_rows() == [
+            {"id": 1, "name": "english"},
+            {"id": 2, "name": "中国人"},
+        ]
diff --git a/tests/formats/sql/databases/mysql/test_parser.py b/tests/formats/sql/databases/mysql/test_parser.py
index c95b61b2fb..efd2d70c7b 100644
--- a/tests/formats/sql/databases/mysql/test_parser.py
+++ b/tests/formats/sql/databases/mysql/test_parser.py
@@ -55,3 +55,32 @@ def test_sql_parser_write_string_pk_issue_777_mysql(mysql_url):
         {"id": 1, "name": "english"},
         {"id": 2, "name": "中国人"},
     ]
+
+
+@pytest.mark.skipif(platform.type == "darwin", reason="Skip SQL test in MacOS")
+@pytest.mark.skipif(platform.type == "windows", reason="Skip SQL test in Windows")
+def test_sql_parser_write_independent_bug_1622(mysql_url):
+    source = TableResource(path="data/timezone.csv")
+    with source:
+        control = formats.SqlControl(table="timezone")
+        target = source.write(path=mysql_url, control=control)
+    with target:
+        assert target.header == ["datetime", "time"]
+        assert target.read_rows() == [
+            {
+                "datetime": datetime(2020, 1, 1, 15),
+                "time": time(15),
+            },
+            {
+                "datetime": datetime(2020, 1, 1, 15),
+                "time": time(15),
+            },
+            {
+                "datetime": datetime(2020, 1, 1, 12),
+                "time": time(12),
+            },
+            {
+                "datetime": datetime(2020, 1, 1, 18),
+                "time": time(18),
+            },
+        ]
diff --git a/tests/formats/sql/databases/postgresql/test_parser.py b/tests/formats/sql/databases/postgresql/test_parser.py
index 6e8f7acc33..94d43378c2 100644
--- a/tests/formats/sql/databases/postgresql/test_parser.py
+++ b/tests/formats/sql/databases/postgresql/test_parser.py
@@ -62,3 +62,32 @@ def test_sql_parser_write_string_pk_issue_777_postgresql(postgresql_url):
         {"id": 1, "name": "english"},
         {"id": 2, "name": "中国人"},
     ]
+
+
+@pytest.mark.skipif(platform.type == "darwin", reason="Skip SQL test in MacOS")
+@pytest.mark.skipif(platform.type == "windows", reason="Skip SQL test in Windows")
+def test_sql_parser_write_independent_bug_1622(postgresql_url):
+    source = TableResource(path="data/timezone.csv")
+    with source:
+        control = formats.SqlControl(table="timezone")
+        target = source.write(postgresql_url, control=control)
+    with target:
+        assert target.header == ["datetime", "time"]
+        assert target.read_rows() == [
+            {
+                "datetime": datetime(2020, 1, 1, 15),
+                "time": time(15),
+            },
+            {
+                "datetime": datetime(2020, 1, 1, 15),
+                "time": time(15),
+            },
+            {
+                "datetime": datetime(2020, 1, 1, 12),
+                "time": time(12),
+            },
+            {
+                "datetime": datetime(2020, 1, 1, 18),
+                "time": time(18),
+            },
+        ]
diff --git a/tests/formats/sql/test_parser.py b/tests/formats/sql/test_parser.py
index 996fee9ffc..beef5df76c 100644
--- a/tests/formats/sql/test_parser.py
+++ b/tests/formats/sql/test_parser.py
@@ -151,3 +151,16 @@ def test_sql_parser_describe_to_yaml_failing_issue_821(sqlite_url_data):
     resource = TableResource(path=sqlite_url_data, control=control)
     resource.infer()
     assert resource.to_yaml()
+
+
+def test_sql_parser_write_independent_bug_1622(sqlite_url_data):
TableResource(path="data/table.csv") + with source: + control = formats.SqlControl(table="name", order_by="id") + target = source.write(path=sqlite_url_data, control=control) + with target: + assert target.header == ["id", "name"] + assert target.read_rows() == [ + {"id": 1, "name": "english"}, + {"id": 2, "name": "中国人"}, + ] diff --git a/tests/formats/yaml/test_parser.py b/tests/formats/yaml/test_parser.py index 186eab9423..69bc7362fd 100644 --- a/tests/formats/yaml/test_parser.py +++ b/tests/formats/yaml/test_parser.py @@ -48,3 +48,20 @@ def test_yaml_parser_write_skip_header(tmpdir): {"field1": 1, "field2": "english"}, {"field1": 2, "field2": "中国人"}, ] + + +# Bugs + + +def test_yaml_parser_write_independent_bug_1622(tmpdir): + source = TableResource(path="data/table.csv") + with source: + target = TableResource(path=str(tmpdir.join("table.yaml"))) + source.write(target) + with target: + assert target.format == "yaml" + assert target.header == ["id", "name"] + assert target.read_rows() == [ + {"id": 1, "name": "english"}, + {"id": 2, "name": "中国人"}, + ] diff --git a/tests/indexer/test_resource.py b/tests/indexer/test_resource.py index 5df10bb027..561cb7d0b6 100644 --- a/tests/indexer/test_resource.py +++ b/tests/indexer/test_resource.py @@ -94,3 +94,18 @@ def test_resource_index_sqlite_on_progress(database_url, mocker): assert on_progress.call_count == 2 on_progress.assert_any_call(control.table, "2 rows") on_progress.assert_any_call(control.table, "3 rows") + + +# Bugs + + +@pytest.mark.parametrize("database_url", database_urls) +def test_resource_index_sqlite_independent_bug_1622(database_url): + assert control.table + resource = TableResource(path="data/table.csv") + with resource: + resource.index(database_url, name=control.table) + assert TableResource(path=database_url, control=control).read_rows() == [ + {"id": 1, "name": "english"}, + {"id": 2, "name": "中国人"}, + ] diff --git a/tests/resource/test_context_manager.py b/tests/resource/test_context_manager.py new file mode 100644 index 0000000000..0170001fe5 --- /dev/null +++ b/tests/resource/test_context_manager.py @@ -0,0 +1,103 @@ +import pytest + +from frictionless import FrictionlessException, Resource + +# Test that the context manager implementation works correctly + +# As per PEP-343, the context manager should be a single-use object (like files) +# See https://peps.python.org/pep-0343/#caching-context-managers + + +def test_context_manager_opens_resource(): + with Resource("data/table.csv") as resource: + assert resource.closed is False + + +def test_context_manager_closes_resource(): + with Resource("data/table.csv") as resource: + pass + assert resource.closed is True + + +def test_context_manager_returns_same_resource(): + resource = Resource("data/table.csv") + with resource as context_manager_return_value: + assert resource == context_manager_return_value + + +def test_nested_context_causes_exception(): + with pytest.raises(FrictionlessException): + # Create nested with statements to test that we can't open + # the same resource twice via context managers + with Resource("data/table.csv") as resource: + with resource: + pass + + +def test_resource_copy_can_use_nested_context(): + # Create nested with statements to test that we can still open + # the same resource twice via context if we copy the resource + # before the second `with` + with Resource("data/table.csv") as resource: + copy = resource.to_copy() + with copy: + assert copy.closed is False + assert resource.closed is False + + # Check that the original resource 
+        # Check that the original resource is still open
+        assert copy.closed is True
+        assert resource.closed is False
+
+
+def test_resource_can_use_repeated_non_nested_contexts():
+    # Repeated, non-nested contexts are allowed
+    resource = Resource("data/table.csv")
+    with resource:
+        assert resource.closed is False
+    assert resource.closed is True
+
+    with resource:
+        assert resource.closed is False
+    assert resource.closed is True
+
+
+def test_resource_copy_can_use_repeated_context():
+    # Repeated contexts with a copy are allowed
+    resource = Resource("data/table.csv")
+    copy = resource.to_copy()
+    with resource:
+        assert resource.closed is False
+        assert copy.closed is True
+
+    with copy:
+        assert resource.closed is True
+        assert copy.closed is False
+
+
+def test_context_manager_on_open_resource_throw_exception():
+    """
+    Using the Resource in a `with` statement after it has been opened would
+    unexpectedly close the resource at the end of the context, so this is
+    prevented by throwing an exception.
+    """
+    resource = Resource("data/table.csv")
+    resource.open()
+    assert resource.closed is False
+    with pytest.raises(FrictionlessException):
+        with resource:
+            pass
+
+
+def test_explicit_open_can_be_repeated():
+    # Explicit open() calls can be repeated.
+    # Note that the first close() call will close the resource, so anyone
+    # using explicit open() calls must be aware of that.
+    resource = Resource("data/table.csv")
+    resource.open()
+    assert resource.closed is False
+    resource.open()
+    assert resource.closed is False
+    resource.close()
+    assert resource.closed is True
+    resource.close()
+    assert resource.closed is True
+ """ + for _ in range(n - 1): + next(it) + assert next(it) == expected + + +def test_to_petl_gives_valid_table(): + resource = TableResource("data/table.csv") + table = resource.to_petl() + assert util.header(table) == ("id", "name") + + +def test_to_petl_is_iterable(): + resource = TableResource("data/table.csv") + table = resource.to_petl() + it = iter(table) + assert next(it) == ["id", "name"] + assert next(it) == ["1", "english"] + assert next(it) == ["2", "中国人"] + + +def test_to_petl_iterators_are_independent(): + resource = TableResource("data/table.csv") + table = resource.to_petl() + it1 = iter(table) + it2 = iter(table) + + # Start reading from it1 + assert next(it1) == ["id", "name"] + assert next(it1) == ["1", "english"] + + # Check it2 now reads from the start again + assert next(it2) == ["id", "name"] + assert next(it2) == ["1", "english"] + assert next(it2) == ["2", "中国人"] + + # Check it1 is still reading from where it left off + assert next(it1) == ["2", "中国人"] + + +def test_to_petl_iterators_have_independent_lifetime(): + resource = TableResource("data/table-1MB.csv") + table = resource.to_petl() + it1 = iter(table) + + # Assert the 101st row is as expected. + # Need to go that far to get past the buffer that is loaded on open()/__enter__ + # and start reading from the file (as the file is closed by close()/__exit__, + # but the buffer is not, so you would get away with incorrectly closing the + # resource if you remain within the buffer). + # See #1622 for more. + __assert_nth_row( + it1, + 101, + [ + "ahltic", + "22354", + "428.17", + "382.54", + "false", + "1926-09-15T01:15:27Z", + "1956-04-14", + "08:20:13", + "4,5", + '{"x":1,"y":7}', + ], + ) + + # Make a local function to give it2 a different scope + def read_from_it2(): + it2 = iter(table) + __assert_nth_row( + it2, + 101, + [ + "ahltic", + "22354", + "428.17", + "382.54", + "false", + "1926-09-15T01:15:27Z", + "1956-04-14", + "08:20:13", + "4,5", + '{"x":1,"y":7}', + ], + ) + + # Read from it2 within the nested function scope + read_from_it2() + + # Check we can stil read from it1 from where we left off + # Prior to the fix for #1622 this would throw an exception: "ValueError: I/O operation on closed file." + __assert_nth_row( + it1, + 101, + [ + "tlbmv8", + "91378", + "101.19", + "832.96", + "false", + "1983-02-26T12:44:52Z", + "1960-08-28", + "04:44:23", + "5,6", + '{"x":9,"y":4}', + ], + )