|
3 | 3 | import json |
4 | 4 | import multiprocessing |
5 | 5 | import os |
| 6 | +import pathlib |
6 | 7 | import time |
| 8 | +from typing import Callable |
7 | 9 |
|
| 10 | +import pandas as pd |
8 | 11 | import pytest |
9 | 12 |
|
10 | 13 | import pathway as pw |
| 14 | +from pathway.internals import api |
11 | 15 | from pathway.internals.parse_graph import G |
12 | 16 | from pathway.tests.utils import ( |
13 | 17 | CsvPathwayChecker, |
| 18 | + consolidate, |
14 | 19 | needs_multiprocessing_fork, |
| 20 | + run, |
15 | 21 | wait_result_with_checker, |
16 | 22 | write_csv, |
| 23 | + write_lines, |
17 | 24 | ) |
18 | 25 |
|
19 | 26 |
|
@@ -212,7 +219,7 @@ def pw_identity_program(): |
212 | 219 | pw.io.jsonlines.write(table, output_path) |
213 | 220 | pw.run(persistence_config=persistence_config) |
214 | 221 |
|
215 | | - file_contents = {} |
| 222 | + file_contents: dict[str, str] = {} |
216 | 223 | next_file_contents = 0 |
217 | 224 | for sequence in scenario: |
218 | 225 | expected_diffs = [] |
@@ -259,8 +266,303 @@ def pw_identity_program(): |
259 | 266 | actual_diffs = [] |
260 | 267 | with open(output_path, "r") as f: |
261 | 268 | for row in f: |
262 | | - row = json.loads(row) |
263 | | - actual_diffs.append([row["data"], row["diff"]]) |
| 269 | + row_parsed = json.loads(row) |
| 270 | + actual_diffs.append([row_parsed["data"], row_parsed["diff"]]) |
264 | 271 | actual_diffs.sort() |
265 | 272 | expected_diffs.sort() |
266 | 273 | assert actual_diffs == expected_diffs |
| 274 | + |
| 275 | + |
def combine_columns(df: pd.DataFrame) -> pd.Series:
    """Join every column except ``"time"`` into one comma-separated string per row.

    Columns are taken in the frame's own order and each value is converted
    with ``astype(str)`` before joining.

    Args:
        df: Frame whose rows should be flattened; a ``"time"`` column, if
            present, is ignored.

    Returns:
        A Series aligned with ``df.index`` holding the joined strings. When
        the frame has no column other than ``"time"``, a Series of empty
        strings is returned (previously this case returned ``None``, which
        contradicted the annotation and broke callers that iterate the result).
    """
    value_columns = [column for column in df.columns if column != "time"]
    if not value_columns:
        # Nothing to join: keep the documented return type instead of None.
        return pd.Series([""] * len(df), index=df.index, dtype=object)

    first, *rest = value_columns
    result = df[first].astype(str)
    for column in rest:
        # Build a new Series on each step so the input frame is never mutated.
        result = result + ("," + df[column].astype(str))
    return result
| 286 | + |
| 287 | + |
def get_one_table_runner(
    tmp_path: pathlib.Path,
    mode: api.PersistenceMode,
    logic: Callable[[pw.Table], pw.Table],
    schema: type[pw.Schema],
) -> tuple[Callable[[list[str], set[str]], None], pathlib.Path]:
    """Create a step runner for pipelines that read a single CSV directory.

    The directory ``tmp_path / "1"`` is created as the input location. Every
    call of the returned function writes one more input file (named after the
    call number), rebuilds the graph with ``logic`` and runs it with
    filesystem persistence rooted at ``tmp_path / "p"``.

    Args:
        tmp_path: Scratch directory holding inputs, output and persisted state.
        mode: Persistence mode passed to ``pw.persistence.Config``.
        logic: Transformation applied to the table read from the input directory.
        schema: Schema used when reading the CSV input.

    Returns:
        ``(run_computation, input_path)``. ``run_computation(inputs, expected)``
        asserts that the consolidated CSV output, flattened with
        ``combine_columns``, equals ``expected`` as a set.
    """
    input_path = tmp_path / "1"
    input_path.mkdir(parents=True)
    output_path = tmp_path / "out.csv"
    persistent_storage_path = tmp_path / "p"
    runs_so_far = 0

    def run_computation(inputs, expected):
        nonlocal runs_so_far
        runs_so_far += 1
        # Start every step from an empty global graph.
        G.clear()
        write_lines(input_path / str(runs_so_far), inputs)

        source = pw.io.csv.read(input_path, schema=schema, mode="static")
        pw.io.csv.write(logic(source), output_path)

        persistence_config = pw.persistence.Config(
            pw.persistence.Backend.filesystem(persistent_storage_path),
            persistence_mode=mode,
        )
        run(persistence_config=persistence_config)

        observed = set(combine_columns(consolidate(pd.read_csv(output_path))))
        assert observed == expected

    return run_computation, input_path
| 319 | + |
| 320 | + |
def get_two_tables_runner(
    tmp_path: pathlib.Path,
    mode: api.PersistenceMode,
    logic: Callable[[pw.Table, pw.Table], pw.Table],
    schema: type[pw.Schema],
    terminate_on_error: bool = True,
) -> tuple[
    Callable[[list[str], list[str], set[str]], None], pathlib.Path, pathlib.Path
]:
    """Create a step runner for pipelines that read two CSV directories.

    The directories ``tmp_path / "1"`` and ``tmp_path / "2"`` are created as
    the two input locations. Every call of the returned function writes one
    more file (named after the call number) into each of them, rebuilds the
    graph with ``logic`` and runs it with filesystem persistence rooted at
    ``tmp_path / "p"``.

    Args:
        tmp_path: Scratch directory holding inputs, output and persisted state.
        mode: Persistence mode passed to ``pw.persistence.Config``.
        logic: Transformation applied to the two tables read from the inputs.
        schema: Schema used when reading both CSV inputs.
        terminate_on_error: Forwarded unchanged to ``run``.

    Returns:
        ``(run_computation, input_path_1, input_path_2)``.
        ``run_computation(inputs_1, inputs_2, expected)`` asserts that the
        consolidated CSV output, flattened with ``combine_columns``, equals
        ``expected`` as a set.
    """
    input_path_1, input_path_2 = tmp_path / "1", tmp_path / "2"
    input_path_1.mkdir(parents=True)
    input_path_2.mkdir(parents=True)
    output_path = tmp_path / "out.csv"
    persistent_storage_path = tmp_path / "p"
    runs_so_far = 0

    def run_computation(inputs_1, inputs_2, expected):
        nonlocal runs_so_far
        runs_so_far += 1
        # Start every step from an empty global graph.
        G.clear()
        file_name = str(runs_so_far)
        write_lines(input_path_1 / file_name, inputs_1)
        write_lines(input_path_2 / file_name, inputs_2)

        first = pw.io.csv.read(input_path_1, schema=schema, mode="static")
        second = pw.io.csv.read(input_path_2, schema=schema, mode="static")
        pw.io.csv.write(logic(first, second), output_path)

        persistence_config = pw.persistence.Config(
            pw.persistence.Backend.filesystem(persistent_storage_path),
            persistence_mode=mode,
        )
        # NOTE(review): the original code carried the remark "hack to allow
        # changes from different files at different point in time" right after
        # this argument; presumably it refers to terminate_on_error - confirm.
        run(
            persistence_config=persistence_config,
            terminate_on_error=terminate_on_error,
        )

        observed = set(combine_columns(consolidate(pd.read_csv(output_path))))
        assert observed == expected

    return run_computation, input_path_1, input_path_2
| 363 | + |
| 364 | + |
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_restrict(tmp_path, mode):
    """Check ``restrict`` output over successive runs sharing persisted state."""

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)

    def logic(left: pw.Table, right: pw.Table) -> pw.Table:
        right.promise_universe_is_subset_of(left)
        return left.restrict(right)

    step, _, second_input_dir = get_two_tables_runner(
        tmp_path, mode, logic, InputSchema, terminate_on_error=False
    )

    # Each step adds one file per input and checks only the newly emitted rows.
    step(["a", "1", "2", "3"], ["a", "1"], {"1,1"})
    step(["a"], ["a", "3"], {"3,1"})
    step(["a", "4", "5"], ["a", "5"], {"5,1"})
    step(["a", "6"], ["a", "4", "6"], {"4,1", "6,1"})
    # Drop the third file of the second input before the final step.
    (second_input_dir / "3").unlink()
    step(["a"], ["a"], {"5,-1"})
| 386 | + |
| 387 | + |
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_with_universe_of(tmp_path, mode):
    """Check ``with_universe_of`` over successive runs sharing persisted state."""

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)
        b: int

    def logic(left: pw.Table, right: pw.Table) -> pw.Table:
        reindexed = left.with_universe_of(right)
        return reindexed.with_columns(c=right.b)

    step, first_input_dir, second_input_dir = get_two_tables_runner(
        tmp_path, mode, logic, InputSchema, terminate_on_error=False
    )

    step(["a,b", "1,2", "2,3"], ["a,b", "1,3", "2,4"], {"1,2,3,1", "2,3,4,1"})
    step(["a,b", "3,3", "5,1"], ["a,b", "3,4", "5,0"], {"3,3,4,1", "5,1,0,1"})
    # Remove the second file from both inputs before the final step.
    (first_input_dir / "2").unlink()
    (second_input_dir / "2").unlink()
    expected_after_removal = {
        "3,3,4,-1",
        "5,1,0,-1",
        "3,4,5,1",
    }
    step(["a,b", "3,4"], ["a,b", "3,5"], expected_after_removal)
| 416 | + |
| 417 | + |
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_intersect(tmp_path, mode):
    """Check ``intersect`` output over successive runs sharing persisted state."""

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)

    def logic(left: pw.Table, right: pw.Table) -> pw.Table:
        return left.intersect(right)

    step, _, second_input_dir = get_two_tables_runner(
        tmp_path, mode, logic, InputSchema
    )

    step(["a", "1", "2", "3"], ["a", "1"], {"1,1"})
    step(["a"], ["a", "3"], {"3,1"})
    step(["a", "4", "5"], ["a", "5", "6"], {"5,1"})
    step(["a", "6"], ["a", "4"], {"4,1", "6,1"})
    # Drop the third file of the second input before the final step.
    (second_input_dir / "3").unlink()
    step(["a"], ["a"], {"5,-1", "6,-1"})
| 436 | + |
| 437 | + |
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_difference(tmp_path, mode):
    """Check ``difference`` output over successive runs sharing persisted state."""

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)

    def logic(left: pw.Table, right: pw.Table) -> pw.Table:
        return left.difference(right)

    step, _, second_input_dir = get_two_tables_runner(
        tmp_path, mode, logic, InputSchema
    )

    step(["a", "1", "2", "3"], ["a", "1"], {"2,1", "3,1"})
    step(["a"], ["a", "3"], {"3,-1"})
    step(["a", "4", "5"], ["a", "5", "6"], {"4,1"})
    step(["a", "6"], ["a", "4"], {"4,-1"})
    # Drop the third file of the second input before the final step.
    (second_input_dir / "3").unlink()
    step(["a"], ["a"], {"5,1", "6,1"})
| 456 | + |
| 457 | + |
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_sorting_ix(tmp_path, mode):
    """Check ``sort`` combined with ``ix`` over runs sharing persisted state."""

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)

    def logic(table: pw.Table) -> pw.Table:
        # Extend the table with the columns produced by sorting on ``a``.
        table += table.sort(pw.this.a)
        with_predecessor = table.filter(pw.this.prev.is_not_none())
        return with_predecessor.select(b=table.ix(pw.this.prev).a, a=pw.this.a)

    step, input_dir = get_one_table_runner(tmp_path, mode, logic, InputSchema)

    step(["a", "1", "6"], {"1,6,1"})
    step(["a", "3"], {"1,6,-1", "1,3,1", "3,6,1"})
    step(["a", "4", "5"], {"3,6,-1", "3,4,1", "4,5,1", "5,6,1"})
    # Remove the second input file before the remaining steps.
    (input_dir / "2").unlink()
    step(["a"], {"1,3,-1", "3,4,-1", "1,4,1"})
    step(["a", "2"], {"1,4,-1", "1,2,1", "2,4,1"})
| 478 | + |
| 479 | + |
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_update_rows(tmp_path, mode):
    """Check ``update_rows`` output over successive runs sharing persisted state."""

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)
        b: int

    def logic(base: pw.Table, updates: pw.Table) -> pw.Table:
        return base.update_rows(updates)

    step, _, second_input_dir = get_two_tables_runner(
        tmp_path, mode, logic, InputSchema
    )

    step(["a,b", "1,2", "2,4"], ["a,b", "1,3", "3,5"], {"1,3,1", "2,4,1", "3,5,1"})
    step(["a,b", "3,3"], ["a,b", "2,6", "5,1"], {"2,4,-1", "2,6,1", "5,1,1"})
    # Drop the first file of the second input before the remaining steps.
    (second_input_dir / "1").unlink()
    step(["a,b"], ["a,b"], {"3,5,-1", "3,3,1", "1,3,-1", "1,2,1"})
    step(["a,b", "7,10"], ["a,b", "3,8"], {"3,3,-1", "3,8,1", "7,10,1"})
| 498 | + |
| 499 | + |
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_update_cells(tmp_path, mode):
    """Check ``update_cells`` output over successive runs sharing persisted state."""

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)
        b: int

    def logic(base: pw.Table, updates: pw.Table) -> pw.Table:
        updates.promise_universe_is_subset_of(base)
        return base.update_cells(updates)

    step, _, second_input_dir = get_two_tables_runner(
        tmp_path, mode, logic, InputSchema, terminate_on_error=False
    )

    step(["a,b", "1,2", "2,4"], ["a,b", "1,3"], {"1,3,1", "2,4,1"})
    step(["a,b", "3,3"], ["a,b", "2,6"], {"2,4,-1", "2,6,1", "3,3,1"})
    # Drop the first file of the second input before the remaining steps.
    (second_input_dir / "1").unlink()
    step(["a,b"], ["a,b"], {"1,3,-1", "1,2,1"})
    step(["a,b", "7,10"], ["a,b", "3,8"], {"3,3,-1", "3,8,1", "7,10,1"})
| 521 | + |
| 522 | + |
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_join(tmp_path, mode):
    """Check a join on column ``a`` over successive runs sharing persisted state."""

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)
        b: int

    def logic(left: pw.Table, right: pw.Table) -> pw.Table:
        joined = left.join(right, left.a == right.a)
        return joined.select(pw.this.a, b=pw.left.b, c=pw.right.b)

    step, _, second_input_dir = get_two_tables_runner(
        tmp_path, mode, logic, InputSchema
    )

    step(["a,b", "1,2", "2,4"], ["a,b", "1,3"], {"1,2,3,1"})
    step(["a,b", "3,3"], ["a,b", "2,6", "1,4"], {"2,4,6,1", "1,2,4,1"})
    # Drop the first file of the second input before the remaining steps.
    (second_input_dir / "1").unlink()
    step(["a,b"], ["a,b"], {"1,2,3,-1"})
    step(["a,b", "1,4"], ["a,b", "1,8"], {"1,2,8,1", "1,4,8,1", "1,4,4,1"})
| 543 | + |
| 544 | + |
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_groupby(tmp_path, mode):
    """Check count/sum/max per group over runs sharing persisted state."""

    class InputSchema(pw.Schema):
        a: int
        b: int

    def logic(table: pw.Table) -> pw.Table:
        grouped = table.groupby(pw.this.a)
        return grouped.reduce(
            pw.this.a,
            c=pw.reducers.count(),
            s=pw.reducers.sum(pw.this.b),
            m=pw.reducers.max(pw.this.b),
        )

    step, input_dir = get_one_table_runner(tmp_path, mode, logic, InputSchema)

    step(["a,b", "1,3", "2,4"], {"1,1,3,3,1", "2,1,4,4,1"})
    step(["a,b", "1,1"], {"1,1,3,3,-1", "1,2,4,3,1"})
    step(["a,b", "2,5"], {"2,1,4,4,-1", "2,2,9,5,1"})
    # Remove the second input file before the remaining steps.
    (input_dir / "2").unlink()
    step(["a,b"], {"1,1,3,3,1", "1,2,4,3,-1"})
    step(["a,b", "2,0"], {"2,2,9,5,-1", "2,3,9,5,1"})
0 commit comments