Skip to content

Commit c21bf24

Browse files
KamilPiechowiak and zxqfd555
authored and committed
operator persistence - basic operators (#7905)
Co-authored-by: Sergey <sergey@pathway.com> GitOrigin-RevId: 5ee8ed454ac463957d414a785e713daf3979b64e
1 parent 4b5c5c0 commit c21bf24

File tree

9 files changed

+982
-268
lines changed

9 files changed

+982
-268
lines changed

python/pathway/tests/test_persistence.py

Lines changed: 305 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,24 @@
33
import json
44
import multiprocessing
55
import os
6+
import pathlib
67
import time
8+
from typing import Callable
79

10+
import pandas as pd
811
import pytest
912

1013
import pathway as pw
14+
from pathway.internals import api
1115
from pathway.internals.parse_graph import G
1216
from pathway.tests.utils import (
1317
CsvPathwayChecker,
18+
consolidate,
1419
needs_multiprocessing_fork,
20+
run,
1521
wait_result_with_checker,
1622
write_csv,
23+
write_lines,
1724
)
1825

1926

@@ -212,7 +219,7 @@ def pw_identity_program():
212219
pw.io.jsonlines.write(table, output_path)
213220
pw.run(persistence_config=persistence_config)
214221

215-
file_contents = {}
222+
file_contents: dict[str, str] = {}
216223
next_file_contents = 0
217224
for sequence in scenario:
218225
expected_diffs = []
@@ -259,8 +266,303 @@ def pw_identity_program():
259266
actual_diffs = []
260267
with open(output_path, "r") as f:
261268
for row in f:
262-
row = json.loads(row)
263-
actual_diffs.append([row["data"], row["diff"]])
269+
row_parsed = json.loads(row)
270+
actual_diffs.append([row_parsed["data"], row_parsed["diff"]])
264271
actual_diffs.sort()
265272
expected_diffs.sort()
266273
assert actual_diffs == expected_diffs
274+
275+
276+
def combine_columns(df: pd.DataFrame) -> pd.Series:
    """Join every non-"time" column of *df* row-wise into a single
    comma-separated string Series.

    Returns ``None`` when *df* has no columns other than "time".
    """
    combined = None
    for name in df.columns:
        if name == "time":
            continue
        as_text = df[name].astype(str)
        combined = as_text if combined is None else combined + "," + as_text
    return combined
286+
287+
288+
def get_one_table_runner(
    tmp_path: pathlib.Path,
    mode: api.PersistenceMode,
    logic: Callable[[pw.Table], pw.Table],
    schema: type[pw.Schema],
) -> tuple[Callable[[list[str], set[str]], None], pathlib.Path]:
    """Build a helper that repeatedly re-runs a one-input persisted pipeline.

    Each invocation of the returned callable writes a new CSV file into the
    input directory, restarts a static pipeline that applies *logic* to the
    table read with *schema*, and asserts that the consolidated output equals
    *expected* (a set of "col1,...,diff" strings). Persistence state is kept
    in ``tmp_path / "p"`` across invocations, so every run only re-emits the
    diffs caused by input changes made since the previous run.

    Returns the runner plus the input directory path (so callers can also
    delete previously written files to trigger retractions).
    """
    input_path = tmp_path / "1"
    os.makedirs(input_path)
    output_path = tmp_path / "out.csv"
    persistent_storage_path = tmp_path / "p"
    count = 0  # numbers the per-run input files: "1", "2", ...

    def run_computation(inputs, expected):
        nonlocal count
        count += 1
        # Reset the global computation graph before building a fresh pipeline.
        G.clear()
        path = input_path / str(count)
        write_lines(path, inputs)
        t_1 = pw.io.csv.read(input_path, schema=schema, mode="static")
        res = logic(t_1)
        pw.io.csv.write(res, output_path)
        run(
            persistence_config=pw.persistence.Config(
                pw.persistence.Backend.filesystem(persistent_storage_path),
                persistence_mode=mode,
            )
        )
        result = consolidate(pd.read_csv(output_path))
        assert set(combine_columns(result)) == expected

    return run_computation, input_path
319+
320+
321+
def get_two_tables_runner(
    tmp_path: pathlib.Path,
    mode: api.PersistenceMode,
    logic: Callable[[pw.Table, pw.Table], pw.Table],
    schema: type[pw.Schema],
    terminate_on_error: bool = True,
) -> tuple[
    Callable[[list[str], list[str], set[str]], None], pathlib.Path, pathlib.Path
]:
    """Build a helper that repeatedly re-runs a two-input persisted pipeline.

    Like ``get_one_table_runner`` but with two input directories; each
    invocation of the returned callable writes one new CSV file into each
    directory, restarts a static pipeline applying *logic* to both tables,
    and asserts the consolidated output equals *expected*. Persistence state
    lives in ``tmp_path / "p"`` across invocations.

    Returns the runner plus both input directory paths so callers can remove
    earlier files to trigger retraction diffs.
    """

    input_path_1 = tmp_path / "1"
    input_path_2 = tmp_path / "2"
    os.makedirs(input_path_1)
    os.makedirs(input_path_2)
    output_path = tmp_path / "out.csv"
    persistent_storage_path = tmp_path / "p"
    count = 0  # numbers the per-run input files in both directories

    def run_computation(inputs_1, inputs_2, expected):
        nonlocal count
        count += 1
        # Reset the global computation graph before building a fresh pipeline.
        G.clear()
        path_1 = input_path_1 / str(count)
        path_2 = input_path_2 / str(count)
        write_lines(path_1, inputs_1)
        write_lines(path_2, inputs_2)
        t_1 = pw.io.csv.read(input_path_1, schema=schema, mode="static")
        t_2 = pw.io.csv.read(input_path_2, schema=schema, mode="static")
        res = logic(t_1, t_2)
        pw.io.csv.write(res, output_path)
        run(
            persistence_config=pw.persistence.Config(
                pw.persistence.Backend.filesystem(persistent_storage_path),
                persistence_mode=mode,
            ),
            terminate_on_error=terminate_on_error,
            # hack to allow changes from different files at different point in time
        )
        result = consolidate(pd.read_csv(output_path))
        assert set(combine_columns(result)) == expected

    return run_computation, input_path_1, input_path_2
363+
364+
365+
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_restrict(tmp_path, mode):
    """Table.restrict must produce correct diffs across persisted restarts.

    Expected sets hold "a,diff" entries; removing an earlier input file of
    table 2 must retract the matching row (diff -1) on the following run.
    """

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)

    def logic(t_1: pw.Table, t_2: pw.Table) -> pw.Table:
        t_2.promise_universe_is_subset_of(t_1)
        return t_1.restrict(t_2)

    run, _, input_path_2 = get_two_tables_runner(
        tmp_path, mode, logic, InputSchema, terminate_on_error=False
    )

    run(["a", "1", "2", "3"], ["a", "1"], {"1,1"})
    run(["a"], ["a", "3"], {"3,1"})
    run(["a", "4", "5"], ["a", "5"], {"5,1"})
    run(["a", "6"], ["a", "4", "6"], {"4,1", "6,1"})
    # Deleting the third input file retracts row 5 on the next run.
    os.remove(input_path_2 / "3")
    run(["a"], ["a"], {"5,-1"})
386+
387+
388+
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_with_universe_of(tmp_path, mode):
    """Table.with_universe_of must produce correct diffs across persisted restarts.

    Output rows are "a,b,c,diff" where c comes from table 2; deleting the
    second pair of input files retracts their rows on the following run.
    """

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)
        b: int

    def logic(t_1: pw.Table, t_2: pw.Table) -> pw.Table:
        return t_1.with_universe_of(t_2).with_columns(c=t_2.b)

    run, input_path_1, input_path_2 = get_two_tables_runner(
        tmp_path, mode, logic, InputSchema, terminate_on_error=False
    )

    run(["a,b", "1,2", "2,3"], ["a,b", "1,3", "2,4"], {"1,2,3,1", "2,3,4,1"})
    run(["a,b", "3,3", "5,1"], ["a,b", "3,4", "5,0"], {"3,3,4,1", "5,1,0,1"})
    # Remove the second run's files from both inputs: rows 3 and 5 are
    # retracted, then row 3 re-enters with its new values.
    os.remove(input_path_1 / "2")
    os.remove(input_path_2 / "2")
    run(
        ["a,b", "3,4"],
        ["a,b", "3,5"],
        {
            "3,3,4,-1",
            "5,1,0,-1",
            "3,4,5,1",
        },
    )
416+
417+
418+
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_intersect(tmp_path, mode):
    """Table.intersect must produce correct diffs across persisted restarts."""

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)

    def logic(t_1: pw.Table, t_2: pw.Table) -> pw.Table:
        return t_1.intersect(t_2)

    run, _, input_path_2 = get_two_tables_runner(tmp_path, mode, logic, InputSchema)

    run(["a", "1", "2", "3"], ["a", "1"], {"1,1"})
    run(["a"], ["a", "3"], {"3,1"})
    run(["a", "4", "5"], ["a", "5", "6"], {"5,1"})
    run(["a", "6"], ["a", "4"], {"4,1", "6,1"})
    # Deleting the third input file removes 5 and 6 from the intersection.
    os.remove(input_path_2 / "3")
    run(["a"], ["a"], {"5,-1", "6,-1"})
436+
437+
438+
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_difference(tmp_path, mode):
    """Table.difference must produce correct diffs across persisted restarts."""

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)

    def logic(t_1: pw.Table, t_2: pw.Table) -> pw.Table:
        return t_1.difference(t_2)

    run, _, input_path_2 = get_two_tables_runner(tmp_path, mode, logic, InputSchema)

    run(["a", "1", "2", "3"], ["a", "1"], {"2,1", "3,1"})
    run(["a"], ["a", "3"], {"3,-1"})
    run(["a", "4", "5"], ["a", "5", "6"], {"4,1"})
    run(["a", "6"], ["a", "4"], {"4,-1"})
    # Deleting the third input file of table 2 puts 5 and 6 back into the
    # difference.
    os.remove(input_path_2 / "3")
    run(["a"], ["a"], {"5,1", "6,1"})
456+
457+
458+
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_sorting_ix(tmp_path, mode):
    """sort + ix must produce correct predecessor pairs across persisted restarts.

    The logic emits "prev_value,value" pairs of consecutive sorted keys, so
    every insertion or deletion re-links the affected neighbours.
    """

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)

    def logic(t_1: pw.Table) -> pw.Table:
        t_1 += t_1.sort(pw.this.a)
        t_1_filtered = t_1.filter(pw.this.prev.is_not_none())
        return t_1_filtered.select(b=t_1.ix(pw.this.prev).a, a=pw.this.a)

    run, input_path = get_one_table_runner(tmp_path, mode, logic, InputSchema)

    run(["a", "1", "6"], {"1,6,1"})
    run(["a", "3"], {"1,6,-1", "1,3,1", "3,6,1"})
    run(["a", "4", "5"], {"3,6,-1", "3,4,1", "4,5,1", "5,6,1"})
    # Removing the second input file deletes key 3 and re-links 1 -> 4.
    os.remove(input_path / "2")
    run(["a"], {"1,3,-1", "3,4,-1", "1,4,1"})
    run(["a", "2"], {"1,4,-1", "1,2,1", "2,4,1"})
478+
479+
480+
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_update_rows(tmp_path, mode):
    """Table.update_rows must produce correct diffs across persisted restarts."""

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)
        b: int

    def logic(t_1: pw.Table, t_2: pw.Table) -> pw.Table:
        return t_1.update_rows(t_2)

    run, _, input_path_2 = get_two_tables_runner(tmp_path, mode, logic, InputSchema)

    run(["a,b", "1,2", "2,4"], ["a,b", "1,3", "3,5"], {"1,3,1", "2,4,1", "3,5,1"})
    run(["a,b", "3,3"], ["a,b", "2,6", "5,1"], {"2,4,-1", "2,6,1", "5,1,1"})
    # Deleting the first update file reverts rows 1 and 3 to table-1 values.
    os.remove(input_path_2 / "1")
    run(["a,b"], ["a,b"], {"3,5,-1", "3,3,1", "1,3,-1", "1,2,1"})
    run(["a,b", "7,10"], ["a,b", "3,8"], {"3,3,-1", "3,8,1", "7,10,1"})
498+
499+
500+
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_update_cells(tmp_path, mode):
    """Table.update_cells must produce correct diffs across persisted restarts."""

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)
        b: int

    def logic(t_1: pw.Table, t_2: pw.Table) -> pw.Table:
        t_2.promise_universe_is_subset_of(t_1)
        return t_1.update_cells(t_2)

    run, _, input_path_2 = get_two_tables_runner(
        tmp_path, mode, logic, InputSchema, terminate_on_error=False
    )

    run(["a,b", "1,2", "2,4"], ["a,b", "1,3"], {"1,3,1", "2,4,1"})
    run(["a,b", "3,3"], ["a,b", "2,6"], {"2,4,-1", "2,6,1", "3,3,1"})
    # Deleting the first update file reverts row 1 to its table-1 value.
    os.remove(input_path_2 / "1")
    run(["a,b"], ["a,b"], {"1,3,-1", "1,2,1"})
    run(["a,b", "7,10"], ["a,b", "3,8"], {"3,3,-1", "3,8,1", "7,10,1"})
521+
522+
523+
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_join(tmp_path, mode):
    """Table.join must produce correct diffs across persisted restarts.

    Output rows are "a,left_b,right_b,diff"; removing a file on the right
    side retracts all matches built from it.
    """

    class InputSchema(pw.Schema):
        a: int = pw.column_definition(primary_key=True)
        b: int

    def logic(t_1: pw.Table, t_2: pw.Table) -> pw.Table:
        return t_1.join(t_2, t_1.a == t_2.a).select(
            pw.this.a, b=pw.left.b, c=pw.right.b
        )

    run, _, input_path_2 = get_two_tables_runner(tmp_path, mode, logic, InputSchema)

    run(["a,b", "1,2", "2,4"], ["a,b", "1,3"], {"1,2,3,1"})
    run(["a,b", "3,3"], ["a,b", "2,6", "1,4"], {"2,4,6,1", "1,2,4,1"})
    # Deleting the first right-side file retracts the (1,2)x(1,3) match.
    os.remove(input_path_2 / "1")
    run(["a,b"], ["a,b"], {"1,2,3,-1"})
    run(["a,b", "1,4"], ["a,b", "1,8"], {"1,2,8,1", "1,4,8,1", "1,4,4,1"})
543+
544+
545+
@pytest.mark.parametrize(
    "mode", [api.PersistenceMode.PERSISTING, api.PersistenceMode.OPERATOR_PERSISTING]
)
def test_groupby(tmp_path, mode):
    """groupby/reduce must produce correct aggregate diffs across persisted restarts.

    Output rows are "a,count,sum,max,diff"; every input change retracts the
    old aggregate for the affected group and asserts the new one.
    """

    class InputSchema(pw.Schema):
        a: int
        b: int

    def logic(t_1: pw.Table) -> pw.Table:
        return t_1.groupby(pw.this.a).reduce(
            pw.this.a,
            c=pw.reducers.count(),
            s=pw.reducers.sum(pw.this.b),
            m=pw.reducers.max(pw.this.b),
        )

    run, input_path = get_one_table_runner(tmp_path, mode, logic, InputSchema)

    run(["a,b", "1,3", "2,4"], {"1,1,3,3,1", "2,1,4,4,1"})
    run(["a,b", "1,1"], {"1,1,3,3,-1", "1,2,4,3,1"})
    run(["a,b", "2,5"], {"2,1,4,4,-1", "2,2,9,5,1"})
    # Removing the second input file drops the (1,1) row, shrinking group 1.
    os.remove(input_path / "2")
    run(["a,b"], {"1,1,3,3,1", "1,2,4,3,-1"})
    run(["a,b", "2,0"], {"2,2,9,5,-1", "2,3,9,5,1"})

python/pathway/tests/utils.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,11 @@ def write_lines(path: str | pathlib.Path, data: str | list[str]):
693693
f.writelines(data)
694694

695695

696+
def read_lines(path: str | pathlib.Path) -> list[str]:
697+
with open(path) as f:
698+
return f.readlines()
699+
700+
696701
def get_aws_s3_settings():
697702
return pw.io.s3.AwsS3Settings(
698703
bucket_name="aws-integrationtest",
@@ -777,3 +782,28 @@ def deprecated_call_here(
777782
*, match: str | re.Pattern[str] | None = None
778783
) -> AbstractContextManager[pytest.WarningsRecorder]:
779784
return warns_here((DeprecationWarning, PendingDeprecationWarning), match=match)
785+
786+
787+
def consolidate(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse duplicate logical rows of a diff-style frame.

    Every column except "time" and "diff" identifies a row. The "diff"
    values of identical rows are summed; the net diff is stored on the first
    occurrence of each key (later occurrences get 0) and rows whose net diff
    is 0 are dropped.

    Note: *df* is modified in place (a temporary "_all_values" column is
    added and "diff" is overwritten) before the filtered frame is returned.
    """
    # Build a per-row string key out of all identifying columns.
    values = None
    for column in df.columns:
        if column in ["time", "diff"]:
            continue
        if values is None:
            values = df[column].astype(str)
        else:
            values = values + "," + df[column].astype(str)
    df["_all_values"] = values

    # Net diff per distinct row key.
    total: dict[str, int] = {}
    for _, row in df.iterrows():
        value = row["_all_values"]
        if value not in total:
            total[value] = 0
        total[value] += row["diff"]

    # Assign the full net diff to the first occurrence of each key and zero
    # out the rest. Iterate df.index rather than range(len(df)): .at uses
    # index labels, so positional iteration broke on non-RangeIndex frames.
    for i in df.index:
        value = df.at[i, "_all_values"]
        df.at[i, "diff"] = total[value]
        total[value] = 0
    return df[df["diff"] != 0].drop(columns=["_all_values"])

0 commit comments

Comments
 (0)