s3select_plus/create_files.py at main · gabriel-oana/s3select_plus · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67

from select_plus.src.aws.s3 import S3
from select_plus.src.engine.parallel_engine import ParallelEngine


s3 = S3()
bucket_name = 's3selectplus'
with open('tests/files/sample.json', 'r') as j:
    json_body = j.read()


def create_one_json_file(i):
    json_key = f'large_json/file{i}.json'
    s3.put_object(bucket_name=bucket_name, key=json_key, body=json_body)


def create_json_files_parallel():
    p = ParallelEngine(
        bucket_name=bucket_name,
        prefix='large_json',
        threads=32,
        verbose=True
    )
    func_args = list(range(0, 2000))
    # print(func_args)
    p.execute_callable(create_one_json_file, func_args)


def create_files():

    prefix = 1
    for i in range(2000):
        print(i)
        if i % 100 == 0:
            prefix = i

        # Create json files
        with open('tests/files/sample.json', 'r') as j:
            json_body = j.read()
        json_key = f'large_json/{prefix}/file{i}.json'
        s3.put_object(bucket_name=bucket_name, key=json_key, body=json_body)


def create_tabular_files():
    import farsante
    from mimesis import Person
    prefix = 1
    for i in range(200):
        print(i)
        if i % 10 == 0:
            prefix = i

        # Create csv files
        with open('tests/files/sample.csv', 'r') as f:
            csv_body = f.read()
        csv_key = f'csv/{prefix}/file{i}.csv'

        # Create large parquet files
        mx = Person('en')
        df = farsante.pandas_df([mx.first_name, mx.last_name, mx.university, mx.title, mx.views_on, mx.weight, mx.sex, mx.political_views, mx.occupation, mx.nationality], 26000)
        df.to_parquet(f's3://{bucket_name}/parquet/{prefix}/file{i}.parquet', index=False, compression='snappy')

        s3.put_object(bucket_name=bucket_name, key=csv_key, body=csv_body)


if __name__ == '__main__':
    create_json_files_parallel()