1
1
import pandas as pd
2
- import requests
2
+ import os
3
+ import pickle
4
+ from datetime import datetime
3
5
4
6
class GuepardDataFrame(pd.DataFrame):
    """A ``pandas.DataFrame`` that can snapshot itself to local pickle files.

    Each call to :meth:`commit` writes the current frame to
    ``<version_dir>/<version_id>.pkl``; :meth:`rollback` loads such a file
    back into this same object.

    Args:
        *args: Forwarded unchanged to ``pandas.DataFrame``.
        version_dir: Keyword-only. Directory that holds the snapshot files;
            created if missing. Defaults to ``"./versions"``.
        **kwargs: Remaining keywords are forwarded to ``pandas.DataFrame``.
    """

    # Registering the name here makes pandas treat ``version_dir`` as a real
    # instance attribute (never as a column) and include it when pickling.
    _metadata = ["version_dir"]

    def __init__(self, *args, **kwargs):
        # Our own option must be removed *before* delegating:
        # pd.DataFrame.__init__ raises TypeError on unknown keywords.
        version_dir = kwargs.pop("version_dir", "./versions")
        super().__init__(*args, **kwargs)
        self.version_dir = version_dir
        os.makedirs(self.version_dir, exist_ok=True)

    def commit(self, message=""):
        """Write a snapshot of the current frame and return its version id.

        Args:
            message: Accepted for interface compatibility.
                NOTE(review): the message is not persisted anywhere yet.

        Returns:
            The id of the new snapshot. It is the timestamp produced by
            ``_generate_version_id``; when a snapshot with that id already
            exists (two commits within the same second) a numeric suffix
            ``_1``, ``_2``, ... is appended instead of overwriting it.
        """
        # The directory may have been removed since construction, and
        # unpickled instances never ran __init__.
        os.makedirs(self.version_dir, exist_ok=True)
        base_id = self._generate_version_id()
        version_id = base_id
        attempt = 0
        while True:
            version_path = os.path.join(self.version_dir, f"{version_id}.pkl")
            try:
                # 'x' refuses to open an existing file, so an earlier
                # snapshot can never be clobbered.
                with open(version_path, "xb") as f:
                    pickle.dump(self, f)
            except FileExistsError:
                attempt += 1
                version_id = f"{base_id}_{attempt}"
            else:
                return version_id

    def list_versions(self):
        """Return the ids of all stored snapshots, sorted ascending.

        Ids are the ``.pkl`` file names in ``version_dir`` without the
        extension. A missing directory yields an empty list.
        """
        try:
            filenames = os.listdir(self.version_dir)
        except FileNotFoundError:
            return []
        return sorted(
            os.path.splitext(filename)[0]
            for filename in filenames
            if filename.endswith(".pkl")
        )

    def rollback(self, version_id):
        """Replace this frame's contents with the snapshot ``version_id``.

        Args:
            version_id: An id previously returned by :meth:`commit`.

        Raises:
            ValueError: If no snapshot file exists for ``version_id``.
        """
        version_path = os.path.join(self.version_dir, f"{version_id}.pkl")
        try:
            with open(version_path, "rb") as f:
                # SECURITY: unpickling can execute arbitrary code. Only use
                # a version_dir whose contents you trust.
                df = pickle.load(f)
        except FileNotFoundError:
            raise ValueError("Version ID not found") from None
        # Re-initialise in place so existing references to this object see
        # the restored data; pass version_dir explicitly so the rollback
        # does not silently reset it to the default.
        self.__init__(df, version_dir=self.version_dir)

    def next_version(self):
        """Commit the current state with an empty message; return its id."""
        return self.commit()

    def _generate_version_id(self):
        """Return the current local time formatted as ``YYYYMMDD_HHMMSS``."""
        return datetime.now().strftime("%Y%m%d_%H%M%S")
41
+
42
+ # Example usage:
43
+ # df = GuepardDataFrame(pd.read_csv("data.csv"), version_dir="path/to/versions")
44
+ # df["new_col"] = df["existing_col"] * 2
45
+ # df.commit("Added new column")
46
+ # print(df.list_versions())
47
+ # df.rollback(version_id="20240326_123456")
0 commit comments