6 changes: 3 additions & 3 deletions common/config/config.go
@@ -241,10 +241,10 @@ type Config struct {
ActivityMaximumAttempts int32 `env:"OPENCSG_DATAVIEWER_ACTIVITY_MAXIMUM_ATTEMPTS, default=2"`
CacheDir string `env:"OPENCSG_DATAVIEWER_CACHE_DIR, default=/tmp/opencsg"`
DownloadLfsFile bool `env:"OPENCSG_DATAVIEWER_DOWNLOAD_LFS_FILE, default=true"`
-ThreadNumOfExport int `env:"OPENCSG_DATAVIEWER_THREAD_NUM_OF_EXPORT, default=4"`
-MaxFileSize int64 `env:"OPENCSG_DATAVIEWER_MAX_FILE_SIZE, default=104857600"` // 100 MB
+MaxThreadNumOfExport int `env:"OPENCSG_DATAVIEWER_MAX_THREAD_NUM_OF_EXPORT, default=8"`
MaxConcurrentSessionExecutionSize int `env:"OPENCSG_DATAVIEWER_MAX_CONCURRENT_SESSION_EXECUTION_SIZE, default=1"`
-SessionExecutionTimeout int `env:"OPENCSG_DATAVIEWER_SESSION_EXECUTION_TIMEOUT, default=240"` // 240 minutes
+SessionExecutionTimeout int `env:"OPENCSG_DATAVIEWER_SESSION_EXECUTION_TIMEOUT, default=240"` // 240 mins
+ConvertLimitSize int64 `env:"OPENCSG_DATAVIEWER_CONVERT_LIMIT_SIZE, default=5368709120"` // 5G
}

Proxy struct {
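A note on the config.go hunk above: ThreadNumOfExport becomes MaxThreadNumOfExport with the default raised from 4 to 8, the 100 MB per-file MaxFileSize setting is removed, and a new ConvertLimitSize (default 5368709120 bytes, 5 GB) caps how much data the viewer will convert. A minimal sketch of how such a cap might be checked, with invented names (DataViewerConfig, shouldConvert) — an illustration only, not code from this PR:

package main

import "fmt"

// DataViewerConfig mirrors the two settings this hunk touches; the type is invented for the example.
type DataViewerConfig struct {
	MaxThreadNumOfExport int   // export parallelism ceiling, default 8
	ConvertLimitSize     int64 // conversion size cap in bytes, default 5 GB
}

// shouldConvert reports whether a repo's convertible payload fits under the cap.
func shouldConvert(cfg DataViewerConfig, totalSize int64) bool {
	return totalSize <= cfg.ConvertLimitSize
}

func main() {
	cfg := DataViewerConfig{MaxThreadNumOfExport: 8, ConvertLimitSize: 5368709120}
	fmt.Println(shouldConvert(cfg, 6<<30)) // false: a 6 GB payload exceeds the 5 GB cap
}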
47 changes: 35 additions & 12 deletions dataviewer/common/types.go
@@ -67,19 +67,27 @@ type Split struct {
}

type RepoFilesReq struct {
-Namespace string
-RepoName  string
-RepoType  types.RepositoryType
-Ref       string
-Folder    string
-GSTree    func(ctx context.Context, req gitserver.GetRepoInfoByPathReq) ([]*types.File, error)
+Namespace      string
+RepoName       string
+RepoType       types.RepositoryType
+Ref            string
+Folder         string
+GSTree         func(ctx context.Context, req gitserver.GetRepoInfoByPathReq) ([]*types.File, error)
+TotalLimitSize int64
}

+type RepoFile struct {
+*types.File
+DownloadSize int64
+}
+
type RepoFilesClass struct {
-AllFiles     map[string]*types.File
-ParquetFiles map[string]*types.File
-JsonlFiles   map[string]*types.File
-CsvFiles     map[string]*types.File
+AllFiles      map[string]*RepoFile
+ParquetFiles  map[string]*RepoFile
+JsonlFiles    map[string]*RepoFile
+CsvFiles      map[string]*RepoFile
+TotalJsonSize int64
+TotalCsvSize  int64
}

type DownloadCard struct {
@@ -111,6 +119,7 @@ type FileObject struct {
ObjectKey string `yaml:"object_key" json:"object_key"`
LocalRepoPath string `yaml:"local_repo_path" json:"local_repo_path"`
LocalFileName string `yaml:"local_file_name" json:"local_file_name"`
+DownloadSize int64 `yaml:"download_size" json:"download_size"`
}

type CataLogRespone struct {
@@ -126,8 +135,8 @@ type WorkflowUpdateParams struct {
}

type ScanRepoFileReq struct {
-Req         types.UpdateViewerReq
-MaxFileSize int64
+Req              types.UpdateViewerReq
+ConvertLimitSize int64
}

type DetermineCardReq struct {
@@ -172,3 +181,17 @@ type UpdateWorkflowStatusReq struct {
WorkflowErrMsg string
ShouldUpdateViewer bool
}

+type FileExtName struct {
+Parquet string
+Jsonl   string
+Json    string
+Csv     string
+}
+
+type SplitName struct {
+Train string
+Test  string
+Val   string
+Other string
+}
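The types.go changes above introduce RepoFile, which pairs a repo file with the size that will actually be downloaded, and extend RepoFilesClass with running totals for JSON and CSV content; ScanRepoFileReq swaps its MaxFileSize field for ConvertLimitSize to match the new config setting. A minimal sketch of how the new totals might be accumulated while classifying files — the classify helper is invented here, and the Path field is assumed to come from the embedded *types.File:

import "strings"

// classify buckets repo files by extension and sums the new JSON/CSV totals.
// Hypothetical helper: the PR's actual classification logic is not shown in this diff.
func classify(files []*RepoFile) *RepoFilesClass {
	c := &RepoFilesClass{
		AllFiles:     map[string]*RepoFile{},
		ParquetFiles: map[string]*RepoFile{},
		JsonlFiles:   map[string]*RepoFile{},
		CsvFiles:     map[string]*RepoFile{},
	}
	for _, f := range files {
		c.AllFiles[f.Path] = f
		switch {
		case strings.HasSuffix(f.Path, ".parquet"):
			c.ParquetFiles[f.Path] = f
		case strings.HasSuffix(f.Path, ".jsonl"):
			c.JsonlFiles[f.Path] = f
			c.TotalJsonSize += f.DownloadSize
		case strings.HasSuffix(f.Path, ".csv"):
			c.CsvFiles[f.Path] = f
			c.TotalCsvSize += f.DownloadSize
		}
	}
	return c
}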
18 changes: 9 additions & 9 deletions dataviewer/component/dataset_viewer.go
@@ -602,11 +602,11 @@ func (c *datasetViewerComponentImpl) getParquetFilesBySplit(ctx context.Context,

var validator func(string) bool
switch split {
-case workflows.TrainSplitName:
+case workflows.SplitName.Train:
validator = workflows.IsTrainFile
-case workflows.TestSplitName:
+case workflows.SplitName.Test:
validator = workflows.IsTestFile
-case workflows.ValSplitName:
+case workflows.SplitName.Val:
validator = workflows.IsValidationFile
default:
return nil, fmt.Errorf("unknown split type: %s", split)
@@ -661,24 +661,24 @@ func (c *datasetViewerComponentImpl) genDefaultCatalog(ctx context.Context, req
if calcTotal {
total = c.getFilesRowCount(ctx, req, trainFiles)
}
-configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.TrainSplitName, Path: trainFiles})
-datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.TrainSplitName, NumExamples: total})
+configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.SplitName.Train, Path: trainFiles})
+datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.SplitName.Train, NumExamples: total})
}
if len(testFiles) > 0 {
total := 0
if calcTotal {
total = c.getFilesRowCount(ctx, req, testFiles)
}
-configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.TestSplitName, Path: testFiles})
-datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.TestSplitName, NumExamples: total})
+configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.SplitName.Test, Path: testFiles})
+datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.SplitName.Test, NumExamples: total})
}
if len(valFiles) > 0 {
total := 0
if calcTotal {
total = c.getFilesRowCount(ctx, req, valFiles)
}
-configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.ValSplitName, Path: valFiles})
-datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.ValSplitName, NumExamples: total})
+configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.SplitName.Val, Path: valFiles})
+datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.SplitName.Val, NumExamples: total})
}
configData.ConfigName = workflows.DefaultSubsetName
datasetInfo.ConfigName = workflows.DefaultSubsetName
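The dataset_viewer.go hunks above replace the old TrainSplitName/TestSplitName/ValSplitName constants with fields on a shared workflows.SplitName value, matching the SplitName struct added in types.go, so split naming now has a single definition site. The diff does not show how that value is initialized; a plausible shape, assumed purely for illustration (the literal strings are guesses):

// Hypothetical: the actual initialization lives in the workflows package and is not part of this diff.
var SplitName = dvCom.SplitName{
	Train: "train",
	Test:  "test",
	Val:   "val",
	Other: "other",
}

Grouping the names in one struct gives a single place to change split naming and keeps call sites like workflows.SplitName.Train greppable.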