Skip to content

Commit f08c508

Browse files
[dataviewer] Support preview json/csv large file and optimize thread num (#268)
* [dataviewer] Support preview json/csv large file and optimize thread num
* fix golint

Co-authored-by: Haihui.Wang <wanghh2000@163.com>
1 parent fbe3692 commit f08c508

File tree

13 files changed

+490
-142
lines changed

13 files changed

+490
-142
lines changed

common/config/config.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,10 @@ type Config struct {
241241
ActivityMaximumAttempts int32 `env:"OPENCSG_DATAVIEWER_ACTIVITY_MAXIMUM_ATTEMPTS, default=2"`
242242
CacheDir string `env:"OPENCSG_DATAVIEWER_CACHE_DIR, default=/tmp/opencsg"`
243243
DownloadLfsFile bool `env:"OPENCSG_DATAVIEWER_DOWNLOAD_LFS_FILE, default=true"`
244-
ThreadNumOfExport int `env:"OPENCSG_DATAVIEWER_THREAD_NUM_OF_EXPORT, default=4"`
245-
MaxFileSize int64 `env:"OPENCSG_DATAVIEWER_MAX_FILE_SIZE, default=104857600"` // 100 MB
244+
MaxThreadNumOfExport int `env:"OPENCSG_DATAVIEWER_MAX_THREAD_NUM_OF_EXPORT, default=8"`
246245
MaxConcurrentSessionExecutionSize int `env:"OPENCSG_DATAVIEWER_MAX_CONCURRENT_SESSION_EXECUTION_SIZE, default=1"`
247-
SessionExecutionTimeout int `env:"OPENCSG_DATAVIEWER_SESSION_EXECUTION_TIMEOUT, default=240"` // 240 minutes
246+
SessionExecutionTimeout int `env:"OPENCSG_DATAVIEWER_SESSION_EXECUTION_TIMEOUT, default=240"` // 240 mins
247+
ConvertLimitSize int64 `env:"OPENCSG_DATAVIEWER_CONVERT_LIMIT_SIZE, default=5368709120"` // 5G
248248
}
249249

250250
Proxy struct {

component/model.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1073,7 +1073,7 @@ func (c *modelComponentImpl) SetRuntimeFrameworkModes(ctx context.Context, curre
10731073
if err != nil {
10741074
return nil, err
10751075
}
1076-
if relations == nil || len(relations) < 1 {
1076+
if len(relations) < 1 {
10771077
err = c.repoRuntimeFrameworkStore.Add(ctx, id, model.Repository.ID, deployType)
10781078
if err != nil {
10791079
failedModels = append(failedModels, model.Repository.Path)
@@ -1135,7 +1135,7 @@ func (c *modelComponentImpl) ListModelsOfRuntimeFrameworks(ctx context.Context,
11351135
return nil, 0, fmt.Errorf("failed to get repo by deploy type, error:%w", err)
11361136
}
11371137

1138-
if runtimeRepos == nil || len(runtimeRepos) < 1 {
1138+
if len(runtimeRepos) < 1 {
11391139
return nil, 0, nil
11401140
}
11411141

dataviewer/common/types.go

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -67,19 +67,27 @@ type Split struct {
6767
}
6868

6969
type RepoFilesReq struct {
70-
Namespace string
71-
RepoName string
72-
RepoType types.RepositoryType
73-
Ref string
74-
Folder string
75-
GSTree func(ctx context.Context, req gitserver.GetRepoInfoByPathReq) ([]*types.File, error)
70+
Namespace string
71+
RepoName string
72+
RepoType types.RepositoryType
73+
Ref string
74+
Folder string
75+
GSTree func(ctx context.Context, req gitserver.GetRepoInfoByPathReq) ([]*types.File, error)
76+
TotalLimitSize int64
77+
}
78+
79+
type RepoFile struct {
80+
*types.File
81+
DownloadSize int64
7682
}
7783

7884
type RepoFilesClass struct {
79-
AllFiles map[string]*types.File
80-
ParquetFiles map[string]*types.File
81-
JsonlFiles map[string]*types.File
82-
CsvFiles map[string]*types.File
85+
AllFiles map[string]*RepoFile
86+
ParquetFiles map[string]*RepoFile
87+
JsonlFiles map[string]*RepoFile
88+
CsvFiles map[string]*RepoFile
89+
TotalJsonSize int64
90+
TotalCsvSize int64
8391
}
8492

8593
type DownloadCard struct {
@@ -111,6 +119,7 @@ type FileObject struct {
111119
ObjectKey string `yaml:"object_key" json:"object_key"`
112120
LocalRepoPath string `yaml:"local_repo_path" json:"local_repo_path"`
113121
LocalFileName string `yaml:"local_file_name" json:"local_file_name"`
122+
DownloadSize int64 `yaml:"download_size" json:"download_size"`
114123
}
115124

116125
type CataLogRespone struct {
@@ -126,8 +135,8 @@ type WorkflowUpdateParams struct {
126135
}
127136

128137
type ScanRepoFileReq struct {
129-
Req types.UpdateViewerReq
130-
MaxFileSize int64
138+
Req types.UpdateViewerReq
139+
ConvertLimitSize int64
131140
}
132141

133142
type DetermineCardReq struct {
@@ -172,3 +181,17 @@ type UpdateWorkflowStatusReq struct {
172181
WorkflowErrMsg string
173182
ShouldUpdateViewer bool
174183
}
184+
185+
type FileExtName struct {
186+
Parquet string
187+
Jsonl string
188+
Json string
189+
Csv string
190+
}
191+
192+
type SplitName struct {
193+
Train string
194+
Test string
195+
Val string
196+
Other string
197+
}

dataviewer/component/dataset_viewer.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -602,11 +602,11 @@ func (c *datasetViewerComponentImpl) getParquetFilesBySplit(ctx context.Context,
602602

603603
var validator func(string) bool
604604
switch split {
605-
case workflows.TrainSplitName:
605+
case workflows.SplitName.Train:
606606
validator = workflows.IsTrainFile
607-
case workflows.TestSplitName:
607+
case workflows.SplitName.Test:
608608
validator = workflows.IsTestFile
609-
case workflows.ValSplitName:
609+
case workflows.SplitName.Val:
610610
validator = workflows.IsValidationFile
611611
default:
612612
return nil, fmt.Errorf("unknown split type: %s", split)
@@ -661,24 +661,24 @@ func (c *datasetViewerComponentImpl) genDefaultCatalog(ctx context.Context, req
661661
if calcTotal {
662662
total = c.getFilesRowCount(ctx, req, trainFiles)
663663
}
664-
configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.TrainSplitName, Path: trainFiles})
665-
datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.TrainSplitName, NumExamples: total})
664+
configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.SplitName.Train, Path: trainFiles})
665+
datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.SplitName.Train, NumExamples: total})
666666
}
667667
if len(testFiles) > 0 {
668668
total := 0
669669
if calcTotal {
670670
total = c.getFilesRowCount(ctx, req, testFiles)
671671
}
672-
configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.TestSplitName, Path: testFiles})
673-
datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.TestSplitName, NumExamples: total})
672+
configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.SplitName.Test, Path: testFiles})
673+
datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.SplitName.Test, NumExamples: total})
674674
}
675675
if len(valFiles) > 0 {
676676
total := 0
677677
if calcTotal {
678678
total = c.getFilesRowCount(ctx, req, valFiles)
679679
}
680-
configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.ValSplitName, Path: valFiles})
681-
datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.ValSplitName, NumExamples: total})
680+
configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.SplitName.Val, Path: valFiles})
681+
datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.SplitName.Val, NumExamples: total})
682682
}
683683
configData.ConfigName = workflows.DefaultSubsetName
684684
datasetInfo.ConfigName = workflows.DefaultSubsetName

0 commit comments

Comments (0)