
Commit c9b4e52

wanghh2000 / HaiHui886 authored and committed
[dataviewer] Support preview json/csv large file and optimize thread num
1 parent (bdb6e54) · commit c9b4e52

File tree

12 files changed: +487 / -139 lines


common/config/config.go

Lines changed: 3 additions & 3 deletions
@@ -241,10 +241,10 @@ type Config struct {
 	ActivityMaximumAttempts           int32  `env:"OPENCSG_DATAVIEWER_ACTIVITY_MAXIMUM_ATTEMPTS, default=2"`
 	CacheDir                          string `env:"OPENCSG_DATAVIEWER_CACHE_DIR, default=/tmp/opencsg"`
 	DownloadLfsFile                   bool   `env:"OPENCSG_DATAVIEWER_DOWNLOAD_LFS_FILE, default=true"`
-	ThreadNumOfExport                 int    `env:"OPENCSG_DATAVIEWER_THREAD_NUM_OF_EXPORT, default=4"`
-	MaxFileSize                       int64  `env:"OPENCSG_DATAVIEWER_MAX_FILE_SIZE, default=104857600"` // 100 MB
+	MaxThreadNumOfExport              int    `env:"OPENCSG_DATAVIEWER_MAX_THREAD_NUM_OF_EXPORT, default=8"`
 	MaxConcurrentSessionExecutionSize int    `env:"OPENCSG_DATAVIEWER_MAX_CONCURRENT_SESSION_EXECUTION_SIZE, default=1"`
-	SessionExecutionTimeout           int    `env:"OPENCSG_DATAVIEWER_SESSION_EXECUTION_TIMEOUT, default=240"` // 240 minutes
+	SessionExecutionTimeout           int    `env:"OPENCSG_DATAVIEWER_SESSION_EXECUTION_TIMEOUT, default=240"` // 240 mins
+	ConvertLimitSize                  int64  `env:"OPENCSG_DATAVIEWER_CONVERT_LIMIT_SIZE, default=5368709120"` // 5G
 }
 
 	Proxy struct {
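
The renamed knobs follow the same env-tag convention as the rest of the Config struct: ThreadNumOfExport (default 4) becomes MaxThreadNumOfExport (default 8), MaxFileSize is dropped, and the new ConvertLimitSize (default 5368709120 bytes, i.e. 5 GiB) appears to cap how much data the viewer will convert. A minimal, hypothetical Go sketch of reading those defaults outside the struct-tag loader (loadInt below is illustrative, not part of this repo):

package main

import (
	"fmt"
	"os"
	"strconv"
)

// loadInt mimics the `env:"NAME, default=N"` tags on Config: use the
// environment variable when set, otherwise fall back to the default.
func loadInt(key string, def int64) int64 {
	if v, ok := os.LookupEnv(key); ok {
		if n, err := strconv.ParseInt(v, 10, 64); err == nil {
			return n
		}
	}
	return def
}

func main() {
	// Defaults taken from the diff above: 8 export workers, 5 GiB convert limit.
	maxThreads := loadInt("OPENCSG_DATAVIEWER_MAX_THREAD_NUM_OF_EXPORT", 8)
	convertLimit := loadInt("OPENCSG_DATAVIEWER_CONVERT_LIMIT_SIZE", 5<<30)
	fmt.Printf("export workers capped at %d, convert limit %d bytes\n", maxThreads, convertLimit)
}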

dataviewer/common/types.go

Lines changed: 35 additions & 12 deletions
@@ -67,19 +67,27 @@ type Split struct {
 }
 
 type RepoFilesReq struct {
-	Namespace string
-	RepoName  string
-	RepoType  types.RepositoryType
-	Ref       string
-	Folder    string
-	GSTree    func(ctx context.Context, req gitserver.GetRepoInfoByPathReq) ([]*types.File, error)
+	Namespace      string
+	RepoName       string
+	RepoType       types.RepositoryType
+	Ref            string
+	Folder         string
+	GSTree         func(ctx context.Context, req gitserver.GetRepoInfoByPathReq) ([]*types.File, error)
+	TotalLimitSize int64
+}
+
+type RepoFile struct {
+	*types.File
+	DownloadSize int64
 }
 
 type RepoFilesClass struct {
-	AllFiles     map[string]*types.File
-	ParquetFiles map[string]*types.File
-	JsonlFiles   map[string]*types.File
-	CsvFiles     map[string]*types.File
+	AllFiles      map[string]*RepoFile
+	ParquetFiles  map[string]*RepoFile
+	JsonlFiles    map[string]*RepoFile
+	CsvFiles      map[string]*RepoFile
+	TotalJsonSize int64
+	TotalCsvSize  int64
 }
 
 type DownloadCard struct {
@@ -111,6 +119,7 @@ type FileObject struct {
 	ObjectKey     string `yaml:"object_key" json:"object_key"`
 	LocalRepoPath string `yaml:"local_repo_path" json:"local_repo_path"`
 	LocalFileName string `yaml:"local_file_name" json:"local_file_name"`
+	DownloadSize  int64  `yaml:"download_size" json:"download_size"`
 }
 
 type CataLogRespone struct {
@@ -126,8 +135,8 @@ type WorkflowUpdateParams struct {
 }
 
 type ScanRepoFileReq struct {
-	Req         types.UpdateViewerReq
-	MaxFileSize int64
+	Req              types.UpdateViewerReq
+	ConvertLimitSize int64
 }
 
 type DetermineCardReq struct {
@@ -172,3 +181,17 @@ type UpdateWorkflowStatusReq struct {
 	WorkflowErrMsg     string
 	ShouldUpdateViewer bool
 }
+
+type FileExtName struct {
+	Parquet string
+	Jsonl   string
+	Json    string
+	Csv     string
+}
+
+type SplitName struct {
+	Train string
+	Test  string
+	Val   string
+	Other string
+}
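
The new RepoFile wrapper carries a per-file DownloadSize next to the git metadata, and RepoFilesClass now tracks running totals for json and csv files. A hypothetical sketch (not code from this commit) of how a scanner could populate it, assuming types.File exposes Path and Size and that the strings package is imported; the extension rules here are illustrative only:

// classify is an illustrative helper, not part of this diff: it buckets repo
// files into the new RepoFilesClass maps and accumulates the json/csv totals.
func classify(files []*types.File) *RepoFilesClass {
	c := &RepoFilesClass{
		AllFiles:     map[string]*RepoFile{},
		ParquetFiles: map[string]*RepoFile{},
		JsonlFiles:   map[string]*RepoFile{},
		CsvFiles:     map[string]*RepoFile{},
	}
	for _, f := range files {
		rf := &RepoFile{File: f, DownloadSize: f.Size} // assumes types.File has Path and Size
		c.AllFiles[f.Path] = rf
		switch {
		case strings.HasSuffix(f.Path, ".parquet"):
			c.ParquetFiles[f.Path] = rf
		case strings.HasSuffix(f.Path, ".jsonl"), strings.HasSuffix(f.Path, ".json"):
			c.JsonlFiles[f.Path] = rf
			c.TotalJsonSize += f.Size
		case strings.HasSuffix(f.Path, ".csv"):
			c.CsvFiles[f.Path] = rf
			c.TotalCsvSize += f.Size
		}
	}
	return c
}

Those totals are presumably what gets compared against ScanRepoFileReq.ConvertLimitSize (the renamed MaxFileSize) when deciding whether a large json/csv dataset is still worth converting for preview.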

dataviewer/component/dataset_viewer.go

Lines changed: 9 additions & 9 deletions
@@ -602,11 +602,11 @@ func (c *datasetViewerComponentImpl) getParquetFilesBySplit(ctx context.Context,
 
 	var validator func(string) bool
 	switch split {
-	case workflows.TrainSplitName:
+	case workflows.SplitName.Train:
 		validator = workflows.IsTrainFile
-	case workflows.TestSplitName:
+	case workflows.SplitName.Test:
 		validator = workflows.IsTestFile
-	case workflows.ValSplitName:
+	case workflows.SplitName.Val:
 		validator = workflows.IsValidationFile
 	default:
 		return nil, fmt.Errorf("unknown split type: %s", split)
@@ -661,24 +661,24 @@ func (c *datasetViewerComponentImpl) genDefaultCatalog(ctx context.Context, req
 		if calcTotal {
 			total = c.getFilesRowCount(ctx, req, trainFiles)
 		}
-		configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.TrainSplitName, Path: trainFiles})
-		datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.TrainSplitName, NumExamples: total})
+		configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.SplitName.Train, Path: trainFiles})
+		datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.SplitName.Train, NumExamples: total})
 	}
 	if len(testFiles) > 0 {
 		total := 0
 		if calcTotal {
 			total = c.getFilesRowCount(ctx, req, testFiles)
 		}
-		configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.TestSplitName, Path: testFiles})
-		datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.TestSplitName, NumExamples: total})
+		configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.SplitName.Test, Path: testFiles})
+		datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.SplitName.Test, NumExamples: total})
 	}
 	if len(valFiles) > 0 {
 		total := 0
 		if calcTotal {
 			total = c.getFilesRowCount(ctx, req, valFiles)
 		}
-		configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.ValSplitName, Path: valFiles})
-		datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.ValSplitName, NumExamples: total})
+		configData.DataFiles = append(configData.DataFiles, dvCom.DataFiles{Split: workflows.SplitName.Val, Path: valFiles})
+		datasetInfo.Splits = append(datasetInfo.Splits, dvCom.Split{Name: workflows.SplitName.Val, NumExamples: total})
 	}
 	configData.ConfigName = workflows.DefaultSubsetName
 	datasetInfo.ConfigName = workflows.DefaultSubsetName