Skip to content

Commit 14532fe

Browse files
authored
Support build unknown type (#210)
* feat: support handle unknown file type Signed-off-by: Zhao Chen <[email protected]> * feat: add support for distributed checkpoint file format Signed-off-by: Zhao Chen <[email protected]> * feat: expand supported file formats for config Signed-off-by: Zhao Chen <[email protected]> * feat: expand supported file formats for weight Signed-off-by: Zhao Chen <[email protected]> * feat: expand supported file formats for code Signed-off-by: Zhao Chen <[email protected]> * feat: expand supported file formats for doc Signed-off-by: Zhao Chen <[email protected]> * feat: implement workspace limits for file size, count, and total size Added validation in the workspace generation process to enforce limits on single file size (128GB), maximum file count (1024), and total workspace size (8TB). Included unit tests to verify these constraints. Signed-off-by: Zhao Chen <[email protected]> * feat: add unit tests for model configuration generation and workspace validation Implemented comprehensive unit tests for the `generateByModelConfig` and `generateByConfig` methods, ensuring correct handling of various model configuration scenarios. Enhanced the `validateWorkspace` method tests to cover edge cases, including empty directories and symbolic links. Signed-off-by: Zhao Chen <[email protected]> * feat: deprecate and hide the ignore-unrecognized-file-types flag Updated the command-line flag for ignoring unrecognized file types to mark it as deprecated and hidden, indicating it will be removed in the next release. Signed-off-by: Zhao Chen <[email protected]> * refactor: reorganize constants for file size thresholds and workspace limits Consolidated file size thresholds and workspace limits into a single constants block for improved readability and maintainability. Updated comments for clarity on each constant's purpose. 
Signed-off-by: Zhao Chen <[email protected]> * refactor: replace hardcoded byte size constants with humanize package Updated file size constants to utilize the go-humanize package for improved readability and maintainability. Adjusted the formatBytes function to leverage humanize.Bytes for converting byte sizes to a human-readable format. Signed-off-by: Zhao Chen <[email protected]> * fix: revert error message for existing modelfile check * chore: increase maximum workspace file count from 1024 to 2048 Updated the MaxWorkspaceFileCount constant to allow for a higher limit of files in the workspace, enhancing flexibility for users managing larger projects. --------- Signed-off-by: Zhao Chen <[email protected]>
1 parent 1bd9957 commit 14532fe

File tree

5 files changed

+1036
-102
lines changed

5 files changed

+1036
-102
lines changed

cmd/modelfile/generate.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,10 @@ func init() {
6565
flags.BoolVar(&generateConfig.IgnoreUnrecognizedFileTypes, "ignore-unrecognized-file-types", false, "ignore the unrecognized file types in the workspace")
6666
flags.BoolVar(&generateConfig.Overwrite, "overwrite", false, "overwrite the existing modelfile")
6767

68+
// Mark the ignore-unrecognized-file-types flag as deprecated and hidden
69+
flags.MarkDeprecated("ignore-unrecognized-file-types", "this flag will be removed in the next release")
70+
flags.MarkHidden("ignore-unrecognized-file-types")
71+
6872
if err := viper.BindPFlags(flags); err != nil {
6973
panic(fmt.Errorf("bind cache list flags to viper: %w", err))
7074
}

pkg/config/modelfile/modelfile.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ type GenerateConfig struct {
3131
Name string
3232
Version string
3333
Output string
34-
IgnoreUnrecognizedFileTypes bool
34+
IgnoreUnrecognizedFileTypes bool // [deprecated] will be removed in the next release
3535
Overwrite bool
3636
Arch string
3737
Family string

pkg/modelfile/constants.go

Lines changed: 260 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,17 @@ package modelfile
1919
import (
2020
"path/filepath"
2121
"strings"
22+
23+
"github.com/dustin/go-humanize"
2224
)
2325

2426
var (
2527
// Config file patterns - supported configuration file extensions.
2628
ConfigFilePatterns = []string{
2729
"*.json", // JSON configuration files
2830
"*.jsonl", // JSON Lines format
31+
"*.json5", // JSON5 files
32+
"*.jsonc", // JSON with comments
2933
"*.yaml", // YAML configuration files
3034
"*.yml", // YAML alternative extension
3135
"*.toml", // TOML configuration files
@@ -45,6 +49,12 @@ var (
4549
"*.meta", // Model metadata
4650
"*tokenizer.model*", // Tokenizer files (e.g., Mistral v3)
4751
"config.json.*", // Model configuration variants
52+
"*.hparams", // Hyperparameter files
53+
"*.params", // Parameter files
54+
"*.hyperparams", // Hyperparameter configuration
55+
"*.wandb", // Weights & Biases configuration
56+
"*.mlflow", // MLflow configuration
57+
"*.tensorboard", // TensorBoard configuration
4858
}
4959

5060
// Model file patterns - supported model file extensions.
@@ -56,29 +66,75 @@ var (
5666
"*.bin", // General binary format
5767
"*.pt", // PyTorch model
5868
"*.pth", // PyTorch model (alternative extension)
69+
"*.mar", // PyTorch Model Archive
70+
"*.pte", // PyTorch ExecuTorch format
71+
"*.pt2", // PyTorch 2.0 export format
72+
"*.ptl", // PyTorch Mobile format
5973

6074
// TensorFlow formats.
6175
"*.tflite", // TensorFlow Lite
6276
"*.h5", // Keras HDF5 format
6377
"*.hdf", // Hierarchical Data Format
6478
"*.hdf5", // HDF5 (alternative extension)
79+
"*.pb", // TensorFlow SavedModel/Frozen Graph
80+
"*.meta", // TensorFlow checkpoint metadata
81+
"*.data-*", // TensorFlow checkpoint data files
82+
"*.index", // TensorFlow checkpoint index
83+
84+
// GGML formats.
85+
"*.gguf", // GGML Universal Format
86+
"*.ggml", // GGML format (legacy)
87+
"*.ggmf", // GGMF format (deprecated)
88+
"*.ggjt", // GGJT format (deprecated)
89+
"*.q4_0", // GGML Q4_0 quantization
90+
"*.q4_1", // GGML Q4_1 quantization
91+
"*.q5_0", // GGML Q5_0 quantization
92+
"*.q5_1", // GGML Q5_1 quantization
93+
"*.q8_0", // GGML Q8_0 quantization
94+
"*.f16", // GGML F16 format
95+
"*.f32", // GGML F32 format
96+
97+
// checkpoint formats.
98+
"*.ckpt", // Checkpoint format
99+
"*.checkpoint", // Checkpoint format (alternative extension)
100+
"*.dist_ckpt", // Distributed checkpoint format
101+
102+
// Semantics-specific formats
103+
"*.tensor", // Generic tensor format
104+
"*.weights", // Generic weights format
105+
"*.state", // State files
106+
"*.embedding", // Embedding files
107+
"*.vocab", // Vocabulary files (when binary)
65108

66109
// Other ML frameworks.
67110
"*.ot", // OpenVINO format
68111
"*.engine", // TensorRT format
69112
"*.trt", // TensorRT format (alternative extension)
70113
"*.onnx", // Open Neural Network Exchange format
71-
"*.gguf", // GGML Universal Format
72114
"*.msgpack", // MessagePack serialization
73115
"*.model", // Some NLP frameworks
74116
"*.pkl", // Pickle format
75117
"*.pickle", // Pickle format (alternative extension)
76-
"*.ckpt", // Checkpoint format
77-
"*.checkpoint", // Checkpoint format (alternative extension)
118+
"*.keras", // Keras native format
119+
"*.joblib", // Joblib serialization (scikit-learn)
120+
"*.npy", // NumPy array format
121+
"*.npz", // NumPy compressed archive
122+
"*.nc", // NetCDF format
123+
"*.mlmodel", // Apple Core ML format
124+
"*.coreml", // Apple Core ML format (alternative)
125+
"*.mleap", // MLeap format (Spark ML)
126+
"*.surml", // SurrealML format
127+
"*.llamafile", // Llamafile format
128+
"*.caffemodel", // Caffe model format
129+
"*.prototxt", // Caffe model definition
130+
"*.dlc", // Qualcomm Deep Learning Container
131+
"*.circle", // Samsung Circle format
132+
"*.nb", // Neural Network Binary format
78133
}
79134

80135
// Code file patterns - supported script and notebook files.
81136
CodeFilePatterns = []string{
137+
// language source files
82138
"*.py", // Python source files
83139
"*.ipynb", // Jupyter notebooks
84140
"*.sh", // Shell scripts
@@ -88,42 +144,137 @@ var (
88144
"*.hxx", // C++ header files
89145
"*.cpp", // C++ source files
90146
"*.cc", // C++ source files
147+
"*.cxx", // C++ source files (alternative)
148+
"*.c++", // C++ source files (alternative)
91149
"*.hpp", // C++ header files
92150
"*.hh", // C++ header files
151+
"*.h++", // C++ header files (alternative)
93152
"*.java", // Java source files
94153
"*.js", // JavaScript source files
154+
"*.mjs", // JavaScript ES6 modules
155+
"*.cjs", // CommonJS modules
156+
"*.jsx", // React JSX files
95157
"*.ts", // TypeScript source files
158+
"*.tsx", // TypeScript JSX files
96159
"*.go", // Go source files
97160
"*.rs", // Rust source files
98161
"*.swift", // Swift source files
99162
"*.rb", // Ruby source files
100163
"*.php", // PHP source files
101164
"*.scala", // Scala source files
102165
"*.kt", // Kotlin source files
166+
"*.kts", // Kotlin script files
103167
"*.r", // R source files
168+
"*.R", // R source files (alternative)
104169
"*.m", // MATLAB/Objective-C source files
170+
"*.mm", // Objective-C++ source files
105171
"*.f", // Fortran source files
106172
"*.f90", // Fortran 90 source files
173+
"*.f95", // Fortran 95 source files
174+
"*.f03", // Fortran 2003 source files
175+
"*.f08", // Fortran 2008 source files
107176
"*.jl", // Julia source files
108177
"*.lua", // Lua source files
109178
"*.pl", // Perl source files
179+
"*.pm", // Perl modules
110180
"*.cs", // C# source files
111181
"*.vb", // Visual Basic source files
112182
"*.dart", // Dart source files
113183
"*.groovy", // Groovy source files
114184
"*.elm", // Elm source files
115185
"*.erl", // Erlang source files
186+
"*.hrl", // Erlang header files
116187
"*.ex", // Elixir source files
188+
"*.exs", // Elixir script files
117189
"*.hs", // Haskell source files
190+
"*.lhs", // Literate Haskell source files
118191
"*.clj", // Clojure source files
119192
"*.cljs", // ClojureScript source files
120-
"*.cljc", // Clojure Common Lisp source files
193+
"*.cljc", // Clojure Common source files
121194
"*.cl", // Common Lisp source files
122195
"*.lisp", // Lisp source files
196+
"*.lsp", // Lisp source files (alternative)
123197
"*.scm", // Scheme source files
198+
"*.ss", // Scheme source files (alternative)
199+
"*.rkt", // Racket source files
200+
"*.sql", // SQL files
201+
"*.psql", // PostgreSQL files
202+
"*.mysql", // MySQL files
203+
"*.sqlite", // SQLite files
204+
"*.zig", // Zig source files
124205
"*.cu", // CUDA source files
125206
"*.cuh", // CUDA header files
126207

208+
// Scripting and automation
209+
"*.bash", // Bash scripts
210+
"*.zsh", // Zsh scripts
211+
"*.fish", // Fish shell scripts
212+
"*.csh", // C shell scripts
213+
"*.tcsh", // TC shell scripts
214+
"*.ksh", // Korn shell scripts
215+
"*.ps1", // PowerShell scripts
216+
"*.psm1", // PowerShell modules
217+
"*.psd1", // PowerShell data files
218+
"*.bat", // Windows batch files
219+
"*.cmd", // Windows command files
220+
"*.vbs", // VBScript files
221+
"*.wsf", // Windows Script Files
222+
"*.applescript", // AppleScript files
223+
"*.scpt", // AppleScript compiled files
224+
"*.awk", // AWK scripts
225+
"*.sed", // sed scripts
226+
"*.expect", // Expect scripts
227+
228+
// Build and project files
229+
"*.env", // Environment variable files
230+
"*.env.*", // Environment files with suffixes
231+
".env*", // Environment files (hidden)
232+
"Makefile*", // Makefile variants
233+
"*.dockerfile", // Dockerfile configurations
234+
"Dockerfile*", // Dockerfile variants
235+
"*.mk", // Make include files
236+
"*.cmake", // CMake files
237+
"CMakeLists.txt", // CMake configuration
238+
"*.gradle", // Gradle build files
239+
"*.gradle.kts", // Kotlin DSL Gradle files
240+
"build.gradle*", // Gradle build files
241+
"settings.gradle*", // Gradle settings files
242+
"*.sbt", // SBT build files
243+
"*.mill", // Mill build files
244+
"*.bazel", // Bazel build files
245+
"*.bzl", // Bazel extension files
246+
"BUILD*", // Bazel BUILD files
247+
"WORKSPACE*", // Bazel WORKSPACE files
248+
"*.buck", // Buck build files
249+
"BUCK*", // Buck BUILD files
250+
"*.ninja", // Ninja build files
251+
"*.gyp", // GYP build files
252+
"*.gypi", // GYP include files
253+
"*.waf", // Waf build files
254+
"wscript*", // Waf build scripts
255+
"package.json", // Node.js package file
256+
"package-lock.json", // Node.js lock file
257+
"yarn.lock", // Yarn lock file
258+
"pnpm-lock.yaml", // PNPM lock file
259+
"requirements*.txt", // Python requirements
260+
"Pipfile*", // Python Pipenv files
261+
"pyproject.toml", // Python project configuration
262+
"setup.cfg", // Python setup configuration
263+
"tox.ini", // Python tox configuration
264+
"poetry.lock", // Python Poetry lock file
265+
"Cargo.toml", // Rust package configuration
266+
"Cargo.lock", // Rust lock file
267+
"go.mod", // Go module file
268+
"go.sum", // Go checksum file
269+
"composer.json", // PHP Composer file
270+
"composer.lock", // PHP Composer lock file
271+
"Gemfile*", // Ruby Gemfile
272+
"*.gemspec", // Ruby gem specification
273+
"mix.exs", // Elixir Mix file
274+
"mix.lock", // Elixir Mix lock file
275+
"rebar.config", // Erlang Rebar config
276+
"rebar.lock", // Erlang Rebar lock file
277+
127278
// Library files.
128279
"*.so", // Shared object files
129280
"*.dll", // Dynamic Link Library
@@ -144,6 +295,93 @@ var (
144295
"*requirements*", // Dependency specifications
145296
"*.log", // Log files
146297

298+
// Office documents
299+
"*.doc", // Microsoft Word 97-2003 Document
300+
"*.docx", // Microsoft Word Document
301+
"*.docm", // Word Macro-Enabled Document
302+
"*.dot", // Word 97-2003 Template
303+
"*.dotx", // Word Template
304+
"*.dotm", // Word Macro-Enabled Template
305+
"*.rtf", // Rich Text Format
306+
"*.odt", // OpenDocument Text
307+
"*.ott", // OpenDocument Text Template
308+
"*.fodt", // Flat OpenDocument Text
309+
"*.pages", // Apple Pages document
310+
"*.wpd", // WordPerfect document
311+
312+
// Spreadsheet documents
313+
"*.xls", // Microsoft Excel 97-2003 Workbook
314+
"*.xlsx", // Microsoft Excel Workbook
315+
"*.xlsm", // Excel Macro-Enabled Workbook
316+
"*.xlsb", // Excel Binary Workbook
317+
"*.xlt", // Excel 97-2003 Template
318+
"*.xltx", // Excel Template
319+
"*.xltm", // Excel Macro-Enabled Template
320+
"*.ods", // OpenDocument Spreadsheet
321+
"*.ots", // OpenDocument Spreadsheet Template
322+
"*.fods", // Flat OpenDocument Spreadsheet
323+
"*.numbers", // Apple Numbers spreadsheet
324+
"*.csv", // Comma-Separated Values
325+
326+
// Presentation documents
327+
"*.ppt", // Microsoft PowerPoint 97-2003 Presentation
328+
"*.pptx", // Microsoft PowerPoint Presentation
329+
"*.pptm", // PowerPoint Macro-Enabled Presentation
330+
"*.pps", // PowerPoint 97-2003 Show
331+
"*.ppsx", // PowerPoint Show
332+
"*.ppsm", // PowerPoint Macro-Enabled Show
333+
"*.pot", // PowerPoint 97-2003 Template
334+
"*.potx", // PowerPoint Template
335+
"*.potm", // PowerPoint Macro-Enabled Template
336+
"*.odp", // OpenDocument Presentation
337+
"*.otp", // OpenDocument Presentation Template
338+
"*.fodp", // Flat OpenDocument Presentation
339+
"*.key", // Apple Keynote presentation
340+
341+
// eBook formats
342+
"*.epub", // Electronic Publication
343+
"*.mobi", // Mobipocket eBook
344+
"*.azw", // Amazon Kindle eBook
345+
"*.azw3", // Amazon Kindle eBook (KF8)
346+
"*.fb2", // FictionBook 2.0
347+
"*.fb3", // FictionBook 3.0
348+
"*.lit", // Microsoft Literature
349+
"*.pdb", // Palm Database/Document File
350+
"*.djvu", // DjVu document
351+
"*.djv", // DjVu document (alternative extension)
352+
353+
// Web and markup documents
354+
"*.html", // HyperText Markup Language
355+
"*.htm", // HyperText Markup Language (alternative)
356+
"*.xhtml", // Extensible HyperText Markup Language
357+
"*.mhtml", // MIME HTML (Web Archive)
358+
"*.mht", // MIME HTML (Web Archive, alternative)
359+
"*.xml", // eXtensible Markup Language
360+
"*.xsl", // eXtensible Stylesheet Language
361+
"*.xslt", // XSL Transformations
362+
363+
// Technical documentation formats
364+
"*.tex", // LaTeX document
365+
"*.latex", // LaTeX document (alternative)
366+
"*.ltx", // LaTeX document (alternative)
367+
"*.bib", // BibTeX bibliography
368+
"*.rst", // reStructuredText
369+
"*.asciidoc", // AsciiDoc
370+
"*.adoc", // AsciiDoc (alternative)
371+
"*.textile", // Textile markup
372+
"*.wiki", // Wiki markup
373+
"*.mediawiki", // MediaWiki markup
374+
"*.org", // Org-mode document
375+
"*.texi", // Texinfo document
376+
"*.texinfo", // Texinfo document (alternative)
377+
"*.info", // GNU Info document
378+
"*.man", // Manual page
379+
380+
// Archive and compressed documents
381+
"*.chm", // Compiled HTML Help
382+
"*.hlp", // Windows Help File
383+
"*.xps", // XML Paper Specification
384+
147385
// Image assets.
148386
"*.jpg", // JPEG image format
149387
"*.jpeg", // JPEG alternative extension
@@ -182,6 +420,14 @@ var (
182420
}
183421
)
184422

423+
const (
424+
// File size thresholds and workspace limits. NOTE(review): go-humanize documents MByte/GByte/TByte as decimal (1000-based) units, unlike MiByte/GiByte/TiByte - confirm decimal limits are intended.
425+
WeightFileSizeThreshold int64 = 128 * humanize.MByte // 128MB - unknown files strictly larger than this are treated as weight files (see SizeShouldBeWeightFile)
426+
MaxSingleFileSize int64 = 128 * humanize.GByte // 128GB - maximum size for a single file
427+
MaxWorkspaceFileCount int = 2048 // 2048 files - maximum number of files in workspace
428+
MaxTotalWorkspaceSize int64 = 8 * humanize.TByte // 8TB - maximum total workspace size
429+
)
430+
185431
// IsFileType checks if the filename matches any of the given patterns
186432
func IsFileType(filename string, patterns []string) bool {
187433
// Convert filename to lowercase for case-insensitive comparison
@@ -216,3 +462,13 @@ func isSkippable(filename string) bool {
216462

217463
return false
218464
}
465+
466+
// SizeShouldBeWeightFile reports whether size is strictly greater than WeightFileSizeThreshold; a large file of unknown type is usually a weight file.
467+
func SizeShouldBeWeightFile(size int64) bool {
468+
return size > WeightFileSizeThreshold
469+
}
470+
471+
// formatBytes converts a byte count to a human-readable string by passing it to humanize.Bytes from go-humanize.
// The int64 argument is converted to uint64 first, so a negative value wraps around to a very large number.
// NOTE(review): presumably callers only pass non-negative file sizes - confirm.
472+
func formatBytes(bytes int64) string {
473+
return humanize.Bytes(uint64(bytes))
474+
}

0 commit comments

Comments
 (0)