From 61f9eb28fcaa9361d07b6ff6a5a304b9ace96beb Mon Sep 17 00:00:00 2001
From: Dipti Pai 
Date: Thu, 10 Apr 2025 15:05:31 -0700
Subject: [PATCH] Sparse Checkout Directories in GitRepositories.
    - Add `.spec.sparseCheckout` and `.status.observedSparseCheckout` fields to `GitRepository`.
    - Add controller support to send the sparse checkout directories to go-git via pkg methods.
    - Use `.status.observedSparseCheckout` to detect drift in configuration.
    - Trim leading "./" in directory paths.
    - Validate spec configuration by checking directories specified in spec exist in the cloned repository after successful checkout
    - Add tests for testing the observed sparse checkout behavior.
    - Add docs describing the new fields.
Signed-off-by: Dipti Pai 
---
 api/v1/gitrepository_types.go                 | 11 +++++
 api/v1/zz_generated.deepcopy.go               | 10 +++++
 ...rce.toolkit.fluxcd.io_gitrepositories.yaml | 15 +++++++
 docs/api/v1/source.md                         | 41 ++++++++++++++++++
 docs/spec/v1/gitrepositories.md               | 43 +++++++++++++++++++
 .../controller/gitrepository_controller.go    | 42 +++++++++++++++++-
 .../gitrepository_controller_test.go          | 32 ++++++++++++++
 7 files changed, 193 insertions(+), 1 deletion(-)
diff --git a/api/v1/gitrepository_types.go b/api/v1/gitrepository_types.go
index 20ef37d0c..590f1a38e 100644
--- a/api/v1/gitrepository_types.go
+++ b/api/v1/gitrepository_types.go
@@ -148,6 +148,12 @@ type GitRepositorySpec struct {
 	// should be included in the Artifact produced for this GitRepository.
 	// +optional
 	Include []GitRepositoryInclude `json:"include,omitempty"`
+
+	// SparseCheckout specifies a list of directories to checkout when cloning
+	// the repository. If specified, only these directories are included in the
+	// Artifact produced for this GitRepository.
+	// +optional
+	SparseCheckout []string `json:"sparseCheckout,omitempty"`
 }
 
 // GitRepositoryInclude specifies a local reference to a GitRepository which
@@ -266,6 +272,11 @@ type GitRepositoryStatus struct {
 	// +optional
 	ObservedInclude []GitRepositoryInclude `json:"observedInclude,omitempty"`
 
+	// ObservedSparseCheckout is the observed list of directories used to
+	// produce the current Artifact.
+	// +optional
+	ObservedSparseCheckout []string `json:"observedSparseCheckout,omitempty"`
+
 	// SourceVerificationMode is the last used verification mode indicating
 	// which Git object(s) have been verified.
 	// +optional
diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
index 12e537fae..9ac5d593d 100644
--- a/api/v1/zz_generated.deepcopy.go
+++ b/api/v1/zz_generated.deepcopy.go
@@ -347,6 +347,11 @@ func (in *GitRepositorySpec) DeepCopyInto(out *GitRepositorySpec) {
 		*out = make([]GitRepositoryInclude, len(*in))
 		copy(*out, *in)
 	}
+	if in.SparseCheckout != nil {
+		in, out := &in.SparseCheckout, &out.SparseCheckout
+		*out = make([]string, len(*in))
+		copy(*out, *in)
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GitRepositorySpec.
@@ -395,6 +400,11 @@ func (in *GitRepositoryStatus) DeepCopyInto(out *GitRepositoryStatus) {
 		*out = make([]GitRepositoryInclude, len(*in))
 		copy(*out, *in)
 	}
+	if in.ObservedSparseCheckout != nil {
+		in, out := &in.ObservedSparseCheckout, &out.ObservedSparseCheckout
+		*out = make([]string, len(*in))
+		copy(*out, *in)
+	}
 	if in.SourceVerificationMode != nil {
 		in, out := &in.SourceVerificationMode, &out.SourceVerificationMode
 		*out = new(GitVerificationMode)
diff --git a/config/crd/bases/source.toolkit.fluxcd.io_gitrepositories.yaml b/config/crd/bases/source.toolkit.fluxcd.io_gitrepositories.yaml
index 0e37a7b49..10cf1162e 100644
--- a/config/crd/bases/source.toolkit.fluxcd.io_gitrepositories.yaml
+++ b/config/crd/bases/source.toolkit.fluxcd.io_gitrepositories.yaml
@@ -174,6 +174,14 @@ spec:
                 required:
                 - name
                 type: object
+              sparseCheckout:
+                description: |-
+                  SparseCheckout specifies a list of directories to checkout when cloning
+                  the repository. If specified, only these directories are included in the
+                  Artifact produced for this GitRepository.
+                items:
+                  type: string
+                type: array
               suspend:
                 description: |-
                   Suspend tells the controller to suspend the reconciliation of this
@@ -443,6 +451,13 @@ spec:
                   ObservedRecurseSubmodules is the observed resource submodules
                   configuration used to produce the current Artifact.
                 type: boolean
+              observedSparseCheckout:
+                description: |-
+                  ObservedSparseCheckout is the observed list of directories used to
+                  produce the current Artifact.
+                items:
+                  type: string
+                type: array
               sourceVerificationMode:
                 description: |-
                   SourceVerificationMode is the last used verification mode indicating
diff --git a/docs/api/v1/source.md b/docs/api/v1/source.md
index 121a056cd..df1b800ce 100644
--- a/docs/api/v1/source.md
+++ b/docs/api/v1/source.md
@@ -523,6 +523,20 @@ the GitRepository as cloned from the URL, using their default settings.
 should be included in the Artifact produced for this GitRepository.
 
 
+
+| + +sparseCheckout+
+[]string
+
+
 | +(Optional)
+ +SparseCheckout specifies a list of directories to checkout when cloning
+the repository. If specified, only these directories are included in the
+Artifact produced for this GitRepository.+ | 
 
 
 
@@ -1863,6 +1877,20 @@ the GitRepository as cloned from the URL, using their default settings.
 should be included in the Artifact produced for this GitRepository.
 
 
+
+| + +sparseCheckout+
+[]string
+
+
 | +(Optional)
+ +SparseCheckout specifies a list of directories to checkout when cloning
+the repository. If specified, only these directories are included in the
+Artifact produced for this GitRepository.+ | 
 
 
 
@@ -1983,6 +2011,19 @@ produce the current Artifact.
 
 
 | + +observedSparseCheckout+
+[]string
+
+
 | +(Optional)
+ +ObservedSparseCheckout is the observed list of directories used to
+produce the current Artifact.+ | 
+
| sourceVerificationMode
diff --git a/docs/spec/v1/gitrepositories.md b/docs/spec/v1/gitrepositories.md
index a9c5d2a2f..b57e2b9da 100644
--- a/docs/spec/v1/gitrepositories.md
+++ b/docs/spec/v1/gitrepositories.md
@@ -615,6 +615,28 @@ list](#default-exclusions), and may overrule the [`.sourceignore` file
 exclusions](#sourceignore-file). See [excluding files](#excluding-files)
 for more information.
 
+### Sparse checkout
+
+`.spec.sparseCheckout` is an optional field to specify a list of directories to
+check out when cloning the repository. If specified, only the specified directory
+contents will be present in the artifact produced for this repository.
+
+```yaml
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: GitRepository
+metadata:
+  name: podinfo
+  namespace: default
+spec:
+  interval: 5m
+  url: https://github.com/stefanprodan/podinfo
+  ref:
+    branch: master
+  sparseCheckout:
+  - charts
+  - kustomize
+```
+
 ### Suspend
 
 `.spec.suspend` is an optional field to suspend the reconciliation of a
@@ -1157,6 +1179,27 @@ status:
   ...
 ```
 
+### Observed Sparse Checkout
+
+The source-controller reports observed sparse checkout in the GitRepository's
+`.status.observedSparseCheckout`. The observed sparse checkout is the latest
+`.spec.sparseCheckout` value which resulted in a [ready
+state](#ready-gitrepository), or stalled due to an error it cannot recover from
+without human intervention. The value is the same as the [sparseCheckout in
+spec](#sparse-checkout). It indicates the sparse checkout configuration used in
+building the current artifact in storage. It is also used by the controller to
+determine if an artifact needs to be rebuilt.
+
+Example:
+```yaml
+status:
+  ...
+  observedSparseCheckout:
+  - charts
+  - kustomize
+  ...
+```
+
 ### Source Verification Mode
 
 The source-controller reports the Git object(s) it verified in the Git
diff --git a/internal/controller/gitrepository_controller.go b/internal/controller/gitrepository_controller.go
index 6b68af55b..d5361be33 100644
--- a/internal/controller/gitrepository_controller.go
+++ b/internal/controller/gitrepository_controller.go
@@ -590,6 +590,16 @@ func (r *GitRepositoryReconciler) reconcileSource(ctx context.Context, sp *patch
 	ctrl.LoggerFrom(ctx).V(logger.DebugLevel).Info("git repository checked out", "url", obj.Spec.URL, "revision", commitReference(obj, commit))
 	conditions.Delete(obj, sourcev1.FetchFailedCondition)
 
+	// Validate sparse checkout paths after successful checkout.
+	if err := r.validateSparseCheckoutPaths(ctx, obj, dir); err != nil {
+		e := serror.NewStalling(
+			fmt.Errorf("failed to sparse checkout directories : %w", err),
+			sourcev1.GitOperationFailedReason,
+		)
+		conditions.MarkTrue(obj, sourcev1.FetchFailedCondition, e.Reason, "%s", e)
+		return sreconcile.ResultEmpty, e
+	}
+
 	// Verify commit signature
 	if result, err := r.verifySignature(ctx, obj, *commit); err != nil || result == sreconcile.ResultEmpty {
 		return result, err
@@ -812,6 +822,7 @@ func (r *GitRepositoryReconciler) reconcileArtifact(ctx context.Context, sp *pat
 	obj.Status.ObservedIgnore = obj.Spec.Ignore
 	obj.Status.ObservedRecurseSubmodules = obj.Spec.RecurseSubmodules
 	obj.Status.ObservedInclude = obj.Spec.Include
+	obj.Status.ObservedSparseCheckout = obj.Spec.SparseCheckout
 
 	// Remove the deprecated symlink.
 	// TODO(hidde): remove 2 minor versions from introduction of v1.
@@ -884,6 +895,7 @@ func (r *GitRepositoryReconciler) reconcileInclude(ctx context.Context, sp *patc
 // performs a git checkout.
 func (r *GitRepositoryReconciler) gitCheckout(ctx context.Context, obj *sourcev1.GitRepository,
 	authOpts *git.AuthOptions, proxyOpts *transport.ProxyOptions, dir string, optimized bool) (*git.Commit, error) {
+
 	// Configure checkout strategy.
 	cloneOpts := repository.CloneConfig{
 		RecurseSubmodules: obj.Spec.RecurseSubmodules,
@@ -896,7 +908,14 @@ func (r *GitRepositoryReconciler) gitCheckout(ctx context.Context, obj *sourcev1
 		cloneOpts.SemVer = ref.SemVer
 		cloneOpts.RefName = ref.Name
 	}
-
+	if obj.Spec.SparseCheckout != nil {
+		// Trim any leading "./" in the directory paths since underlying go-git API does not honor them.
+		sparseCheckoutDirs := make([]string, len(obj.Spec.SparseCheckout))
+		for i, path := range obj.Spec.SparseCheckout {
+			sparseCheckoutDirs[i] = strings.TrimPrefix(path, "./")
+		}
+		cloneOpts.SparseCheckoutDirectories = sparseCheckoutDirs
+	}
 	// Only if the object has an existing artifact in storage, attempt to
 	// short-circuit clone operation. reconcileStorage has already verified
 	// that the artifact exists.
@@ -1172,6 +1191,14 @@ func gitContentConfigChanged(obj *sourcev1.GitRepository, includes *artifactSet)
 	if requiresVerification(obj) {
 		return true
 	}
+	if len(obj.Spec.SparseCheckout) != len(obj.Status.ObservedSparseCheckout) {
+		return true
+	}
+	for index, dir := range obj.Spec.SparseCheckout {
+		if dir != obj.Status.ObservedSparseCheckout[index] {
+			return true
+		}
+	}
 
 	// Convert artifactSet to index addressable artifacts and ensure that it and
 	// the included artifacts include all the include from the spec.
@@ -1206,6 +1233,19 @@ func gitContentConfigChanged(obj *sourcev1.GitRepository, includes *artifactSet)
 	return false
 }
 
+// validateSparseCheckoutPaths checks that every entry of .spec.sparseCheckout
+// can be found under dir after checkout. It returns an error wrapping the
+// os.Lstat failure for the first entry that cannot, and nil otherwise.
+func (r *GitRepositoryReconciler) validateSparseCheckoutPaths(ctx context.Context, obj *sourcev1.GitRepository, dir string) error {
+	// Ranging over a nil or empty slice is a no-op, so no nil guard is needed.
+	for _, sparseDir := range obj.Spec.SparseCheckout {
+		if _, err := os.Lstat(filepath.Join(dir, sparseDir)); err != nil {
+			return fmt.Errorf("sparse checkout dir '%s' does not exist in repository: %w", sparseDir, err)
+		}
+	}
+	return nil
+}
+
 // Returns true if both GitRepositoryIncludes are equal.
 func gitRepositoryIncludeEqual(a, b sourcev1.GitRepositoryInclude) bool {
 	if a.GitRepositoryRef != b.GitRepositoryRef {
diff --git a/internal/controller/gitrepository_controller_test.go b/internal/controller/gitrepository_controller_test.go
index 5eb4713f0..5dca5b160 100644
--- a/internal/controller/gitrepository_controller_test.go
+++ b/internal/controller/gitrepository_controller_test.go
@@ -3130,6 +3130,38 @@ func TestGitContentConfigChanged(t *testing.T) {
 			},
 			want: false,
 		},
+		{
+			name: "unobserved sparse checkout",
+			obj: sourcev1.GitRepository{
+				Spec:   sourcev1.GitRepositorySpec{SparseCheckout: []string{"a/b/c", "x/y/z"}},
+				Status: sourcev1.GitRepositoryStatus{ObservedSparseCheckout: []string{"a/b/c"}},
+			},
+			want: true,
+		},
+		{
+			name: "unobserved case sensitive sparse checkout",
+			obj: sourcev1.GitRepository{
+				Spec:   sourcev1.GitRepositorySpec{SparseCheckout: []string{"a/b/c", "x/y/Z"}},
+				Status: sourcev1.GitRepositoryStatus{ObservedSparseCheckout: []string{"a/b/c", "x/y/z"}},
+			},
+			want: true,
+		},
+		{
+			name: "observed sparse checkout",
+			obj: sourcev1.GitRepository{
+				Spec:   sourcev1.GitRepositorySpec{SparseCheckout: []string{"a/b/c", "x/y/z"}},
+				Status: sourcev1.GitRepositoryStatus{ObservedSparseCheckout: []string{"a/b/c", "x/y/z"}},
+			},
+			want: false,
+		},
+		{
+			name: "observed sparse checkout with leading slash",
+			obj: sourcev1.GitRepository{
+				Spec:   sourcev1.GitRepositorySpec{SparseCheckout: []string{"./a/b/c", "./x/y/z"}},
+				Status: sourcev1.GitRepositoryStatus{ObservedSparseCheckout: []string{"./a/b/c", "./x/y/z"}},
+			},
+			want: false,
+		},
 		{
 			name: "unobserved include",
 			obj: sourcev1.GitRepository{
 |