Skip to content

Commit 1b555b0

Browse files
CSPL-3499: Fix Segmentation Fault in Splunk Operator Due to App Volume Name Change (#1468)
* handle error from remoteDataClientMgr * implement unit test for bug fix * add comment in documentation * update comment on test case * udpate error log to be easier to debug
1 parent 61f7583 commit 1b555b0

File tree

4 files changed

+133
-5
lines changed

4 files changed

+133
-5
lines changed

docs/AppFramework.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,8 @@ Here is a typical App framework configuration in a Custom Resource definition:
577577

578578
`appSources` defines the name and scope of the appSource, the remote storage volume, and its location.
579579

580+
NOTE: If an app source name needs to be changed, make sure the name change is persisted across the app framework spec and CR status. The Splunk Operator should automatically update the app path in the CR and on the pod on the next reconciliation.
581+
580582
* `name` uniquely identifies the App source configuration within a CR. This used locally by the Operator to identify the App source.
581583
* `scope` defines the scope of the app to be installed.
582584
* If the scope is `local`, the apps will be installed and run locally on the pod referred to by the CR.

pkg/splunk/enterprise/afwscheduler.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -557,9 +557,6 @@ downloadWork:
557557
continue
558558
}
559559

560-
// increment the count in worker waitgroup
561-
downloadWorker.waiter.Add(1)
562-
563560
// update the download state of app to be DownloadInProgress
564561
updatePplnWorkerPhaseInfo(ctx, downloadWorker.appDeployInfo, downloadWorker.appDeployInfo.PhaseInfo.FailCount, enterpriseApi.AppPkgDownloadInProgress)
565562

@@ -577,7 +574,18 @@ downloadWork:
577574
}
578575

579576
// get the remoteDataClientMgr instance
580-
remoteDataClientMgr, _ := getRemoteDataClientMgr(ctx, downloadWorker.client, downloadWorker.cr, downloadWorker.afwConfig, downloadWorker.appSrcName)
577+
remoteDataClientMgr, err := getRemoteDataClientMgr(ctx, downloadWorker.client, downloadWorker.cr, downloadWorker.afwConfig, downloadWorker.appSrcName)
578+
if err != nil {
579+
scopedLog.Error(err, "unable to get remote data client manager")
580+
// increment the retry count and mark this app as download error
581+
updatePplnWorkerPhaseInfo(ctx, appDeployInfo, appDeployInfo.PhaseInfo.FailCount+1, enterpriseApi.AppPkgDownloadError)
582+
583+
<-downloadWorkersRunPool
584+
continue
585+
}
586+
587+
// increment the count in worker waitgroup
588+
downloadWorker.waiter.Add(1)
581589

582590
// start the actual download
583591
go downloadWorker.download(ctx, pplnPhase, *remoteDataClientMgr, localPath, downloadWorkersRunPool)

pkg/splunk/enterprise/afwscheduler_test.go

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1828,6 +1828,124 @@ func TestScheduleDownloads(t *testing.T) {
18281828
downloadPhaseWaiter.Wait()
18291829
}
18301830

1831+
// This test case is to verify that the operator does not crash when the app name is changed.
1832+
func TestScheduleDownloadsFailRemoteDataClientMgr(t *testing.T) {
1833+
ctx := context.TODO()
1834+
var ppln *AppInstallPipeline
1835+
appDeployContext := &enterpriseApi.AppDeploymentContext{}
1836+
1837+
client := spltest.NewMockClient()
1838+
cr := enterpriseApi.Standalone{
1839+
ObjectMeta: metav1.ObjectMeta{
1840+
Name: "s1",
1841+
Namespace: "test",
1842+
},
1843+
TypeMeta: metav1.TypeMeta{
1844+
Kind: "Standalone",
1845+
},
1846+
Spec: enterpriseApi.StandaloneSpec{
1847+
Replicas: 1,
1848+
AppFrameworkConfig: enterpriseApi.AppFrameworkSpec{
1849+
PhaseMaxRetries: 3,
1850+
AppsRepoPollInterval: 60,
1851+
MaxConcurrentAppDownloads: 5,
1852+
VolList: []enterpriseApi.VolumeSpec{
1853+
{
1854+
Name: "test_volume",
1855+
Endpoint: "https://s3-eu-west-2.amazonaws.com",
1856+
Path: "testbucket-rs-london",
1857+
SecretRef: "s3-secret",
1858+
Provider: "aws",
1859+
},
1860+
},
1861+
AppSources: []enterpriseApi.AppSourceSpec{
1862+
{
1863+
Name: "appSrc1",
1864+
Location: "adminAppsRepo",
1865+
AppSourceDefaultSpec: enterpriseApi.AppSourceDefaultSpec{
1866+
VolName: "test_volume",
1867+
Scope: "local",
1868+
},
1869+
},
1870+
},
1871+
},
1872+
},
1873+
}
1874+
sts := &appsv1.StatefulSet{
1875+
ObjectMeta: metav1.ObjectMeta{
1876+
Name: "splunk-s1-standalone",
1877+
Namespace: "test",
1878+
},
1879+
}
1880+
sts.Spec.Replicas = new(int32)
1881+
*sts.Spec.Replicas = 1
1882+
1883+
ppln = initAppInstallPipeline(ctx, appDeployContext, client, &cr)
1884+
pplnPhase := ppln.pplnPhases[enterpriseApi.PhaseDownload]
1885+
1886+
testApps := []string{"app2.tgz"}
1887+
testHashes := []string{"efgh2222"}
1888+
testSizes := []int64{20}
1889+
1890+
appDeployInfoList := make([]*enterpriseApi.AppDeploymentInfo, 1)
1891+
for index := range testApps {
1892+
appDeployInfoList[index] = &enterpriseApi.AppDeploymentInfo{
1893+
AppName: testApps[index],
1894+
PhaseInfo: enterpriseApi.PhaseInfo{
1895+
Phase: enterpriseApi.PhaseDownload,
1896+
Status: enterpriseApi.AppPkgDownloadPending,
1897+
FailCount: 0,
1898+
},
1899+
ObjectHash: testHashes[index],
1900+
Size: uint64(testSizes[index]),
1901+
}
1902+
}
1903+
1904+
// create the local directory
1905+
localPath := filepath.Join(splcommon.AppDownloadVolume, "downloadedApps", "test" /*namespace*/, "Standalone", cr.Name, "local", "appSrc1") + "/"
1906+
err := createAppDownloadDir(ctx, localPath)
1907+
if err != nil {
1908+
t.Errorf("Unable to create the download directory")
1909+
}
1910+
defer os.RemoveAll(splcommon.AppDownloadVolume)
1911+
1912+
// create the dummy app package for appSrc1 locally, to test the case
1913+
// where an app is already downloaded and hence we dont re-download it
1914+
appFileName := testApps[0] + "_" + testHashes[0]
1915+
appLoc := filepath.Join(localPath, appFileName)
1916+
err = createOrTruncateAppFileLocally(appLoc, testSizes[0])
1917+
if err != nil {
1918+
t.Errorf("Unable to create the app files locally")
1919+
}
1920+
defer os.Remove(appLoc)
1921+
1922+
// create pipeline workers
1923+
ppln.createAndAddPipelineWorker(ctx, enterpriseApi.PhaseDownload, appDeployInfoList[0], "appSrc2", "", &cr.Spec.AppFrameworkConfig, client, &cr, sts)
1924+
1925+
maxWorkers := 1
1926+
downloadPhaseWaiter := new(sync.WaitGroup)
1927+
1928+
downloadPhaseWaiter.Add(1)
1929+
// schedule the download threads to do actual download work
1930+
go pplnPhase.downloadWorkerHandler(ctx, ppln, uint64(maxWorkers), downloadPhaseWaiter)
1931+
1932+
// add the workers to msgChannel so that scheduleDownlads thread can pick them up
1933+
for _, downloadWorker := range pplnPhase.q {
1934+
downloadWorker.waiter = &pplnPhase.workerWaiter
1935+
pplnPhase.msgChannel <- downloadWorker
1936+
downloadWorker.isActive = true
1937+
}
1938+
1939+
close(pplnPhase.msgChannel)
1940+
1941+
downloadPhaseWaiter.Add(1)
1942+
close(ppln.sigTerm)
1943+
// schedule the download threads to do actual download work
1944+
go pplnPhase.downloadWorkerHandler(ctx, ppln, uint64(maxWorkers), downloadPhaseWaiter)
1945+
1946+
downloadPhaseWaiter.Wait()
1947+
}
1948+
18311949
func TestCreateDownloadDirOnOperator(t *testing.T) {
18321950
ctx := context.TODO()
18331951
standalone := enterpriseApi.Standalone{

pkg/splunk/enterprise/util.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -535,7 +535,7 @@ func getRemoteObjectKey(ctx context.Context, cr splcommon.MetaObject, appFramewo
535535

536536
appSrc, err := getAppSrcSpec(appFrameworkConfig.AppSources, appSrcName)
537537
if err != nil {
538-
scopedLog.Error(err, "unable to get appSrcSpc")
538+
scopedLog.Error(err, "unable to get appSourceSpec")
539539
return remoteObjectKey, err
540540
}
541541

0 commit comments

Comments
 (0)