Skip to content
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
c1b79d3
feat(traces): implement periodic trace collection and timestamp repor…
CasLubbers Jan 27, 2026
9a2297e
refactor(traces): remove trace collection on installation and apply f…
CasLubbers Jan 27, 2026
2f9b239
feat(traces): implement trace collection loop with dynamic timing bas…
CasLubbers Jan 27, 2026
a32f5bd
fix(traces): update collection interval to 5 minutes for improved tra…
CasLubbers Jan 27, 2026
9680a23
Merge branch 'main' into APL-1444
svcAPLBot Jan 27, 2026
f0924ab
Merge branch 'main' into APL-1444
svcAPLBot Jan 27, 2026
e4e8a09
Merge branch 'main' into APL-1444
svcAPLBot Jan 28, 2026
fbd7e7d
Merge branch 'main' into APL-1444
svcAPLBot Jan 28, 2026
20a874c
Merge branch 'main' into APL-1444
svcAPLBot Jan 28, 2026
725b133
fix: use seconds instead of ms
CasLubbers Jan 28, 2026
e6bd552
Merge branch 'main' into APL-1444
svcAPLBot Jan 28, 2026
807f3ae
Merge branch 'main' into APL-1444
svcAPLBot Jan 28, 2026
bc528b8
Merge branch 'main' into APL-1444
svcAPLBot Jan 29, 2026
d46a238
Merge branch 'main' into APL-1444
svcAPLBot Jan 29, 2026
d22eb89
Merge branch 'main' into APL-1444
svcAPLBot Jan 29, 2026
f064022
Merge branch 'main' into APL-1444
svcAPLBot Jan 29, 2026
a8d2b6f
Merge branch 'main' into APL-1444
svcAPLBot Jan 29, 2026
7c8d355
Merge branch 'main' into APL-1444
svcAPLBot Jan 29, 2026
1314b31
Merge branch 'main' into APL-1444
svcAPLBot Jan 29, 2026
11b8faf
Merge branch 'main' into APL-1444
svcAPLBot Jan 29, 2026
3a8888e
Merge branch 'main' into APL-1444
svcAPLBot Jan 29, 2026
c372340
Merge branch 'main' into APL-1444
svcAPLBot Jan 29, 2026
43039b0
Merge branch 'main' into APL-1444
svcAPLBot Jan 29, 2026
64d61f4
Merge branch 'main' into APL-1444
svcAPLBot Jan 29, 2026
ae7a51e
Merge branch 'main' into APL-1444
svcAPLBot Jan 29, 2026
970300d
Merge branch 'main' into APL-1444
svcAPLBot Jan 30, 2026
dbb2288
Merge branch 'main' into APL-1444
svcAPLBot Jan 30, 2026
6047dbf
fix: review comments
CasLubbers Jan 30, 2026
d0716d9
Merge branch 'main' into APL-1444
svcAPLBot Jan 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions src/cmd/apply.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import { runtimeUpgrade } from '../common/runtime-upgrade'
import { applyAsApps } from './apply-as-apps'
import { applyTeams } from './apply-teams'
import { commit } from './commit'
import { collectTraces } from './traces'

const cmdName = getFilename(__filename)
const dir = '/tmp/otomi/'
Expand Down Expand Up @@ -84,12 +83,6 @@ export const apply = async (): Promise<void> => {
await applyAll()
} catch (e) {
d.error(e)
// Collect traces on apply failure
try {
await collectTraces()
} catch (traceError) {
d.error('Failed to collect traces:', traceError)
}
d.info(`Retrying in ${retryOptions.maxTimeout} ms`)
await deletePendingHelmReleases()
throw e
Expand Down
17 changes: 1 addition & 16 deletions src/cmd/install.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,7 @@ import { cleanupHandler, prepareEnvironment } from 'src/common/cli'
import { logLevelString, terminal } from 'src/common/debug'
import { env } from 'src/common/envalid'
import { deployEssential, hf, HF_DEFAULT_SYNC_ARGS } from 'src/common/hf'
import {
applyServerSide,
deletePendingHelmReleases,
getDeploymentState,
getHelmReleases,
setDeploymentState,
waitForCRD,
} from 'src/common/k8s'
import { applyServerSide, getDeploymentState, getHelmReleases, setDeploymentState, waitForCRD } from 'src/common/k8s'
import { getFilename, rootDir } from 'src/common/utils'
import { getImageTagFromValues, getPackageVersion, writeValuesToFile } from 'src/common/values'
import { getParsedArgs, HelmArguments, helmOptions, setParsedArgs } from 'src/common/yargs'
Expand All @@ -24,7 +17,6 @@ import {
createWelcomeConfigMap,
initialSetupData,
} from './commit'
import { collectTraces } from './traces'

const cmdName = getFilename(__filename)
const dir = '/tmp/otomi/'
Expand Down Expand Up @@ -129,13 +121,6 @@ const install = async (): Promise<void> => {
await installAll()
} catch (e) {
d.error(e)
// Collect traces on installation failure
try {
await collectTraces()
await deletePendingHelmReleases()
} catch (traceError) {
d.error('Failed to collect traces:', traceError)
}
throw e
}
return
Expand Down
45 changes: 32 additions & 13 deletions src/cmd/traces.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,13 @@ describe('Collect Traces Command', () => {
jest.clearAllMocks()
})

// Test helper: returns the parsed JSON report stored under the first ConfigMap data key
// that carries the `report-` prefix; throws when no such key is present.
const extractReportFromConfigMap = (data: Record<string, string>): any => {
  for (const [key, serializedReport] of Object.entries(data)) {
    if (key.startsWith('report-')) {
      return JSON.parse(serializedReport)
    }
  }
  throw new Error('No report key found in ConfigMap data')
}

it('should detect all types of failed resources and store in ConfigMap', async () => {
// Mock various failing resources
mockCoreApi.listPodForAllNamespaces.mockResolvedValue({
Expand Down Expand Up @@ -480,12 +487,15 @@ describe('Collect Traces Command', () => {

await collectTraces()

expect(mockCreateUpdateConfigMap).toHaveBeenCalledWith(mockCoreApi, 'apl-traces-report', 'apl-operator', {
report: expect.any(String),
})
expect(mockCreateUpdateConfigMap).toHaveBeenCalledWith(
mockCoreApi,
'apl-traces-report',
'apl-operator',
expect.objectContaining({}),
)

const configMapCall = mockCreateUpdateConfigMap.mock.calls[0]
const reportData = JSON.parse(configMapCall[3].report)
const [, , , configMapData] = mockCreateUpdateConfigMap.mock.calls[0]
const reportData = extractReportFromConfigMap(configMapData)

// Should have all resource types
expect(reportData.failedResources.length).toBeGreaterThan(0)
Expand Down Expand Up @@ -516,8 +526,12 @@ describe('Collect Traces Command', () => {

await collectTraces()

// Should not create ConfigMap for healthy cluster
expect(mockCreateUpdateConfigMap).not.toHaveBeenCalled()
// Should always create ConfigMap (even when healthy, for timestamp visibility)
expect(mockCreateUpdateConfigMap).toHaveBeenCalled()

const [, , , configMapData] = mockCreateUpdateConfigMap.mock.calls[0]
const reportData = extractReportFromConfigMap(configMapData)
expect(reportData.failedResources).toEqual([])
})

it('should call createUpdateConfigMap when there are issues', async () => {
Expand Down Expand Up @@ -586,8 +600,8 @@ describe('Collect Traces Command', () => {
// Should create ConfigMap with deployment issues
expect(mockCreateUpdateConfigMap).toHaveBeenCalled()

const configMapCall = mockCreateUpdateConfigMap.mock.calls[0]
const reportData = JSON.parse(configMapCall[3].report)
const [, , , configMapData] = mockCreateUpdateConfigMap.mock.calls[0]
const reportData = extractReportFromConfigMap(configMapData)

// Should have deployment in failed resources
expect(reportData.failedResources).toEqual(
Expand Down Expand Up @@ -638,8 +652,8 @@ describe('Collect Traces Command', () => {

await collectTraces()

const configMapCall = mockCreateUpdateConfigMap.mock.calls[0]
const reportData = JSON.parse(configMapCall[3].report)
const [, , , configMapData] = mockCreateUpdateConfigMap.mock.calls[0]
const reportData = extractReportFromConfigMap(configMapData)

// Should not have errors field when all collections succeed
expect(reportData.errors).toBeUndefined()
Expand All @@ -659,7 +673,12 @@ describe('Collect Traces Command', () => {

// Should complete without throwing despite all failures
expect(mockCoreApi.listPodForAllNamespaces).toHaveBeenCalled()
// Should not create ConfigMap when no issues found and all failed
expect(mockCreateUpdateConfigMap).not.toHaveBeenCalled()
// Should always create ConfigMap (even when all fail, for timestamp visibility and error reporting)
expect(mockCreateUpdateConfigMap).toHaveBeenCalled()

const [, , , configMapData] = mockCreateUpdateConfigMap.mock.calls[0]
const reportData = extractReportFromConfigMap(configMapData)
expect(reportData.errors).toBeDefined()
expect(reportData.errors.length).toBeGreaterThan(0)
})
})
88 changes: 81 additions & 7 deletions src/cmd/traces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,19 @@ import { createUpdateConfigMap, k8s } from 'src/common/k8s'
import { getFilename } from 'src/common/utils'
import { BasicArguments, setParsedArgs } from 'src/common/yargs'
import { Argv } from 'yargs'
import { getErrorMessage } from 'src/operator/utils'
import { env } from 'src/common/envalid'

const cmdName = getFilename(__filename)

// Trace collection timing is configured in seconds through the environment (src/common/envalid).
// The millisecond forms below are what the collection loop adds to / compares with Date.now()
// and passes to setTimeout.
const { COLLECTION_INTERVAL_SECONDS, COLLECTION_DURATION_SECONDS } = env
const COLLECTION_INTERVAL_MS = COLLECTION_INTERVAL_SECONDS * 1000
const COLLECTION_DURATION_MS = COLLECTION_DURATION_SECONDS * 1000
/** One resource issue as returned by the get*WithIssues collectors in this file. */
interface ResourceReport {
// Resource type label (presumably the Kubernetes kind; verify against the collectors)
kind: string
// Resource name; collectors fall back to 'unknown' when metadata.name is missing
name: string
// Namespace of the resource; cluster-scoped resources (nodes, PVs) use 'N/A'
namespace: string
// Human-readable description of the issue, e.g. "Desired 3, Available 1"
value: string
// ISO-8601 string produced with new Date().toISOString()
timestamp: string // When this event was captured
}

interface TraceReport {
Expand Down Expand Up @@ -118,6 +123,7 @@ async function getPodsWithIssues(): Promise<ResourceReport[]> {
name: podName,
namespace,
value: issue,
timestamp: new Date().toISOString(),
})
})
}),
Expand All @@ -132,6 +138,7 @@ async function getPodsWithIssues(): Promise<ResourceReport[]> {
async function getDeploymentsWithIssues(): Promise<ResourceReport[]> {
const appsApi = k8s.app()
const response = await appsApi.listDeploymentForAllNamespaces()
const timestamp = new Date().toISOString()

return response.items
.filter((deployment) => deployment.status?.replicas !== deployment.status?.availableReplicas)
Expand All @@ -140,6 +147,7 @@ async function getDeploymentsWithIssues(): Promise<ResourceReport[]> {
name: deployment.metadata?.name || 'unknown',
namespace: deployment.metadata?.namespace || 'default',
value: `Desired ${deployment.status?.replicas}, Available ${deployment.status?.availableReplicas}`,
timestamp,
}))
}

Expand All @@ -158,6 +166,7 @@ async function getStatefulSetsWithIssues(): Promise<ResourceReport[]> {
if (!namespace) return

const response = await appsApi.listNamespacedStatefulSet({ namespace })
const timestamp = new Date().toISOString()
response.items.forEach((sts) => {
const replicas = sts.spec?.replicas || 0
const readyReplicas = sts.status?.readyReplicas || 0
Expand All @@ -167,6 +176,7 @@ async function getStatefulSetsWithIssues(): Promise<ResourceReport[]> {
name: sts.metadata?.name || 'unknown',
namespace,
value: `Desired ${replicas}, Ready ${readyReplicas}`,
timestamp,
})
}
})
Expand All @@ -182,6 +192,7 @@ async function getStatefulSetsWithIssues(): Promise<ResourceReport[]> {
async function getNodesWithIssues(): Promise<ResourceReport[]> {
const coreApi = k8s.core()
const response = await coreApi.listNode()
const timestamp = new Date().toISOString()

return response.items
.filter((node) => node.status?.conditions?.some((cond) => cond.type === 'Ready' && cond.status !== 'True'))
Expand All @@ -190,6 +201,7 @@ async function getNodesWithIssues(): Promise<ResourceReport[]> {
name: node.metadata?.name || 'unknown',
namespace: 'N/A',
value: 'Node not Ready',
timestamp,
}))
}

Expand All @@ -199,6 +211,7 @@ async function getNodesWithIssues(): Promise<ResourceReport[]> {
async function getServicesWithIssues(): Promise<ResourceReport[]> {
const coreApi = k8s.core()
const response = await coreApi.listServiceForAllNamespaces()
const timestamp = new Date().toISOString()

return response.items
.map((service) => {
Expand All @@ -217,6 +230,7 @@ async function getServicesWithIssues(): Promise<ResourceReport[]> {
name,
namespace,
value: issue,
timestamp,
}
}
return null
Expand All @@ -238,6 +252,7 @@ async function getPVCsWithIssues(): Promise<ResourceReport[]> {
if (!namespace) return

const response = await coreApi.listNamespacedPersistentVolumeClaim({ namespace })
const timestamp = new Date().toISOString()
response.items.forEach((pvc) => {
if (pvc.status?.phase !== 'Bound') {
const conditions = pvc.status?.conditions?.map((c) => `${c.type}: ${c.message}`).join('; ') || ''
Expand All @@ -246,6 +261,7 @@ async function getPVCsWithIssues(): Promise<ResourceReport[]> {
name: pvc.metadata?.name || 'unknown',
namespace,
value: `Phase: ${pvc.status?.phase}${conditions ? `. ${conditions}` : ''}`,
timestamp,
})
}
})
Expand All @@ -261,6 +277,7 @@ async function getPVCsWithIssues(): Promise<ResourceReport[]> {
async function getPVsWithIssues(): Promise<ResourceReport[]> {
const coreApi = k8s.core()
const response = await coreApi.listPersistentVolume()
const timestamp = new Date().toISOString()

return response.items
.filter((pv) => pv.status?.phase !== 'Available' && pv.status?.phase !== 'Bound')
Expand All @@ -269,6 +286,7 @@ async function getPVsWithIssues(): Promise<ResourceReport[]> {
name: pv.metadata?.name || 'unknown',
namespace: 'N/A',
value: `Phase: ${pv.status?.phase}`,
timestamp,
}))
}

Expand All @@ -286,6 +304,7 @@ async function getArgoApplicationsWithIssues(): Promise<ResourceReport[]> {
})

const items = (response as any).items || []
const timestamp = new Date().toISOString()

items.forEach((app: any) => {
const name = app.metadata?.name || 'unknown'
Expand Down Expand Up @@ -315,6 +334,7 @@ async function getArgoApplicationsWithIssues(): Promise<ResourceReport[]> {
name,
namespace,
value: issue,
timestamp,
})
})
})
Expand All @@ -328,8 +348,10 @@ async function getArgoApplicationsWithIssues(): Promise<ResourceReport[]> {
async function writeReportToConfigMap(name: string, namespace: string, report: TraceReport): Promise<void> {
  // Serializes the report as pretty-printed JSON and stores it in ConfigMap `namespace/name`
  // under a single key derived from the report's own timestamp.
  const coreApi = k8s.core()
  const reportJson = JSON.stringify(report, null, 2)
  // ConfigMap keys must match [-._a-zA-Z0-9]+, so replace colons with dots
  const reportKey = `report-${report.timestamp.replace(/:/g, '.')}`

  // Only the timestamped key is written. The legacy fixed `report` key is no longer emitted,
  // so the first (and only) write per report carries the `report-<timestamp>` entry.
  await createUpdateConfigMap(coreApi, name, namespace, { [reportKey]: reportJson })
}

/**
Expand Down Expand Up @@ -398,18 +420,70 @@ export async function collectTraces(): Promise<void> {

if (failedResources.length === 0) {
d.info('No failing resources found. Your APL instance seems to be healthy.')
} else {
await writeReportToConfigMap(configMapName, targetNamespace, report)
d.info(
`Trace report stored in ConfigMap ${targetNamespace}/${configMapName} (${failedResources.length} failed resources)`,
)
}

// Always write the report to ConfigMap (even when healthy, for timestamp visibility)
await writeReportToConfigMap(configMapName, targetNamespace, report)
d.info(`Trace report stored in ConfigMap ${targetNamespace}/${configMapName} (${failedResources.length} issues)`)
} catch (error) {
d.error('Failed to collect traces:', error)
throw error
}
}

/**
 * Determines when the trace collection window started.
 *
 * The creation timestamp of the given ConfigMap is used as the start of the window.
 * When the ConfigMap cannot be read, has no creation timestamp, or the timestamp
 * cannot be parsed, the current time is returned instead.
 *
 * @param name - Name of the ConfigMap that holds the trace reports.
 * @param namespace - Namespace of that ConfigMap.
 * @returns Start of the collection window in epoch milliseconds (always a finite number).
 */
async function getCollectionStartTime(name: string, namespace: string): Promise<number> {
  const coreApi = k8s.core()

  try {
    const configMap = await coreApi.readNamespacedConfigMap({ name, namespace })
    const createdAt = configMap.metadata?.creationTimestamp
    if (createdAt) {
      const startTime = new Date(createdAt).getTime()
      // An unparseable timestamp yields NaN; every comparison against a NaN end time is false,
      // which would make the collection loop exit immediately without collecting anything.
      if (Number.isFinite(startTime)) {
        return startTime
      }
    }
  } catch {
    // Best effort: any read failure is treated like "ConfigMap doesn't exist yet"
    // (the expected situation before the first collectTraces() call creates it).
  }

  // No usable creation timestamp, so the window starts now
  return Date.now()
}

/**
 * Periodically collects traces until the collection window has elapsed.
 *
 * The window starts at the time reported by getCollectionStartTime() for the
 * `apl-operator/apl-traces-report` ConfigMap and lasts COLLECTION_DURATION_MS.
 * Within the window, collectTraces() is called, then the loop sleeps for
 * COLLECTION_INTERVAL_MS (or the remaining window time, whichever is shorter).
 * A failing collection is logged as a warning and does not stop the loop.
 *
 * @returns Resolves once the window has elapsed (immediately if it already has).
 */
export async function runTraceCollectionLoop(): Promise<void> {
  const d = terminal('cmd:traces:runTraceCollectionLoop')
  const configMapName = 'apl-traces-report'
  const namespace = 'apl-operator'

  // Get collection start time from ConfigMap creation timestamp
  const startTime = await getCollectionStartTime(configMapName, namespace)
  const endTime = startTime + COLLECTION_DURATION_MS
  const now = Date.now()

  if (now >= endTime) {
    // Report the configured window instead of a hard-coded value: the duration is overridable via env
    const windowMinutes = Math.round(COLLECTION_DURATION_MS / 60000)
    d.info(`Trace collection window (${windowMinutes} minutes) already elapsed, skipping`)
    return
  }

  const remainingMs = endTime - now
  d.info(`Starting trace collection loop (${Math.round(remainingMs / 60000)} minutes remaining)`)

  while (Date.now() < endTime) {
    try {
      d.info('Running periodic trace collection')
      await collectTraces()
    } catch (error) {
      // Keep looping: a single failed collection must not end the whole window
      d.warn('Failed to collect traces:', getErrorMessage(error))
    }

    // Never sleep past the end of the window
    const remainingTime = endTime - Date.now()
    if (remainingTime > 0) {
      const waitTime = Math.min(COLLECTION_INTERVAL_MS, remainingTime)
      await new Promise((resolve) => setTimeout(resolve, waitTime))
    }
  }

  d.info('Trace collection loop completed')
}

export const module = {
command: 'traces',
describe: 'Collect traces of failed resources and store report in ConfigMap',
Expand Down
2 changes: 2 additions & 0 deletions src/common/envalid.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ export const cliEnvSpec = {
desc: 'Target revision to set for ArgoCD applications. If not set, uses the current image tag.',
default: undefined,
}),
COLLECTION_DURATION_SECONDS: num({ desc: 'Traces collection duration (default 30 min)', default: 1800 }),
COLLECTION_INTERVAL_SECONDS: num({ desc: 'Traces collection interval (default 5 min)', default: 300 }),
}

export function cleanEnv<T>(spec: { [K in keyof T]: ValidatorSpec<T[K]> }, options?: CleanOptions<T>) {
Expand Down
Loading
Loading