diff --git a/.github/workflows/docker-security-scan.yml b/.github/workflows/docker-security-scan.yml
new file mode 100644
index 00000000..2ddad760
--- /dev/null
+++ b/.github/workflows/docker-security-scan.yml
@@ -0,0 +1,218 @@
+name: Docker Security Scan
+
+on:
+  push:
+    branches: [main, develop]
+    paths:
+      - "docker/**"
+      - "templates/docker-compose/**"
+      - ".github/workflows/docker-security-scan.yml"
+
+  pull_request:
+    paths:
+      - "docker/**"
+      - "templates/docker-compose/**"
+      - ".github/workflows/docker-security-scan.yml"
+
+  # A daily scheduled scan catches newly published CVEs
+  # even when neither the code nor the images have changed
+  schedule:
+    - cron: "0 6 * * *" # Daily at 6 AM UTC
+
+  workflow_dispatch:
+
+jobs:
+  scan-project-images:
+    name: Scan Project-Built Docker Images
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    permissions:
+      contents: read
+
+    strategy:
+      fail-fast: false
+      matrix:
+        image:
+          - dockerfile: docker/provisioned-instance/Dockerfile
+            context: docker/provisioned-instance
+            name: provisioned-instance
+          - dockerfile: docker/ssh-server/Dockerfile
+            context: docker/ssh-server
+            name: ssh-server
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # Build each image locally, with the build context declared in its
+      # matrix entry, so Trivy scans exactly what this repository produces
+      - name: Build Docker image
+        run: |
+          docker build \
+            -t torrust-tracker-deployer/${{ matrix.image.name }}:latest \
+            -f ${{ matrix.image.dockerfile }} \
+            ${{ matrix.image.context }}
+
+      # Print the findings as a table in the job log for readers.
+      # exit-code is "0", so findings here do not fail the job.
+      - name: Display vulnerabilities (table format)
+        uses: aquasecurity/trivy-action@0.33.1
+        with:
+          image-ref: torrust-tracker-deployer/${{ matrix.image.name }}:latest
+          severity: 'HIGH,CRITICAL'
+          format: 'table'
+          exit-code: '0'
+
+      # Produce the SARIF report consumed by GitHub Code Scanning.
+      #
+      # Keep in mind:
+      #   - exit-code has to remain 0
+      #   - Trivy can exit with 1 even though no vulns were found
+      #   - enforcement is left to the GitHub Security UI
+      - name: Generate SARIF (Code Scanning)
+        uses: aquasecurity/trivy-action@0.33.1
+        with:
+          image-ref: torrust-tracker-deployer/${{ matrix.image.name }}:latest
+          scanners: 'vuln'
+          severity: 'HIGH,CRITICAL'
+          format: 'sarif'
+          output: 'trivy-${{ matrix.image.name }}.sarif'
+          exit-code: '0'
+
+      - name: Upload SARIF artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: sarif-project-${{ matrix.image.name }}-${{ github.run_id }}
+          path: trivy-${{ matrix.image.name }}.sarif
+          retention-days: 30
+
+  scan-third-party-images:
+    name: Scan Third-Party Docker Images
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    permissions:
+      contents: read
+
+    strategy:
+      fail-fast: false
+      matrix:
+        # Keep this list in sync with the images referenced by
+        # templates/docker-compose/docker-compose.yml.tera
+        image:
+          - 'torrust/tracker:develop'
+          - 'mysql:8.0'
+          - 'grafana/grafana:11.4.0'
+          - 'prom/prometheus:v3.0.1'
+
+    steps:
+      - name: Display vulnerabilities (table format)
+        uses: aquasecurity/trivy-action@0.33.1
+        with:
+          image-ref: ${{ matrix.image }}
+          severity: 'HIGH,CRITICAL'
+          format: 'table'
+          exit-code: '0'
+
+      # Findings in third-party images must never block CI;
+      # they are only reported to GitHub Security.
+      - name: Generate SARIF (Code Scanning)
+        uses: aquasecurity/trivy-action@0.33.1
+        with:
+          image-ref: ${{ matrix.image }}
+          scanners: 'vuln'
+          severity: 'HIGH,CRITICAL'
+          format: 'sarif'
+          output: 'trivy.sarif'
+          exit-code: '0'
+
+      # Turn '/' and ':' in the image ref into '-' for the artifact name
+      - name: Sanitize image name
+        id: sanitize
+        run: |
+          echo "name=$(echo '${{ matrix.image }}' | tr '/:' '-')" >> "$GITHUB_OUTPUT"
+
+      - name: Upload SARIF artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: sarif-third-party-${{ steps.sanitize.outputs.name }}-${{ github.run_id }}
+          path: trivy.sarif
+          retention-days: 30
+
+  upload-sarif-results:
+    name: Upload SARIF Results to GitHub Security
+    needs:
+      - scan-project-images
+      - scan-third-party-images
+    runs-on: ubuntu-latest
+
+    # Run even when a scan job failed, so available results still get uploaded
+    if: always()
+
+    permissions:
+      security-events: write
+
+    steps:
+      - name: Download all SARIF artifacts
+        uses: actions/download-artifact@v4
+        with:
+          pattern: sarif-*-${{ github.run_id }}
+
+      # Each SARIF file is uploaded through the CodeQL Action under its own category.
+      # A distinct category per image lets alerts be tracked per image.
+      # The CodeQL Action is required here (not the gh API): the API has no category field.
+      #
+      # WHERE TO FIND THE RESULTS:
+      # - Pull requests: /security/code-scanning?query=pr:NUMBER+is:open
+      # - Other branches: /security/code-scanning?query=is:open+branch:BRANCH-NAME
+      # - Main branch: /security/code-scanning?query=is:open+branch:main (default view)
+      # By default the Security tab filters on "is:open branch:main", so it lists only
+      # the alerts found on the main branch and none of those found on PR branches.
+      - name: Upload project provisioned-instance SARIF
+        uses: github/codeql-action/upload-sarif@v4
+        if: always()
+        continue-on-error: true
+        with:
+          sarif_file: sarif-project-provisioned-instance-${{ github.run_id }}/trivy-provisioned-instance.sarif
+          category: docker-project-provisioned-instance
+
+      - name: Upload project ssh-server SARIF
+        uses: github/codeql-action/upload-sarif@v4
+        if: always()
+        continue-on-error: true
+        with:
+          sarif_file: sarif-project-ssh-server-${{ github.run_id }}/trivy-ssh-server.sarif
+          category: docker-project-ssh-server
+
+      - name: Upload third-party mysql SARIF
+        uses: github/codeql-action/upload-sarif@v4
+        if: always()
+        continue-on-error: true
+        with:
+          sarif_file: sarif-third-party-mysql-8.0-${{ github.run_id }}/trivy.sarif
+          category: docker-third-party-mysql-8.0
+
+      - name: Upload third-party tracker SARIF
+        uses: github/codeql-action/upload-sarif@v4
+        if: always()
+        continue-on-error: true
+        with:
+          sarif_file: sarif-third-party-torrust-tracker-develop-${{ github.run_id }}/trivy.sarif
+          category: docker-third-party-torrust-tracker-develop
+
+      - name: Upload third-party grafana SARIF
+        uses: github/codeql-action/upload-sarif@v4
+        if: always()
+        continue-on-error: true
+        with:
+          sarif_file: sarif-third-party-grafana-grafana-11.4.0-${{ github.run_id }}/trivy.sarif
+          category: docker-third-party-grafana-grafana-11.4.0
+
+      - name: Upload third-party prometheus SARIF
+        uses: github/codeql-action/upload-sarif@v4
+        if: always()
+        continue-on-error: true
+        with:
+          sarif_file: sarif-third-party-prom-prometheus-v3.0.1-${{ github.run_id }}/trivy.sarif
+          category: docker-third-party-prom-prometheus-v3.0.1
diff --git a/README.md b/README.md
index ab07bda4..95365445 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-[![Linting](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/linting.yml/badge.svg)](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/linting.yml) 
[![Testing](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/testing.yml/badge.svg)](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/testing.yml) [![E2E Infrastructure Tests](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/test-e2e-infrastructure.yml/badge.svg)](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/test-e2e-infrastructure.yml) [![E2E Deployment Tests](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/test-e2e-deployment.yml/badge.svg)](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/test-e2e-deployment.yml) [![Test LXD Container Provisioning](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/test-lxd-provision.yml/badge.svg)](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/test-lxd-provision.yml) [![Coverage](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/coverage.yml/badge.svg)](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/coverage.yml) +[![Linting](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/linting.yml/badge.svg)](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/linting.yml) [![Testing](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/testing.yml/badge.svg)](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/testing.yml) [![E2E Infrastructure Tests](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/test-e2e-infrastructure.yml/badge.svg)](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/test-e2e-infrastructure.yml) [![E2E Deployment Tests](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/test-e2e-deployment.yml/badge.svg)](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/test-e2e-deployment.yml) [![Test LXD Container 
Provisioning](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/test-lxd-provision.yml/badge.svg)](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/test-lxd-provision.yml) [![Coverage](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/coverage.yml/badge.svg)](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/coverage.yml) [![Docker Security Scan](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/docker-security-scan.yml/badge.svg)](https://github.com/torrust/torrust-tracker-deployer/actions/workflows/docker-security-scan.yml) # Torrust Tracker Deployer diff --git a/docs/decisions/README.md b/docs/decisions/README.md index 9f224108..64ad8996 100644 --- a/docs/decisions/README.md +++ b/docs/decisions/README.md @@ -6,6 +6,7 @@ This directory contains architectural decision records for the Torrust Tracker D | Status | Date | Decision | Summary | | ------------- | ---------- | --------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------ | +| ✅ Accepted | 2025-12-23 | [Docker Security Scan Exit Code Zero](./docker-security-scan-exit-code-zero.md) | Use exit-code 0 for security scanning - Trivy detects, GitHub Security decides, CI green | | ✅ Accepted | 2025-12-20 | [Grafana Integration Pattern](./grafana-integration-pattern.md) | Enable Grafana by default with hard Prometheus dependency and environment variable config | | ✅ Accepted | 2025-12-17 | [Secrecy Crate for Sensitive Data Handling](./secrecy-crate-for-sensitive-data.md) | Use secrecy crate for type-safe secret handling with memory zeroing | | ✅ Accepted | 2025-12-14 | [Database Configuration Structure in Templates](./database-configuration-structure-in-templates.md) | Expose structured database fields in templates rather than pre-resolved connection strings | diff --git 
a/docs/decisions/docker-security-scan-exit-code-zero.md b/docs/decisions/docker-security-scan-exit-code-zero.md new file mode 100644 index 00000000..4fa3c6c0 --- /dev/null +++ b/docs/decisions/docker-security-scan-exit-code-zero.md @@ -0,0 +1,138 @@ +# Decision: Exit Code Zero for Docker Security Scanning + +## Status + +Accepted + +## Date + +2025-12-23 + +## Context + +When implementing automated Docker vulnerability scanning with Trivy in GitHub Actions, we faced a critical decision about how the CI/CD pipeline should respond to discovered vulnerabilities. + +Traditional approaches make CI fail when vulnerabilities are found, blocking all development until issues are resolved. However, this creates several problems: + +1. **False Positives**: Security scanners can report issues that don't apply to our context or are accepted risks +2. **Third-Party Dependencies**: We cannot immediately fix vulnerabilities in upstream images (mysql, prometheus, grafana) +3. **Scanner Quirks**: Trivy occasionally exits with code 1 even when no vulnerabilities are found +4. **Development Flow**: Security findings should not block unrelated development work +5. **Policy Enforcement**: Security decisions should be made by security teams, not automated tooling +6. **Partial Data Loss**: If CI fails early, later scans never run and we lose visibility into other images + +The initial implementation used `exit-code: "1"` which caused the workflow to fail on any HIGH or CRITICAL vulnerability, including when scanning third-party production images with known CVEs that we cannot immediately fix. + +## Decision + +Implement a **security-first philosophy** where: + +1. **Exit Code Zero Everywhere**: All Trivy scan steps use `exit-code: "0"` - the scanner never fails the CI pipeline +2. **Dual Output Strategy**: + - Human-readable table format in workflow logs for immediate visibility + - SARIF format uploaded to GitHub Security tab for tracking and alerting +3. 
**Separation of Concerns**: + - Trivy's role: **Detect** vulnerabilities and provide data + - GitHub Security's role: **Decide** enforcement policies and alert routing + - CI's role: **Stay green** and maintain development velocity +4. **Always Run Policy**: Upload job uses `if: always()` to ensure partial results are never lost +5. **Unique Categories**: Each image gets a unique SARIF category for proper alert tracking and deduplication +6. **Scheduled Scanning**: Daily cron ensures continuous monitoring without blocking code changes + +This philosophy is summarized as: **"Trivy detects, GitHub Security decides, CI stays green"** + +## Consequences + +### Positive + +- **No False Failures**: Development work is never blocked by scanner quirks or edge cases +- **Continuous Visibility**: All scans complete even if one fails, providing a complete security picture +- **Flexible Enforcement**: Security team can configure GitHub Security policies without changing code +- **Third-Party Tolerance**: Known vulnerabilities in upstream images don't block development +- **Developer Experience**: Green builds maintain team velocity while security team reviews findings +- **Policy Separation**: Security enforcement decoupled from CI/CD implementation +- **Audit Trail**: All findings recorded in GitHub Security tab for compliance and tracking +- **Incremental Improvement**: Can address vulnerabilities based on priority without CI pressure + +### Negative + +- **Potential Complacency**: Green CI might lead to ignoring security findings (mitigated by GitHub Security alerts) +- **Requires Monitoring**: Security team must actively monitor GitHub Security tab +- **Policy Configuration**: Requires additional GitHub Security policy setup for enforcement +- **Learning Curve**: Non-traditional approach may confuse developers expecting red builds for vulnerabilities + +### Risks Introduced + +- **Missed Critical Issues**: If GitHub Security is not properly configured or monitored, critical 
vulnerabilities might go unaddressed + - **Mitigation**: Daily scheduled scans ensure consistent monitoring; GitHub Security sends email notifications +- **Organizational Resistance**: Some organizations mandate CI failure on security issues + - **Mitigation**: GitHub Security can be configured to block PRs or deployments if needed + +## Alternatives Considered + +### 1. Exit Code 1 (Fail on Vulnerabilities) + +**Approach**: Use `exit-code: "1"` to fail CI when HIGH/CRITICAL vulnerabilities are found. + +**Rejected Because**: + +- Blocks development on third-party image vulnerabilities we cannot fix immediately +- Scanner quirks cause false CI failures even with zero vulnerabilities +- No flexibility for the security team to make risk-based decisions +- Partial data loss when early scans fail + +### 2. Mixed Exit Codes (Project vs Third-Party) + +**Approach**: Use `exit-code: "1"` for project images but `exit-code: "0"` for third-party images. + +**Rejected Because**: + +- Inconsistent philosophy creates confusion +- Project images can have legitimate accepted risks +- Still susceptible to scanner quirks on project images +- Doesn't solve the fundamental policy enforcement problem + +### 3. Continue-on-Error Pattern + +**Approach**: Use `exit-code: "1"` but add `continue-on-error: true` to allow the workflow to proceed. + +**Rejected Because**: + +- Shows misleading "failed" status even though workflow continues +- Scanner errors appear as failures in UI, creating noise +- Doesn't fundamentally change the enforcement model +- Confusing to developers seeing "failed" steps that don't actually fail + +### 4. CodeQL Action with Single Category + +**Approach**: Upload all SARIF files using github/codeql-action/upload-sarif with the same category. 
+ +**Rejected Because**: + +- CodeQL Action rejects multiple SARIF uploads with identical categories (as of July 2025) +- Results in "multiple SARIF runs with same category" error +- Cannot distinguish alerts between different images + +## Viewing Security Results + +Security scan results are uploaded to GitHub's Security tab, but the default view filters by `is:open branch:main`. This means: + +- **Pull Request Results**: Must use filter `pr:NUMBER is:open` (e.g., `/security/code-scanning?query=pr:256+is:open`) +- **Branch Results**: Must use filter `is:open branch:BRANCH-NAME` for non-main branches +- **Main Branch Results**: Visible in default view after merging to main + +Results uploaded from PR branches are not visible in the default Security tab view because the default filter excludes them. This is GitHub's standard behavior for code scanning across all analysis tools. + +## Related Decisions + +- [GitHub Actions Workflow Structure](https://github.com/torrust/torrust-tracker-deployer/pull/256) - How the three-job structure enables this philosophy +- Future: Security Policy Configuration (to be documented when GitHub Security policies are configured) + +## References + +- [Issue #251: Implement basic Trivy scanning workflow](https://github.com/torrust/torrust-tracker-deployer/issues/251) +- [Pull Request #256: Implement Basic Trivy Scanning Workflow](https://github.com/torrust/torrust-tracker-deployer/pull/256) +- [Trivy Action Documentation](https://github.com/aquasecurity/trivy-action) +- [GitHub Code Scanning Documentation](https://docs.github.com/en/code-security/code-scanning) +- [GitHub Security Policy Enforcement](https://docs.github.com/en/code-security/code-scanning/managing-code-scanning-alerts) +- [Security-First Philosophy Discussion](https://github.com/torrust/torrust-tracker-deployer/pull/256#discussion) - External review recommending exit-code 0 approach diff --git a/project-words.txt b/project-words.txt index 8bad0a88..de5685eb 100644 --- 
a/project-words.txt +++ b/project-words.txt @@ -1,67 +1,20 @@ AAAAB AAAAC AAAAI -AGENTS -Alertmanager -aquasecurity -Ashburn -Avalonia -CIFS -Cockburn -Crossplane -Dockerfiles -EAAAADAQABAAABAQC -EPEL -Falkenstein -Gossman -Grafana -Grafonnet -Graça -Herberto -Hillsboro -Hostnames -Liskov -MAAACBA -MVVM -Mermaid -NOPASSWD -OAAAAN -Osherove -Preinstalling -Pulumi -RAII -RUSTDOCFLAGS -Repomix -Rustdoc -SCRIPTDIR -Scriptability -Silverlight -Subissue -Swatinem -Taplo -Tera -Testcontain -Testcontainers -Testinfra -Torrust -Traefik -VARCHAR -Zeroize addgroup adduser -BBDBE -bencoded -completei -downloadedi -filesd -incompletei -intervali -peerslee +AGENTS +Alertmanager appender appendonly +aquasecurity architecting +Ashburn autorestart +Avalonia backlinks +BBDBE +bencoded bootcmd browsable buildx @@ -70,17 +23,21 @@ checkmarks childlogdir chkdsk chrono +CIFS clig clippy clonable cloneable cloudinit +Cockburn +completei concepsts configurator connrefused containerd cpus creds +Crossplane custompass customuser dearmor @@ -92,40 +49,57 @@ devpass distro distroless distutils +Dockerfiles doctest doctests downcasted downcasting +downloadedi dpkg drwxr dtolnay +EAAAADAQABAAABAQC ehthumbs elif +Émojis endfor endraw entr epel +EPEL eprint eprintln equalto executability exfiltration exitcode +Falkenstein +filesd flatlined frontends fswc getent getopt +Gossman +Graça +Grafana +Grafonnet handleable hashset healthcheck +Herberto hetznercloud hexdigit hexdump +HIDS +Hillsboro +Hostnames hotfixes htdocs hugepages impls +incompletei +intervali isreg journalctl jsonlint @@ -141,6 +115,7 @@ leechers libc lifecycles lineinfile +Liskov listenfd listhost logfile @@ -149,7 +124,9 @@ logicaldisk loglevel lspconfig lxdbr +MAAACBA maxbytes +Mermaid mgmt millis mkdir @@ -164,6 +141,7 @@ mprotect mpsc mtorrust multiprocess +MVVM myapp myenv mysqladmin @@ -181,29 +159,38 @@ nocapture noconfirm nodaemon noninteractive +NOPASSWD nslookup nullglob +OAAAAN oneline +Osherove +OSSEC pacman 
parameterizing parseable passwordless pathbuf +peerslee pidfile pids pipefail pkill postconditions preconfigured +Preinstalling preinstalls prereq println promtool publickey +Pulumi pytest +RAII readlink realpath reentrancy +Repomix reprioritize reprovision reprovisioning @@ -217,20 +204,25 @@ runbooks runcmd runnability rustc +Rustdoc +RUSTDOCFLAGS rustflags rustls rustup sarif SARIF schemars +Scriptability +SCRIPTDIR secureboot selectattr serde serverurl shellcheck +Silverlight smorimoto -spki spëcial +spki sqlx sshpass startretries @@ -238,21 +230,29 @@ stringly subcontroller subcontrollers subhandlers +Subissue subissues subshell substates supervisorctl supervisord swappability +Swatinem sysfs sysv taiki +Taplo taskkill tasklist +Tera terraformrc +tést +Testcontain testcontainer testcontainers +Testcontainers testhost +Testinfra testkey testpass testuser @@ -268,10 +268,11 @@ tmpfiles tmpfs tmptu torrust +Torrust +Traefik tulnp tulpn turbofish -tést ulpn undertested unergonomic @@ -284,17 +285,17 @@ userpass userspace usize utmp +VARCHAR vbqajnc viewmodel +vulns +Wazuh webservers writeln wrongpassword youruser zeroize -HIDS -OSSEC -Wazuh -Émojis +Zeroize значение ключ конфиг