diff --git a/.github/workflows/actions_release.yml b/.github/workflows/actions_release.yml new file mode 100644 index 0000000..b2c7eb2 --- /dev/null +++ b/.github/workflows/actions_release.yml @@ -0,0 +1,22 @@ +name: Release GitHub Actions + +on: + workflow_dispatch: + inputs: + tag: + description: "Tag for the release" + required: true + +permissions: + contents: read + +jobs: + release: + permissions: + actions: read + id-token: write + contents: write + + uses: step-security/reusable-workflows/.github/workflows/actions_release.yaml@v1 + with: + tag: "${{ github.event.inputs.tag }}" \ No newline at end of file diff --git a/.github/workflows/auto_cherry_pick.yml b/.github/workflows/auto_cherry_pick.yml new file mode 100644 index 0000000..1e83c9c --- /dev/null +++ b/.github/workflows/auto_cherry_pick.yml @@ -0,0 +1,32 @@ +name: Auto Cherry-Pick from Upstream + +on: + workflow_dispatch: + inputs: + base_branch: + description: "Base branch to create the PR against" + required: true + default: "main" + mode: + description: "Run mode: cherry-pick or verify" + required: false + default: "cherry-pick" + + pull_request: + types: [opened, synchronize, labeled] + +permissions: + contents: write + pull-requests: write + packages: read + issues: write + +jobs: + cherry-pick: + if: github.event_name == 'workflow_dispatch' || contains(fromJson(toJson(github.event.pull_request.labels)).*.name, 'review-required') + uses: step-security/reusable-workflows/.github/workflows/auto_cherry_pick.yaml@v1 + with: + original-owner: "anishathalye" + repo-name: "proof-html" + base_branch: ${{ inputs.base_branch }} + mode: ${{ github.event_name == 'pull_request' && 'verify' || inputs.mode }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..4e8d864 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,16 @@ +name: CI +on: + push: + pull_request: + +jobs: + fmt: + name: Format + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + 
- uses: ruby/setup-ruby@v1 + with: + ruby-version: '3.2' + - run: gem install rufo + - run: rufo -c . diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 0000000..6e91af9 --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,55 @@ +name: Publish docker image + +on: + workflow_dispatch: + inputs: + release_tag: + description: 'Tag to release' + required: true + type: string + +permissions: + contents: read + packages: write + +jobs: + build: + runs-on: ubuntu-latest + if: startsWith(github.event.inputs.release_tag, 'v') + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@v2 + with: + egress-policy: audit + + - name: Checkout + uses: actions/checkout@v5 + - name: Validate tag format + run: | + TAG=${{ github.event.inputs.release_tag }} + if ! echo "$TAG" | grep -Eq '^v[0-9]+\.[0-9]+\.[0-9]+$'; then + echo "❌ Invalid tag format: $TAG" + exit 1 + fi + echo "✅ Valid semver tag: $TAG" + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up QEMU for ARM builds + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build and push Docker image + uses: docker/build-push-action@v6 + with: + context: . 
+ push: true + platforms: linux/amd64,linux/arm64 + tags: | + ghcr.io/${{ github.repository }}:${{ github.event.inputs.release_tag }} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..ee97c97 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,26 @@ +FROM alpine:3.22 AS base + +RUN apk --no-cache add openjdk21 + +FROM base AS build-vnu + +RUN apk add git python3 + +RUN git clone -n https://github.com/validator/validator.git \ + && cd validator \ + && git checkout 84a1b28ff4cc28b7e9a31784688dbee6366b3467 \ + && JAVA_HOME=/usr/lib/jvm/java-21-openjdk python checker.py update-shallow dldeps build jar + +FROM base + +RUN apk --no-cache add build-base linux-headers ruby-dev +RUN apk --no-cache add curl +RUN gem install html-proofer -v 5.0.10 + +RUN apk --no-cache add bash + +COPY --from=build-vnu /validator/build/dist/vnu.jar /bin/vnu.jar + +COPY entrypoint.sh proof-html.rb / + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6861658 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) Anish Athalye (me@anishathalye.com) +Copyright (c) 2025 StepSecurity + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md index 971bb88..9c9a6a7 100644 --- a/README.md +++ b/README.md @@ -1 +1,172 @@ -# proof-html \ No newline at end of file +# proof-html + +proof-html is a [GitHub Action](https://github.com/features/actions) to validate HTML and CSS using the [Nu HTML Validator](https://github.com/validator/validator) and check links, images, and more using [HTMLProofer](https://github.com/gjtorikian/html-proofer). + +## Usage + +```yaml +- uses: step-security/proof-html@v2 + with: + directory: ./site +``` + +See below for a [full example](#full-example). + +## Options + +| Name | Description | Default | +| --- | --- | --- | +| `directory` | The directory to scan | (required) | +| `check_html` | Validate HTML | true | +| `check_css` | Validate CSS | true | +| `validator_ignore` | Regex of HTML/CSS validator errors to ignore | (empty) | +| `check_external_hash` | Check whether external anchors exist | true | +| `check_favicon` | Check whether favicons are valid | true | +| `check_opengraph` | Check images and URLs in Open Graph metadata | true | +| `ignore_empty_alt` | Allow images with empty alt tags | false | +| `ignore_missing_alt` | Allow images with missing alt tags | false | +| `allow_missing_href` | Allow anchors with missing href tags | false | +| `enforce_https` | Require that links use HTTPS | true | +| `swap_urls` | JSON-encoded map of URL rewrite rules | (empty) | +| `disable_external` | Disables the external link checker | false | +| `ignore_url` | Newline-separated list of URLs to ignore | (empty) | +| `ignore_url_re` | Newline-separated list of URL regexes to ignore | (empty) | +| `connect_timeout` | HTTP connection timeout | 30 | +| `tokens` | JSON-encoded map of domains to 
authorization tokens | (empty) | +| `max_concurrency` | Maximum number of concurrent requests | 50 | +| `timeout` | HTTP request timeout | 120 | +| `retries` | Number of times to retry checking links | 3 | + +Most of the options correspond directly to [configuration options for +HTMLProofer](https://github.com/gjtorikian/html-proofer#configuration). + +**validator_ignore** + +`validator_ignore` is a _regex pattern_ of HTML/CSS validation errors to +ignore, corresponding to the [`--filterpattern` +option](https://github.com/validator/validator?tab=readme-ov-file#--filterpattern-regexp) +of the Nu validator. + +For example, you might see the following errors: + +``` +"file:/build/index.html":0.1-0.6: error: Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”. +"file:/build/index.html":1.9-1.15: error: Element “head” is missing a required instance of child element “title”. +"file:/build/style.css":2.8-2.8: error: CSS: “foo”: Property “foo” doesn't exist. +``` + +If you wanted to ignore the first error, and you wanted to ignore all +non-existent properties in CSS, you could set the `validator_ignore` argument +to: + +``` +Start tag seen without seeing a doctype first.*|CSS: “.*”: Property “.*” doesn't exist. +``` + +**tokens** + +`tokens` is a _JSON-encoded_ map of domains to authorization tokens. So it's +"doubly encoded": the workflow file is written in YAML and `tokens` is a string +(not a map!), a JSON encoding of the data. This option can be used to provide +bearer tokens to use in certain scenarios, which is useful for e.g. avoiding +rate limiting. Tokens are only sent to the specified websites. Note that +domains must not have a trailing slash. Here is an example of an encoding of +tokens: + +```yaml +tokens: | + {"https://github.com": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", + "https://twitter.com": "yyyyyyyyyyyyyyyyyyyyyyy"} +``` + +You can also see the full example below for how to pass on the `GITHUB_TOKEN` +supplied by the workflow runner. 
+ +**swap_urls** + +`swap_urls` is a _JSON-encoded_ map, mapping regexes to strings. This can be +useful to strip a base path for an internal domain. For example: + +```yaml +swap_urls: | + {"^https:\\/\\/example\\.com\\/": "/"} +``` + +You can also use capture groups and back-references here. For example, to +ignore checking hashes for GitHub URLs (like +`https://github.com/step-security/proof-html#options`), you can use: + +```yaml +swap_urls: | + {"^(https:\\/\\/github\\.com\\/.*)#.*$": "\\1"} +``` + +## Full Example + +This is the entire `.github/workflows/build.yml` file for a GitHub Pages / +[Jekyll](https://jekyllrb.com/docs/github-pages/) site. + +```yaml +name: CI +on: + push: + schedule: + - cron: '0 8 * * 6' +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + - uses: actions/setup-ruby@v1 + with: + ruby-version: 2.7.x + - uses: actions/cache@v4 + with: + path: vendor/bundle + key: ${{ runner.os }}-gems-${{ hashFiles('**/Gemfile.lock') }} + restore-keys: | + ${{ runner.os }}-gems- + - run: | + bundle config path vendor/bundle + bundle install --jobs 4 --retry 3 + - run: bundle exec jekyll build + - uses: step-security/proof-html@v2 + with: + directory: ./_site + enforce_https: false + tokens: | + {"https://github.com": "${{ secrets.GITHUB_TOKEN }}"} + ignore_url: | + http://www.example.com/ + https://en.wikipedia.org/wiki/Main_Page + ignore_url_re: | + ^https://twitter.com/ +``` + +## Running locally + +You can build the Docker container locally with `docker build . -t proof-html`. + +The GitHub Action is set up to pass arguments as strings through environment +variables, where an argument like `ignore_url` is passed as `INPUT_IGNORE_URL` +(capitalize and prepend `INPUT_`) to the Docker container, so you will need to +do this translation yourself if you're running the Docker container locally. 
+You can mount a local directory in the Docker container with the `-v` argument +and pass the directory name as the `INPUT_DIRECTORY` argument. For example, if +you compiled a site into the `build` directory, you can run: + +```bash +docker run --rm \ + -e INPUT_DIRECTORY=build \ + -v "${PWD}/build:/build" \ + proof-html:latest +``` + +You can pass additional arguments as additional environment variables, e.g. +`-e INPUT_ENFORCE_HTTPS=0` or +`-e INPUT_TOKENS='{"https://github.com": "your-token-here"}'`. + +## License + +Copyright (c) Anish Athalye. Copyright (c) StepSecurity. Released under the MIT License. See +[LICENSE](LICENSE) for details. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..77568b2 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,5 @@ +# Security Policy + +## Reporting a Vulnerability + +Please report security vulnerabilities to security@stepsecurity.io diff --git a/action.yml b/action.yml new file mode 100644 index 0000000..4944370 --- /dev/null +++ b/action.yml @@ -0,0 +1,73 @@ +name: Proof HTML +author: step-security +description: Validate your HTML and CSS and check for broken links. 
+ +inputs: + directory: + description: The directory to scan + required: true + check_html: + description: Validate HTML + required: false + check_css: + description: Validate CSS + required: false + validator_ignore: + description: Regex of HTML/CSS validator errors to ignore + required: false + check_external_hash: + description: Check whether external anchors exist + required: false + check_opengraph: + description: Check images and URLs in Open Graph metadata + required: false + check_favicon: + description: Check whether favicons are valid + required: false + ignore_empty_alt: + description: Allow images with empty alt tags + required: false + ignore_missing_alt: + description: Allow images with missing alt tags + required: false + allow_missing_href: + description: Allow anchors with missing href tags + required: false + enforce_https: + description: Require that links use HTTPS + required: false + tokens: + description: JSON-encoded map of domains to authorization tokens + required: false + swap_urls: + description: JSON-encoded map of URL rewrite rules + required: false + max_concurrency: + description: Maximum number of concurrent requests + required: false + connect_timeout: + description: HTTP connection timeout + required: false + timeout: + description: HTTP request timeout + required: false + disable_external: + description: Disables the external link checker + required: false + ignore_url: + description: Newline-separated list of URLs to ignore + required: false + ignore_url_re: + description: Newline-separated list of URL regexes to ignore + required: false + retries: + description: Number of times to retry checking links + required: false + +runs: + using: docker + image: "Dockerfile" + +branding: + icon: check-square + color: green diff --git a/entrypoint.sh b/entrypoint.sh new file mode 100755 index 0000000..6ce2592 --- /dev/null +++ b/entrypoint.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash + +# validate subscription status 
+API_URL="https://agent.api.stepsecurity.io/v1/github/$GITHUB_REPOSITORY/actions/subscription"
+
+# Probe with a 3 second timeout; no "|| true" here so that $? below is curl's own exit status
+RESPONSE=$(curl --max-time 3 -s -w "%{http_code}" "$API_URL" -o /dev/null)
+CURL_EXIT_CODE=$?
+
+# Decide based on curl exit code and HTTP status
+if [ "$CURL_EXIT_CODE" -ne 0 ]; then
+  echo "Timeout or API not reachable. Continuing to next step."
+elif [ "$RESPONSE" = "200" ]; then
+  :
+elif [ "$RESPONSE" = "403" ]; then
+  echo "Subscription is not valid. Reach out to support@stepsecurity.io"
+  exit 1
+else
+  echo "Timeout or API not reachable. Continuing to next step."
+fi
+
+failed=0
+
+check_html="${INPUT_CHECK_HTML:-true}"
+if [[ "$check_html" =~ ^t.*|^T.*|^y.*|^Y.*|^1.* ]]; then
+  check_css="${INPUT_CHECK_CSS:-true}"
+  check_css_arg=""
+  if [[ "$check_css" =~ ^t.*|^T.*|^y.*|^Y.*|^1.* ]]; then
+    check_css_arg="--also-check-css"
+  fi
+  if ! java -jar /bin/vnu.jar --errors-only --filterpattern "${INPUT_VALIDATOR_IGNORE}" --skip-non-html ${check_css_arg} "${INPUT_DIRECTORY}"; then
+    failed=1
+  fi
+fi
+
+tries="${INPUT_RETRIES:-3}"
+
+while [ "$tries" -ge 1 ]; do
+  tries=$((tries-1))
+  if RUBYOPT="-W0" ruby /proof-html.rb; then
+    break
+  fi
+  if [ "$tries" -ge 1 ]; then
+    sleep 5
+  fi
+  if [ "$tries" -eq 0 ]; then
+    failed=1
+  fi
+done
+
+exit $failed
diff --git a/proof-html.rb b/proof-html.rb
new file mode 100644
index 0000000..febfa5f
--- /dev/null
+++ b/proof-html.rb
@@ -0,0 +1,108 @@
+require "html-proofer"
+require "json"
+require "uri"
+
+# Browser-style User-Agent header sent with every request (see the :typhoeus options below).
+CHROME_FROZEN_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.0.0 Safari/537.36"
+
+# Reads the input `name` (environment variable INPUT_<name>) as a boolean.
+# Returns `fallback` when it is unset or empty. Values starting with "t" or
+# "y" (any case) and the exact string "1" are true; anything else is false.
+def get_bool(name, fallback)
+  s = ENV["INPUT_#{name}"]
+  return fallback if s.nil? || s.empty?
+  # \A anchors at the start of the string; ^ would also match after a newline.
+  s.match?(/\A[ty]/i) || s == "1"
+end
+
+# Reads the input `name` as a base-10 integer. Returns `fallback` when it is
+# unset or empty, and also (with a notice) when it is not a valid integer,
+# rather than letting String#to_i silently turn it into 0.
+def get_int(name, fallback)
+  s = ENV["INPUT_#{name}"]
+  return fallback if s.nil? || s.empty?
+  value = Integer(s, 10, exception: false)
+  return value if value
+  puts "Input #{name.downcase} is not an integer; using default #{fallback}"
+  fallback
+end
+
+# Reads the input `name` as a string; an unset input yields "".
+def get_str(name)
+  ENV.fetch("INPUT_#{name}", "")
+end
+
+# Reads the newline-separated input `name` as a list of entries. Surrounding
+# whitespace is stripped and blank lines are dropped: a blank entry would
+# otherwise become an empty regex in ignore_url_re, which matches every URL.
+def get_list(name)
+  get_str(name).lines.map(&:strip).reject(&:empty?)
+end
+
+# Parses the JSON-encoded input `name`; an unset or empty input yields {}.
+# Prints a short message and exits with status 1 on malformed JSON.
+def get_json(name)
+  s = get_str(name)
+  return {} if s.empty?
+  JSON.parse(s)
+rescue JSON::ParserError
+  # The parser's own message is not printed because it can echo the input,
+  # and the tokens input holds credentials.
+  puts "Input #{name.downcase} is not valid JSON"
+  exit 1
+end
+
+ignore_url_re = get_list("IGNORE_URL_RE").map { |s| Regexp.new(s) }
+ignore_url = get_list("IGNORE_URL").concat(ignore_url_re)
+tokens = get_json("TOKENS")
+# swap_urls keys are regex sources; compile them before handing them to HTMLProofer.
+swap_urls = get_json("SWAP_URLS").transform_keys { |k| Regexp.new(k) }
+
+checks = ["Links", "Scripts", "Images"]
+checks.push("Favicon") if get_bool("CHECK_FAVICON", true)
+checks.push("OpenGraph") if get_bool("CHECK_OPENGRAPH", true)
+
+options = {
+  :checks => checks,
+  :cache => { :timeframe => {
+    :internal => "1d",
+    :external => "1d",
+  } },
+  :check_external_hash => get_bool("CHECK_EXTERNAL_HASH", true),
+  :ignore_empty_alt => get_bool("IGNORE_EMPTY_ALT", false),
+  :ignore_missing_alt => get_bool("IGNORE_MISSING_ALT", false),
+  :allow_missing_href => get_bool("ALLOW_MISSING_HREF", false),
+  :enforce_https => get_bool("ENFORCE_HTTPS", true),
+  :hydra => {
+    :max_concurrency => get_int("MAX_CONCURRENCY", 50),
+  },
+  :typhoeus => {
+    :cookiefile => ".cookies",
+    :cookiejar => ".cookies",
+    :connecttimeout => get_int("CONNECT_TIMEOUT", 30),
+    :followlocation => true,
+    :headers => {
+      "User-Agent" => CHROME_FROZEN_UA,
+    },
+    :timeout => get_int("TIMEOUT", 120),
+  },
+  :disable_external => get_bool("DISABLE_EXTERNAL", false),
+  :ignore_urls => ignore_url,
+  :swap_urls => swap_urls,
+}
+
+begin
+  proofer = HTMLProofer.check_directory(get_str("DIRECTORY"), options)
+  # Attach a bearer token to requests whose "scheme://host" is a key of the
+  # tokens map, so a token is only ever sent to the site it was given for.
+  proofer.before_request do |request|
+    uri = URI.parse(request.url)
+    base = "#{uri.scheme}://#{uri.host}"
+    token = tokens[base]
+    request.options[:headers]["Authorization"] = "Bearer #{token}" unless token.nil?
+  end
+  proofer.run
+rescue => e
+  puts e.message
+  exit 1
+end