Skip to content

Commit d46ff58

Browse files
committed
detect broken links using wget and sitemap
1 parent 4a2da1e commit d46ff58

File tree

8 files changed

+101
-10
lines changed

8 files changed

+101
-10
lines changed
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
name: check-broken-links action
2+
description: check broken links application
3+
4+
inputs:
5+
aks_environment:
6+
description: Environment
7+
required: true
8+
event_name:
9+
description: type of event that triggered the test
10+
required: true
11+
12+
runs:
13+
using: composite
14+
steps:
15+
- name: Checkout code
16+
uses: actions/checkout@v6
17+
18+
- name: Prepare application environment
19+
uses: ./.github/actions/prepare-app-env
20+
21+
- name: set environment (scheduled test)
22+
shell: bash
23+
if: ${{ inputs.event_name == 'schedule' }}
24+
run: echo "AKS_ENVIRONMENT=production" >> $GITHUB_ENV
25+
26+
- name: set environment
27+
shell: bash
28+
if: ${{ inputs.event_name != 'schedule' }}
29+
run: |
30+
echo "AKS_ENVIRONMENT=${{ inputs.aks_environment }}" >> $GITHUB_ENV
31+
32+
- name: Run deployment link check
33+
shell: bash
34+
run: ./check_broken_links.sh "https://teaching-vacancies-${{ inputs.aks_environment }}.test.teacherservices.cloud" "${{ inputs.http_basic_user }}" "${{ inputs.http_basic_password }}"
35+
36+
- name: print environment
37+
shell: bash
38+
run: echo ${{ env.AKS_ENVIRONMENT}}

.github/workflows/build_and_deploy.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,15 @@ jobs:
317317
if: steps.smoke-test.conclusion != 'success'
318318
run: exit 1
319319

320+
- name: Trigger Broken Link action
321+
id: check-broken-links
322+
uses: ./.github/actions/check-broken-links/
323+
with:
324+
aks_environment: ${{ env.ENVIRONMENT }}
325+
event_name: ${{ github.event_name }}
326+
http_basic_user: ${{ secrets.HTTP_BASIC_USER }}
327+
http_basic_password: ${{ secrets.HTTP_BASIC_PASSWORD }}
328+
320329
- name: Post sticky pull request comment
321330
if: github.event_name == 'pull_request'
322331
uses: marocchino/sticky-pull-request-comment@v2

.simplecov

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ if ENV.fetch("COVERAGE", 0).to_i.positive?
3232

3333
# Filters out files from coverage reports
3434
add_filter "app/services/custom_log_formatter.rb"
35-
add_filter "app/controllers/robots_controller.rb"
3635
add_filter "app/controllers/previews_controller.rb"
3736
add_filter "app/controllers/sha_controller.rb"
3837
add_filter "app/jobs/set_organisation_slugs_job.rb"

app/controllers/sitemap_controller.rb

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
class SitemapController < ApplicationController
2-
def show # rubocop:disable Metrics/AbcSize
2+
STATIC_PAGES = %w[terms-and-conditions savings-methodology accessibility vision-statement].freeze
3+
POST_SECTION_NAMES = %w[get-help-hiring jobseeker-guides].freeze
4+
5+
def show # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
36
map = XmlSitemap::Map.new(service_domain, secure: !Rails.env.development?) do |m|
47
# Live vacancies
58
PublishedVacancy.live.applicable.find_each do |vacancy|
@@ -16,10 +19,29 @@ def show # rubocop:disable Metrics/AbcSize
1619
m.add location_landing_page_path(location.parameterize), period: "hourly"
1720
end
1821

19-
# Static pages
20-
m.add page_path("terms-and-conditions"), period: "weekly"
21-
m.add page_path("cookies"), period: "weekly"
22-
m.add page_path("accessibility"), period: "weekly"
22+
STATIC_PAGES.each { |static_page| m.add page_path(static_page), period: "weekly" }
23+
24+
POST_SECTION_NAMES.each do |section|
25+
m.add posts_path(section), period: "weekly"
26+
27+
MarkdownDocument.all_subcategories(section).each do |sub_category|
28+
m.add subcategory_path(section, sub_category.post_name), period: "weekly"
29+
posts = MarkdownDocument.all(section, sub_category.post_name)
30+
posts.each do |post|
31+
m.add post_path(section, sub_category.post_name, post.post_name), period: "weekly"
32+
end
33+
end
34+
end
35+
# POST_SECTIONS.each do |section, subcats|
36+
# m.add posts_path(section), period: "weekly"
37+
# subcats.each do |subcat, post|
38+
# m.add subcategory_path(section, subcat), period: "weekly"
39+
#
40+
# post.each do |post|
41+
# m.add post_path(section, subcat, post), period: "weekly"
42+
# end
43+
# end
44+
# end
2345
end
2446

2547
expires_in 3.hours

app/views/robots/show.text.erb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
<%- if Rails.configuration.app_role.production? %>
2-
SITEMAP: https://teaching-vacancies.service.gov.uk/sitemap.xml
1+
<%- if Rails.configuration.app_role.production? || Rails.configuration.app_role.review? %>
2+
SITEMAP: /sitemap.xml
33
User-agent: *
44
Allow: /
55
Disallow: /check

check_broken_links.sh

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#!/bin/bash -e
2+
3+
# parameters: 1 - base URL, 2 - user, 3 - password
4+
5+
# grab the sitemap and roughly parse it into a list of pages to be passed to wget in spider mode
6+
# Try not to spider school sites or big slow govuk ones.
7+
8+
# get-information-schools.service.gov.uk refuses to be spidered so we
9+
# can't check broken links to their domain
10+
# signin.education.gov.uk and friends don't respond to robots.txt
11+
#
12+
wget --auth-no-challenge -q --user=$2 --password=$3 $1/sitemap.xml -O - \
13+
| fgrep loc \
14+
| sed s'/ <loc>//' \
15+
| sed s'/<\/loc>//' \
16+
| wget -nv -np -w 0.1 --spider -H -r -l1 -i - --user=$2 --password=$3 -P /tmp/spider \
17+
--auth-no-challenge \
18+
--no-relative \
19+
--domains="gov.uk" \
20+
--exclude-domains="signin.education.gov.uk,get-information-schools.service.gov.uk,ofsted.gov.uk,nationalarchives.gov.uk"

config/routes.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
get "/get-help-hiring/accepting-job-applications-on-teaching-vacancies", to: redirect { |_params, _request|
3434
Rails.application.routes.url_helpers.post_path(section: "get-help-hiring", subcategory: "how-to-create-job-listings-and-accept-applications", post_name: "accepting-job-applications-on-teaching-vacancies")
3535
}
36-
get "/get-help-hiring/communicating-with-jobskeers", to: redirect { |_params, _request|
36+
get "/get-help-hiring/communicating-with-jobseekers", to: redirect { |_params, _request|
3737
Rails.application.routes.url_helpers.post_path(section: "get-help-hiring", subcategory: "how-to-create-job-listings-and-accept-applications", post_name: "communicating-with-jobseekers")
3838
}
3939
get "/jobseeker-guides/write-a-great-teaching-job-application-in-five-steps", to: redirect { |_params, _request|

spec/requests/sitemap_spec.rb

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,13 @@
4040

4141
it "includes static pages in the sitemap" do
4242
expect(xml).to include(page_path("terms-and-conditions"))
43-
expect(xml).to include(page_path("cookies"))
4443
expect(xml).to include(page_path("accessibility"))
4544
end
4645

46+
it "includes posts in the sitemap" do
47+
expect(xml).to include(post_path("get-help-hiring", "how-to-create-job-listings-and-accept-applications", "creating-the-perfect-teacher-job-advert"))
48+
end
49+
4750
it "sets cache expiry" do
4851
expect(response.headers["Cache-Control"]).to include("max-age=10800")
4952
end

0 commit comments

Comments
 (0)