workback-accessibility-project/extract-issue.js at main · alexander-langolf/workback-accessibility-project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
// Extraction script to be executed in browser context.
//
// Reads the currently loaded page and evaluates to a JSON string with:
//   title    - trimmed text of the first <h1> ('' when there is none)
//   metadata - "Label: value" fields matched in document.body.innerText
//   sections - text collected after each <h2>, keyed by the heading text
//   images   - { name, url, alt } for every link whose href contains "/api/gcs"
(function() {
  // Extensions that are kept as-is on an image name; anything else gets ".png".
  const IMAGE_EXTENSION = /\.(png|jpg|jpeg|gif|svg|webp)$/i;

  // Returns the trimmed first capture group of `pattern` in `text`, or ''.
  const firstGroup = (text, pattern) => {
    const match = text.match(pattern);
    return match ? match[1].trim() : '';
  };

  // decodeURIComponent throws URIError on a malformed escape; one bad link
  // should not abort the whole extraction, so fall back to the raw value.
  const safeDecode = (value) => {
    try {
      return decodeURIComponent(value);
    } catch {
      return value;
    }
  };

  // Resolves an href to an absolute URL. http(s) URLs are returned unchanged;
  // everything else (root-relative, path-relative, protocol-relative) is
  // resolved the way the browser would resolve the link itself.
  const toAbsoluteUrl = (href) => {
    if (/^https?:\/\//i.test(href)) return href;
    try {
      return new URL(href, document.baseURI).href;
    } catch {
      return window.location.origin + href;
    }
  };

  // Renders one sibling element of a section as a markdown-ish text chunk.
  const renderElement = (el) => {
    const tag = el.tagName;
    if (tag === 'P') {
      return el.textContent.trim() + '\n\n';
    }
    if (tag === 'UL' || tag === 'OL') {
      const items = Array.from(el.querySelectorAll('li')).map(li => '- ' + li.textContent.trim());
      return items.join('\n') + '\n\n';
    }
    if (tag === 'PRE' || (tag === 'CODE' && el.parentElement?.tagName === 'PRE')) {
      const codeText = el.textContent || (el.parentElement?.textContent || '');
      return '\n```\n' + codeText + '\n```\n\n';
    }
    if (tag === 'CODE') {
      return '`' + el.textContent + '` ';
    }
    const text = el.textContent.trim();
    return text ? text + '\n\n' : '';
  };

  // Picks a file-system friendly name for an image link: link text first,
  // then the <img> alt, then the "evidence%2F<file>" part of the href, and
  // finally a positional "image-N.png" placeholder.
  const buildImageName = (link, img, index) => {
    let name = link.textContent.trim();
    if (!name && img) {
      name = img.getAttribute('alt') || '';
    }
    if (!name) {
      const fileMatch = (link.getAttribute('href') || '').match(/evidence%2F([^&]+)/);
      name = fileMatch ? safeDecode(fileMatch[1]) : `image-${index + 1}.png`;
    }
    // Keep a recognised image extension; otherwise append ".png" WITHOUT
    // cutting at the last dot, so descriptive names that merely contain a
    // period ("v1.2 header", "No label. Reader says button") stay intact.
    if (!IMAGE_EXTENSION.test(name)) {
      name += '.png';
    }
    return name.replace(/[^a-zA-Z0-9.-]/g, '_');
  };

  const data = {
    title: document.querySelector('h1')?.textContent?.trim() || '',
    metadata: {},
    sections: {},
    images: []
  };

  // Extract metadata from page text
  // NOTE(review): severity/status/stage capture a single \w+ token, so a
  // multi-word value would be cut at the first space - confirm against the
  // real page before widening these patterns.
  const pageText = document.body.innerText;
  data.metadata = {
    wcag: firstGroup(pageText, /WCAG:\s*([\d.]+)/),
    severity: firstGroup(pageText, /Severity:\s*(\w+)/),
    status: firstGroup(pageText, /Status:\s*(\w+)/),
    stage: firstGroup(pageText, /Stage:\s*(\w+)/),
    flow: firstGroup(pageText, /Flow:\s*([^\n]+)/),
    pageUrl: firstGroup(pageText, /Page URL:\s*([^\n]+)/),
    auditDate: firstGroup(pageText, /Audit Date:\s*([^\n]+)/)
  };

  // Extract sections: each <h2> owns its following siblings up to the next
  // <h2>/<h1>. Headings that yield no text are omitted from the result.
  for (const h2 of document.querySelectorAll('h2')) {
    const sectionName = h2.textContent.trim();
    if (!sectionName || sectionName === 'Activity') continue; // Skip Activity section

    // For Audit Evidence, just note that images are included (handled separately)
    if (sectionName === 'Audit Evidence') {
      data.sections[sectionName] = 'See images section below.';
      continue;
    }

    let content = '';
    for (
      let el = h2.nextElementSibling;
      el && el.tagName !== 'H2' && el.tagName !== 'H1';
      el = el.nextElementSibling
    ) {
      content += renderElement(el);
    }

    const trimmed = content.trim();
    if (trimmed) {
      data.sections[sectionName] = trimmed;
    }
  }

  // Extract images
  const imageLinks = Array.from(document.querySelectorAll('a[href*="/api/gcs"]'));
  imageLinks.forEach((link, index) => {
    const img = link.querySelector('img');
    data.images.push({
      name: buildImageName(link, img, index),
      url: toAbsoluteUrl(link.getAttribute('href')),
      alt: img ? (img.getAttribute('alt') || '') : ''
    });
  });

  return JSON.stringify(data);
})();