Skip to content

Commit f0aa9cb

Browse files
authored
Merge pull request ckan#342 from GSA/ckan-master
fix apache waf extract; add test
2 parents 1c52994 + 608c0a9 commit f0aa9cb

File tree

10 files changed

+112
-2
lines changed

10 files changed

+112
-2
lines changed

.github/workflows/test.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ jobs:
3838
runs-on: ubuntu-latest
3939
container:
4040
image: ${{ matrix.ckan-image }}
41+
options: --user root
4142
services:
4243
solr:
4344
image: ckan/ckan-solr:${{ matrix.solr-image }}
@@ -63,7 +64,8 @@ jobs:
6364

6465
- name: Install dependencies (common)
6566
run: |
66-
DEBIAN_FRONTEND=noninteractive apt-get --assume-yes --quiet install \
67+
DEBIAN_FRONTEND=noninteractive apt-get update && \
68+
apt-get --assume-yes --quiet install \
6769
python3-dev \
6870
libxml2-dev \
6971
libxslt1-dev \
@@ -72,7 +74,7 @@ jobs:
7274
- name: Install dependencies from requirements.txt
7375
run: |
7476
pip install -r requirements.txt
75-
pip install pytest-ckan
77+
pip install -r dev-requirements.txt
7678
7779
- name: Install harvester
7880
run: |

ckanext/spatial/harvesters/waf.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,8 @@ def _extract_waf(content, base_url, scraper, results = None, depth=0):
312312
if 'mailto:' in url:
313313
continue
314314
if '..' not in url and url[-1] == '/':
315+
if scraper == 'apache' and url[0] == '/':
316+
continue
315317
new_depth = depth + 1
316318
if depth > 10:
317319
log.info('Max WAF depth reached')
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
2+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
3+
<html>
4+
<head>
5+
<title>Index of /apache-folder</title>
6+
</head>
7+
<body>
8+
<h1>Index of /apache-folder</h1>
9+
<pre> <a href="?C=N;O=D">Name</a> <a href="?C=M;O=A">Last modified</a> <a href="?C=S;O=A">Size</a> <a href="?C=D;O=A">Description</a><hr> <a href="/">Parent Directory</a> -
10+
<a href="record-1.xml">record-1.xml</a> 2024-11-07 15:00 356K
11+
<a href="subfolder/">subfolder/</a> 2024-11-12 15:00 -
12+
<hr></pre>
13+
</body></html>
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
2+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
3+
<html>
4+
<head>
5+
<title>Index of /apache-folder/subfolder</title>
6+
</head>
7+
<body>
8+
<h1>Index of /apache-folder/subfolder</h1>
9+
<pre> <a href="?C=N;O=D">Name</a> <a href="?C=M;O=A">Last modified</a> <a href="?C=S;O=A">Size</a> <a href="?C=D;O=A">Description</a><hr> <a href="/folder/">Parent Directory</a> -
10+
<a href="record-2.xml">record-2.xml</a> 2024-11-07 16:59 182K
11+
<hr></pre>
12+
</body></html>
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<html><head><title>iis.server - /iis-folder/</title></head><body><H1>iis.server - /iis-folder/</H1><hr>
2+
3+
<pre><A HREF="/">[To Parent Directory]</A><br><br> 11/7/2024 7:20 AM &lt;dir&gt; <A HREF="/iis-folder/subfolder/">subfolder</A><br> 11/7/2024 3:00 PM 168 <A HREF="/iis-folder/record-1.xml">record-1.xml</A><br></pre><hr></body></html>
4+
5+
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
<html><head><title>iis.server - /iis-folder/subfolder/</title></head><body><H1>iis.server - /iis-folder/subfolder/</H1><hr>
2+
3+
<pre><A HREF="/iis-folder/">[To Parent Directory]</A><br><br> 11/7/2024 4:59 PM 8958 <A HREF="/iis-folder/subfolder/record-2.xml">record-2.xml</A><br></pre><hr></body></html>
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
2+
<html>
3+
<head><title>Index of /nginx/</title></head>
4+
<body bgcolor="white">
5+
<h1>Index of /nginx/</h1><hr><pre><a href="../">../</a>
6+
<a href="subfolder/">subfolder/</a> 07-Nov-2024 15:00 -
7+
<a href="record-1.xml">record-1.xml</a> 07-Nov-2024 15:00 364868
8+
</pre><hr></body>
9+
</html>
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
2+
<html>
3+
<head><title>Index of /nginx/subfolder/</title></head>
4+
<body bgcolor="white">
5+
<h1>Index of /nginx/subfolder/</h1><hr><pre><a href="../">../</a>
6+
<a href="record-2.xml">record-2.xml</a> 07-Nov-2024 16:59 186150
7+
</pre><hr></body>
8+
</html>
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import os
2+
3+
from ckanext.spatial.harvesters.waf import _extract_waf
4+
5+
TEST_DIR = os.path.dirname(os.path.abspath(__file__))
6+
HTML_DIR = os.path.join(TEST_DIR, "html_files")
7+
8+
def test_extract_iis(httpserver):
9+
10+
# load the static HTML content that will be fed to the HTTP responses
11+
with \
12+
open(f"{HTML_DIR}/iis-folder.html", "r") as iis_folder, \
13+
open(f"{HTML_DIR}/nginx-folder.html", "r") as nginx_folder, \
14+
open(f"{HTML_DIR}/apache-folder.html", "r") as apache_folder, \
15+
open(f"{HTML_DIR}/iis-subfolder.html", "r") as iis_subfolder, \
16+
open(f"{HTML_DIR}/nginx-subfolder.html", "r") as nginx_subfolder, \
17+
open(f"{HTML_DIR}/apache-subfolder.html", "r") as apache_subfolder:
18+
iis_folder_content = iis_folder.read()
19+
nginx_folder_content = nginx_folder.read()
20+
apache_folder_content = apache_folder.read()
21+
iis_subfolder_content = iis_subfolder.read()
22+
nginx_subfolder_content = nginx_subfolder.read()
23+
apache_subfolder_content = apache_subfolder.read()
24+
25+
# feed static content when it traverses the subfolder
26+
httpserver.expect_request("/iis-folder/subfolder/").respond_with_data(iis_subfolder_content)
27+
httpserver.expect_request("/nginx-folder/subfolder/").respond_with_data(nginx_subfolder_content)
28+
httpserver.expect_request("/apache-folder/subfolder/").respond_with_data(apache_subfolder_content)
29+
30+
# let it scrape, traverse and extract the content
31+
iis_results = _extract_waf(
32+
iis_folder_content,
33+
httpserver.url_for("/iis-folder/"),
34+
"iis"
35+
)
36+
37+
nginx_results = _extract_waf(
38+
nginx_folder_content,
39+
httpserver.url_for("/nginx-folder/"),
40+
"nginx"
41+
)
42+
43+
apache_results = _extract_waf(
44+
apache_folder_content,
45+
httpserver.url_for("/apache-folder/"),
46+
"apache"
47+
)
48+
49+
records_expected = [('record-1.xml', '2024-11-07 15:00:00'), ('record-2.xml', '2024-11-07 16:59:00')]
50+
51+
assert records_expected == sorted([(os.path.basename(r[0]), r[1]) for r in iis_results])
52+
assert records_expected == sorted([(os.path.basename(r[0]), r[1]) for r in nginx_results])
53+
assert records_expected == sorted([(os.path.basename(r[0]), r[1]) for r in apache_results])

dev-requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
pytest-ckan
2+
pytest-httpserver == 1.0.2; python_version < '3.10'
3+
pytest-httpserver; python_version >= '3.10'

0 commit comments

Comments
 (0)