Skip to content

Commit c6be704

Browse files
authored
New tool for detecting broken XML files (#211)
1 parent 643f7dd commit c6be704

File tree

1 file changed

+154
-0
lines changed

1 file changed

+154
-0
lines changed

scripts/broken.php

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
<?php /*
2+
+----------------------------------------------------------------------+
3+
| Copyright (c) 1997-2025 The PHP Group |
4+
+----------------------------------------------------------------------+
5+
| This source file is subject to version 3.01 of the PHP license, |
6+
| that is bundled with this package in the file LICENSE, and is |
7+
| available through the world-wide-web at the following url: |
8+
| https://www.php.net/license/3_01.txt. |
9+
| If you did not receive a copy of the PHP license and are unable to |
10+
| obtain it through the world-wide-web, please send a note to |
11+
| license@php.net, so we can mail you a copy immediately. |
12+
+----------------------------------------------------------------------+
13+
| Authors: André L F S Bacci <ae php.net> |
14+
+----------------------------------------------------------------------+
15+
16+
# Description
17+
18+
This command line utility tests whether a file is a valid standalone XML file,
19+
accepting undefined entity references. If a directory is given,
20+
the test is applied in all .xml files in directory and sub directories.
21+
22+
This tool also cares for directories marked with .xmlfragmentdir, so
23+
these files are tested with relaxed semantics for XML fragments. */
24+
25+
// Show every error and warning: this is a diagnostic command line tool.
ini_set( 'display_errors' , 1 );
ini_set( 'display_startup_errors' , 1 );
error_reporting( E_ALL );

// At least one path argument is required besides the script name.
if ( count( $argv ) < 2 )
    print_usage_exit( $argv[0] );

// Drop the script name, leaving only the paths to test.
array_shift( $argv );

foreach( $argv as $arg )
{
    // Guard clause: report missing paths and move on to the next argument.
    if ( ! file_exists( $arg ) )
    {
        echo "Path does not exist: $arg\n";
        continue;
    }

    // Files are tested directly, directories are walked recursively.
    if ( is_file( $arg ) )
        testFile( $arg );
    if ( is_dir( $arg ) )
        testDir( $arg );
}
45+
46+
/**
 * Prints the usage message to STDERR and terminates the script.
 *
 * The script ends with a non-zero exit status, so shells and CI jobs
 * can tell a usage error apart from a successful run.
 *
 * @param string $cmd Script name, shown in the usage line.
 */
function print_usage_exit( $cmd )
{
    fwrite( STDERR , " Wrong paramater count. Usage:\n" );
    fwrite( STDERR , " {$cmd} path:\n" );
    exit( 1 ); // a bare "exit;" would report status 0 (success)
}
52+
53+
/**
 * Discovers the libxml error messages caused by undefined entity references.
 *
 * Two small probe documents that reference the undefined entity ZZZ are
 * parsed, and the first error message of each parse is captured. The
 * message for the entity inside an element is split around the entity
 * name, so callers can match the same message for any entity name.
 *
 * @param string $prefix Receives the message text before the entity name.
 * @param string $suffix Receives the message text after the entity name
 *                       (empty if the message does not contain the name).
 * @param string $extra  Receives the message emitted for an entity
 *                       reference placed outside the root element.
 *
 * @throws RuntimeException If libxml reports no error for a probe document.
 */
function setup( string & $prefix , string & $suffix , string & $extra )
{
    // Undefined entities generate TWO different error messages on libxml
    // - "Entity '?' not defined" (for entity inside elements)
    // - "Extra content at the end of the document" (entity outside elements)

    $inside = "<x>&ZZZ;</x>";
    $outside = "<x/>&ZZZ;";

    $doc = new DOMDocument();
    $doc->recover = true;
    $doc->resolveExternals = false;
    $doc->substituteEntities = false;
    libxml_use_internal_errors( true );

    // Parses $xml and returns the first error message libxml emitted for it.
    $probe = static function( string $xml ) use ( $doc ) : string
    {
        // Drop errors left over from earlier parses, so that index 0
        // really belongs to this probe document.
        libxml_clear_errors();
        $doc->loadXML( $xml );
        $errors = libxml_get_errors();
        libxml_clear_errors();

        // Without a captured message the callers would match (and skip)
        // every error, so fail loudly instead of continuing.
        if ( count( $errors ) == 0 )
            throw new RuntimeException( "libxml reported no error for probe document: $xml" );

        return trim( $errors[0]->message );
    };

    $message = str_replace( "ZZZ" , "\f" , $probe( $inside ) );
    // Pad to two parts so $suffix is always a string, even if the
    // message does not contain the entity name.
    [ $prefix , $suffix ] = array_pad( explode( "\f" , $message ) , 2 , "" );

    $extra = $probe( $outside );
}
78+
79+
/**
 * Tests one file for XML well-formedness and prints a report for the
 * first error that is not an undefined entity reference.
 *
 * Errors matching the "entity not defined" message discovered by setup()
 * are ignored. Only the first remaining error is reported; the function
 * returns right after printing it.
 *
 * @param string $filename Path of the file to test.
 * @param bool   $fragment When true, the contents are wrapped in a
 *                         synthetic <f> root element before parsing, so
 *                         XML fragments are accepted. Note that the
 *                         wrapper adds three characters to the first line,
 *                         so columns reported on line 1 are offset.
 */
function testFile( string $filename , bool $fragment = false )
{
    // Messages discovered once by setup() and reused across calls.
    static $prefix = "", $suffix = "", $extra = "";

    $contents = file_get_contents( $filename );
    if ( $contents === false )
    {
        // Unreadable file: report it instead of parsing a bogus value.
        echo "Broken XML file:\n";
        echo " Path: $filename\n";
        echo " Error: Unable to read file\n";
        echo "\n";
        return;
    }
    if ( $contents === "" && ! $fragment )
    {
        // DOMDocument::loadXML() rejects an empty string outright (a
        // ValueError on PHP 8), which would abort the whole run. An empty
        // file is not a valid standalone document, so report it as broken.
        echo "Broken XML file:\n";
        echo " Path: $filename [1,1]\n";
        echo " Error: Document is empty\n";
        echo "\n";
        return;
    }

    if ( $extra == "" )
        setup( $prefix , $suffix , $extra );

    $doc = new DOMDocument();
    $doc->recover = true;
    $doc->resolveExternals = false;
    $doc->substituteEntities = false;
    libxml_use_internal_errors( true );

    if ( $fragment )
        $contents = "<f>{$contents}</f>";
    $doc->loadXML( $contents );

    $errors = libxml_get_errors();
    libxml_clear_errors();

    foreach( $errors as $error )
    {
        $message = trim( $error->message );

        // Undefined entity inside an element: expected, not a breakage.
        if ( str_starts_with( $message , $prefix ) && str_ends_with( $message , $suffix ) )
            continue;

        // Messages equal to $extra are deliberately NOT skipped: they
        // indicate that some entity reference is used at an unusual
        // position. They are reported, with an additional hint.
        $hintFragDir = ( $message == $extra );

        $lin = $error->line;
        $col = $error->column;
        echo "Broken XML file:\n";
        echo " Path: $filename [$lin,$col]\n";
        echo " Error: $message\n";
        if ( $hintFragDir )
            echo " Hint: Dir is marked with .xmlfragmentdir on doc-en? If not, check entity references.\n";
        echo "\n";
        return;
    }
}
122+
123+
/**
 * Recursively tests every .xml file below a directory.
 *
 * Files of a directory are tested before its sub directories. Entries
 * whose name starts with a dot are skipped. If the directory contains a
 * ".xmlfragmentdir" marker file, its .xml files (but not those of its
 * sub directories) are tested as XML fragments.
 *
 * @param string $dir Directory to scan.
 */
function testDir( string $dir )
{
    $base = realpath( $dir );
    $files = ( $base === false ) ? false : scandir( $base );
    if ( $files === false )
    {
        // Unresolvable or unreadable directory: report and keep going,
        // instead of passing false on to scandir()/foreach.
        echo "Unable to read directory: $dir\n";
        return;
    }

    // Decide the fragment mode BEFORE testing any file. scandir() returns
    // sorted names, and names starting with a character that sorts before
    // "." (such as "-" or "+") would otherwise be tested before the marker
    // is seen.
    $fragment = in_array( ".xmlfragmentdir" , $files , true );
    $subdirs = [];

    foreach( $files as $file )
    {
        // Skips ".", "..", the marker itself and any other hidden entry.
        if ( $file[0] == "." )
            continue;

        $fullpath = realpath( "$base/$file" );
        if ( $fullpath === false ) // e.g. a dangling symbolic link
            continue;

        if ( is_dir ( $fullpath ) )
        {
            $subdirs[] = $fullpath;
            continue;
        }

        if ( str_ends_with( $fullpath , ".xml" ) )
            testFile( $fullpath , $fragment );
    }

    foreach( $subdirs as $subdir )
        testDir( $subdir );
}

0 commit comments

Comments
 (0)