@@ -379,37 +379,32 @@ private void doArgsMode(String[] args) {
379379 private void doWebMode () {
380380 reset ();
381381 int numPagesPosted ;
382- try {
383- if (type != null ) {
384- throw new IllegalArgumentException (
385- "Specifying content-type with \" --mode=web\" is not supported" );
386- }
382+ if (type != null ) {
383+ throw new IllegalArgumentException (
384+ "Specifying content-type with \" --mode=web\" is not supported" );
385+ }
387386
388- // Set Extracting handler as default
389- solrUpdateUrl = appendUrlPath (solrUpdateUrl , "/extract" );
387+ // Set Extracting handler as default
388+ solrUpdateUrl = appendUrlPath (solrUpdateUrl , "/extract" );
390389
391- info ("Posting web pages to Solr url " + solrUpdateUrl );
392- auto = true ;
393- info (
394- "Entering auto mode. Indexing pages with content-types corresponding to file endings "
395- + fileTypes );
396- if (recursive > 0 ) {
397- if (recursive > MAX_WEB_DEPTH ) {
398- recursive = MAX_WEB_DEPTH ;
399- warn ("Too large recursion depth for web mode, limiting to " + MAX_WEB_DEPTH + "..." );
400- }
401- if (delay < DEFAULT_WEB_DELAY ) {
402- warn (
403- "Never crawl an external web site faster than every 10 seconds, your IP will probably be blocked" );
404- }
405- info ("Entering recursive mode, depth=" + recursive + ", delay=" + delay + "s" );
390+ info ("Posting web pages to Solr url " + solrUpdateUrl );
391+ auto = true ;
392+ info (
393+ "Entering auto mode. Indexing pages with content-types corresponding to file endings "
394+ + fileTypes );
395+ if (recursive > 0 ) {
396+ if (recursive > MAX_WEB_DEPTH ) {
397+ recursive = MAX_WEB_DEPTH ;
398+ warn ("Too large recursion depth for web mode, limiting to " + MAX_WEB_DEPTH + "..." );
406399 }
407- numPagesPosted = postWebPages ( args , 0 , out );
408- info ( numPagesPosted + " web pages indexed." );
409-
410- } catch ( URISyntaxException e ) {
411- warn ( "Wrong URL trying to append /extract to " + solrUpdateUrl );
400+ if ( delay < DEFAULT_WEB_DELAY ) {
401+ warn (
402+ "Never crawl an external web site faster than every 10 seconds, your IP will probably be blocked" );
403+ }
404+ info ( "Entering recursive mode, depth=" + recursive + ", delay=" + delay + "s" );
412405 }
406+ numPagesPosted = postWebPages (args , 0 , out );
407+ info (numPagesPosted + " web pages indexed." );
413408 }
414409
415410 private void doStdinMode () {
@@ -533,7 +528,7 @@ int postFiles(File[] files, OutputStream out, String type) {
533528 postFile (srcFile , out , type );
534529 Thread .sleep (delay * 1000L );
535530 filesPosted ++;
536- } catch (InterruptedException | URISyntaxException e ) {
531+ } catch (InterruptedException | MalformedURLException | URISyntaxException e ) {
537532 throw new RuntimeException (e );
538533 }
539534 }
@@ -699,13 +694,14 @@ protected int webCrawl(int level, OutputStream out) {
699694 * @param link the absolute or relative link
700695 * @return the string version of the full URL
701696 */
702- protected String computeFullUrl (URL baseUrl , String link ) {
697+ protected static String computeFullUrl (URL baseUrl , String link )
698+ throws MalformedURLException , URISyntaxException {
703699 if (link == null || link .length () == 0 ) {
704700 return null ;
705701 }
706702 if (!link .startsWith ("http" )) {
707703 if (link .startsWith ("/" )) {
708- link = baseUrl .getProtocol () + "://" + baseUrl . getAuthority () + link ;
704+ link = baseUrl .toURI (). resolve ( link ). toString () ;
709705 } else {
710706 if (link .contains (":" )) {
711707 return null ; // Skip non-relative URLs
@@ -715,10 +711,12 @@ protected String computeFullUrl(URL baseUrl, String link) {
715711 int sep = path .lastIndexOf ('/' );
716712 String file = path .substring (sep + 1 );
717713 if (file .contains ("." ) || file .contains ("?" )) {
718- path = path .substring (0 , sep );
714+ path = path .substring (0 , sep + 1 );
715+ } else {
716+ path = path + "/" ;
719717 }
720718 }
721- link = baseUrl .getProtocol () + "://" + baseUrl . getAuthority () + path + "/" + link ;
719+ link = baseUrl .toURI (). resolve ( path + link ). toString () ;
722720 }
723721 }
724722 link = normalizeUrlEnding (link );
@@ -806,7 +804,8 @@ public static String appendParam(String url, String param) {
806804 }
807805
808806 /** Opens the file and posts its contents to the solrUrl, writes to response to output. */
809- public void postFile (File file , OutputStream output , String type ) throws URISyntaxException {
807+ public void postFile (File file , OutputStream output , String type )
808+ throws MalformedURLException , URISyntaxException {
810809 InputStream is = null ;
811810
812811 URI uri = solrUpdateUrl ;
@@ -884,9 +883,20 @@ public void postFile(File file, OutputStream output, String type) throws URISynt
884883 * @param append the path to append
885884 * @return the final URL version
886885 */
887- protected static URI appendUrlPath (URI uri , String append ) throws URISyntaxException {
888- var newPath = uri .getPath () + append ;
889- return new URI (uri .getScheme (), uri .getAuthority (), newPath , uri .getQuery (), uri .getFragment ());
886+ protected static URI appendUrlPath (URI uri , String append ) {
887+ if (append == null || append .isEmpty ()) {
888+ return uri ;
889+ }
890+ if (append .startsWith ("/" )) {
891+ append = append .substring (1 );
892+ }
893+ if (uri .getQuery () != null && !uri .getQuery ().isEmpty ()) {
894+ append += "?" + uri .getQuery ();
895+ }
896+ if (!uri .getPath ().endsWith ("/" )) {
897+ append = uri .getPath () + "/" + append ;
898+ }
899+ return uri .resolve (append );
890900 }
891901
892902 /**
0 commit comments