@@ -26,6 +26,8 @@
 import org.dom4j.Node;
 import org.dom4j.io.SAXReader;
 
+import fr.eolya.utils.ConfigHelper;
+
 import crawlercommons.sitemaps.AbstractSiteMap;
 import crawlercommons.sitemaps.SiteMap;
 import crawlercommons.sitemaps.SiteMapIndex;
@@ -238,7 +240,7 @@ public int processItem(Map<String,Object> itemData, long threadId) {
 /*
  * Ignore this page if robots.txt asks for it
  */
-if (robots!=null && !"1".equals(config.getProperty("/crawler/param[@name='bypass_robots_file']", "0")) && !robots.isUrlAllowed(pageURL)) return 0;
+if (robots!=null && !ConfigHelper.isBypassRobotsFile(config, pageURL) && !robots.isUrlAllowed(pageURL)) return 0;
 
 int maxCrawlDepth = src.getDepth();
 if (maxCrawlDepth==0) maxCrawlDepth = Integer.parseInt(config.getProperty("/crawler/param[@name='max_depth']", "2"));
@@ -1398,7 +1400,7 @@ private boolean isAccepetedUrl (String strLink, String normalizedStartUrl, List<
 */
 
 // Filter the URL against the robots.txt file rules
-if (robots!=null && !isStartingUrl(strLink) && !"1".equals(config.getProperty("/crawler/param[@name='bypass_robots_file']", "0")) && !robots.isUrlAllowed(urlLink))
+if (robots!=null && !isStartingUrl(strLink) && !ConfigHelper.isBypassRobotsFile(config, urlLink) && !robots.isUrlAllowed(urlLink))
 {
 logger.log("[" + String.valueOf(threadId) + "] " + strLink + " rejected due to robots.txt exclusion rules");
 if (depth<=memlogMaxDepth) src.memLogAppend(" " + strLink + " rejected due to robots.txt exclusion rules");
java/utils/src/main/java/fr/eolya/utils/ConfigHelper.java (new file)
package fr.eolya.utils;

import java.net.URL;
import java.util.Objects;

public final class ConfigHelper {

	private ConfigHelper() {
	}

	/**
	 * Checks whether the crawler should bypass the robots.txt file.<br>
	 * The base setting is the bypass_robots_file param; per-host overrides are
	 * nested in a dedicated element and take precedence when present:
	 * <p>
	 * <pre>
	 * &lt;bypassRobotsFile&gt;
	 *     &lt;param name=&quot;overriden.host.com&quot;&gt;1&lt;/param&gt;
	 * &lt;/bypassRobotsFile&gt;
	 * </pre>
	 *
	 * @param config the crawler XML configuration (required)
	 * @param url optional; if null, no per-host override is applied
	 * @return true if robots.txt should be bypassed for this URL
	 */
	public static boolean isBypassRobotsFile(XMLConfig config, URL url) {
		Objects.requireNonNull(config, "Parameter config is missing");

		if (url != null) {
			// A per-host override, when present, wins over the global setting
			String perHostBypassValue = config.getProperty("/crawler/bypassRobotsFile/param[@name='" + url.getHost() + "']");
			if (perHostBypassValue != null) {
				return "1".equals(perHostBypassValue);
			}
		}

		// Fall back to the global bypass_robots_file param (default: do not bypass)
		String globalBypassValue = config.getProperty("/crawler/param[@name='bypass_robots_file']", "0");
		return "1".equals(globalBypassValue);
	}
}
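
For context, a minimal usage sketch of the new helper (not part of the diff): the config file path and the hostname are hypothetical, and XMLConfig is assumed to load and answer XPath lookups the same way the tests below exercise it.

import java.net.URL;

import fr.eolya.utils.ConfigHelper;
import fr.eolya.utils.XMLConfig;

public class BypassRobotsExample {
	public static void main(String[] args) throws Exception {
		// Hypothetical crawler config path
		XMLConfig config = new XMLConfig();
		config.loadFile("conf/crawler-config.xml");

		// url == null: only the global bypass_robots_file param is consulted
		boolean globalBypass = ConfigHelper.isBypassRobotsFile(config, null);

		// url != null: a matching /crawler/bypassRobotsFile/param[@name='<host>'] wins
		boolean hostBypass = ConfigHelper.isBypassRobotsFile(config, new URL("http://overriden.host.com/page"));

		System.out.println("global=" + globalBypass + ", host=" + hostBypass);
	}
}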
java/utils/src/test/java/fr/eolya/utils/ConfigHelperTest.java (new file)
package fr.eolya.utils;

import java.io.IOException;
import java.net.URL;

import org.junit.Test;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

public class ConfigHelperTest {

	@Test
	public void isBypassRobotsFile_noOverride_false() throws IOException {
		// given
		XMLConfig config = new XMLConfig();
		config.loadFile("src/test/resources/bypass-robots-crawler-config.xml");

		// when
		boolean bypass = ConfigHelper.isBypassRobotsFile(config, new URL("http://any-url.com"));

		// then
		assertFalse(bypass);
	}

	@Test
	public void isBypassRobotsFile_override_true() throws IOException {
		// given
		XMLConfig config = new XMLConfig();
		config.loadFile("src/test/resources/bypass-robots-crawler-config.xml");

		// when
		boolean bypass = ConfigHelper.isBypassRobotsFile(config, new URL("http://bypass-robots-for-this.com/context"));

		// then
		assertTrue(bypass);
	}
}
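
One more case worth covering, sketched here as an assumption rather than part of the PR: the url == null path, which only consults the global setting and so should return false with this resource file (true under both the original and the corrected helper).

	@Test
	public void isBypassRobotsFile_nullUrl_usesGlobalSetting() throws IOException {
		// given: the resource file sets bypass_robots_file to 0
		XMLConfig config = new XMLConfig();
		config.loadFile("src/test/resources/bypass-robots-crawler-config.xml");

		// when: no URL, so no per-host override can apply
		boolean bypass = ConfigHelper.isBypassRobotsFile(config, null);

		// then
		assertFalse(bypass);
	}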
java/utils/src/test/resources/bypass-robots-crawler-config.xml (new file)
<?xml version="1.0" encoding="utf-8"?>
<crawler>

<param name="bypass_robots_file">0</param>

<bypassRobotsFile>
<param name="bypass-robots-for-this.com">1</param>
</bypassRobotsFile>

</crawler>