Commit 06fc5c8

Author: Brent Cook
Commit message: add license, fix style violations, log with dlog
1 parent: 3d489a5

1 file changed, +58 -33 lines

lib/robots.rb (58 additions & 33 deletions)
@@ -1,19 +1,44 @@
+#
+# Copyright (c) 2008 Kyle Maxwell, contributors
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
 require "open-uri"
 require "uri"
-require "rubygems"
 require "timeout"
+require 'rex/logging/log_dispatcher'
 
+# https://github.com/fizx/robots
 class Robots
-
   DEFAULT_TIMEOUT = 3
-
+
+  # Represents a parsed robots.txt file
   class ParsedRobots
-
     def initialize(uri, user_agent)
       @last_accessed = Time.at(1)
-
+
       io = Robots.get_robots_txt(uri, user_agent)
-
+
       if !io || io.content_type != "text/plain" || io.status.first != "200"
         io = StringIO.new("User-agent: *\nAllow: /\n")
       end
@@ -45,27 +70,25 @@ def initialize(uri, user_agent)
           @other[key] << value
         end
       end
-
+
       @parsed = true
     end
-
+
     def allowed?(uri, user_agent)
       return true unless @parsed
       allowed = true
       path = uri.request_uri
-
+
       @disallows.each do |key, value|
         if user_agent =~ key
           value.each do |rule|
-            if path =~ rule
-              allowed = false
-            end
+            allowed = false if path =~ rule
           end
         end
       end
-
+
       @allows.each do |key, value|
-        unless allowed
+        unless allowed
          if user_agent =~ key
            value.each do |rule|
              if path =~ rule
@@ -75,59 +98,61 @@ def allowed?(uri, user_agent)
          end
        end
      end
-
+
      if allowed && @delays[user_agent]
        sleep @delays[user_agent] - (Time.now - @last_accessed)
        @last_accessed = Time.now
      end
-
+
      return allowed
    end
-
+
    def other_values
      @other
    end
-
-    protected
-
+
+    protected
+
    def to_regex(pattern)
      return /should-not-match-anything-123456789/ if pattern.strip.empty?
      pattern = Regexp.escape(pattern)
      pattern.gsub!(Regexp.escape("*"), ".*")
      Regexp.compile("^#{pattern}")
    end
  end
-
+
  def self.get_robots_txt(uri, user_agent)
    begin
-      Timeout::timeout(Robots.timeout) do
-        io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
-      end
+      Timeout.timeout(Robots.timeout) do
+        begin
+          URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent)
+        rescue StandardError
+          nil
+        end
+      end
    rescue Timeout::Error
-      STDERR.puts "robots.txt request timed out"
+      dlog("robots.txt request timed out")
    end
  end
-
-  def self.timeout=(t)
-    @timeout = t
-  end
-
+
+  attr_writer :timeout
+
  def self.timeout
    @timeout || DEFAULT_TIMEOUT
  end
-
+
  def initialize(user_agent)
    @user_agent = user_agent
    @parsed = {}
  end
-
+
  def allowed?(uri)
    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
    host = uri.host
    @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
    @parsed[host].allowed?(uri, @user_agent)
  end
-
+
  def other_values(uri)
    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
    host = uri.host
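
For reviewers, a minimal usage sketch of the Robots class as it stands after this commit. The user-agent string and URL below are made-up placeholders, and the require path assumes lib/ (and the Rex logging code it pulls in) is on the load path:

    # Hypothetical caller: robots.txt is fetched once per host and the
    # parsed result is cached in @parsed, keyed by hostname.
    require 'robots'

    crawler = Robots.new("ExampleCrawler/1.0")   # one instance per user agent
    if crawler.allowed?("http://www.example.com/secret/page.html")
      # path is not disallowed for this user agent; safe to request it
    end
    crawler.other_values("http://www.example.com/")   # non-standard keys, e.g. Sitemap

Note that a failed or non-200 robots.txt fetch falls back to "User-agent: *\nAllow: /\n", so allowed? answers true rather than raising.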
