1
+ #
2
+ # Copyright (c) 2008 Kyle Maxwell, contributors
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person
5
+ # obtaining a copy of this software and associated documentation
6
+ # files (the "Software"), to deal in the Software without
7
+ # restriction, including without limitation the rights to use,
8
+ # copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the
10
+ # Software is furnished to do so, subject to the following
11
+ # conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
18
+ # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20
+ # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
21
+ # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22
+ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23
+ # OTHER DEALINGS IN THE SOFTWARE.
24
+ #
25
+
1
26
require "open-uri"
2
27
require "uri"
3
- require "rubygems"
4
28
require "timeout"
29
+ require 'rex/logging/log_dispatcher'
5
30
31
+ # https://github.com/fizx/robots
6
32
class Robots
7
-
8
33
DEFAULT_TIMEOUT = 3
9
-
34
+
35
+ # Represents a parsed robots.txt file
10
36
class ParsedRobots
11
-
12
37
def initialize ( uri , user_agent )
13
38
@last_accessed = Time . at ( 1 )
14
-
39
+
15
40
io = Robots . get_robots_txt ( uri , user_agent )
16
-
41
+
17
42
if !io || io . content_type != "text/plain" || io . status . first != "200"
18
43
io = StringIO . new ( "User-agent: *\n Allow: /\n " )
19
44
end
@@ -45,27 +70,25 @@ def initialize(uri, user_agent)
45
70
@other [ key ] << value
46
71
end
47
72
end
48
-
73
+
49
74
@parsed = true
50
75
end
51
-
76
+
52
77
def allowed? ( uri , user_agent )
53
78
return true unless @parsed
54
79
allowed = true
55
80
path = uri . request_uri
56
-
81
+
57
82
@disallows . each do |key , value |
58
83
if user_agent =~ key
59
84
value . each do |rule |
60
- if path =~ rule
61
- allowed = false
62
- end
85
+ allowed = false if path =~ rule
63
86
end
64
87
end
65
88
end
66
-
89
+
67
90
@allows . each do |key , value |
68
- unless allowed
91
+ unless allowed
69
92
if user_agent =~ key
70
93
value . each do |rule |
71
94
if path =~ rule
@@ -75,59 +98,61 @@ def allowed?(uri, user_agent)
75
98
end
76
99
end
77
100
end
78
-
101
+
79
102
if allowed && @delays [ user_agent ]
80
103
sleep @delays [ user_agent ] - ( Time . now - @last_accessed )
81
104
@last_accessed = Time . now
82
105
end
83
-
106
+
84
107
return allowed
85
108
end
86
-
109
+
87
110
# Returns the collection held in @other.
#
# NOTE(review): @other is populated by the parser in #initialize, which
# appends values per key (`@other[key] << value`); presumably these are the
# robots.txt directives that are neither Allow, Disallow nor Crawl-delay --
# confirm against the parsing loop.
#
# The stored object itself is returned (not a copy), so callers mutating it
# mutate the parser's state.
def other_values
  @other
end
90
-
91
- protected
92
-
113
+
114
+ protected
115
+
93
116
# Converts a robots.txt path pattern into a Regexp that matches from the
# beginning of a request path.
#
# Every character of +pattern+ is treated literally except "*", which
# matches any run of characters (including none).
#
# @param pattern [String] the raw path pattern from a rule line
# @return [Regexp] a prefix matcher for the pattern; for a blank pattern,
#   a Regexp that is not expected to match any real path
def to_regex(pattern)
  # A blank pattern carries no path to compare against, so hand back a
  # sentinel expression instead of one that would match every path.
  return /should-not-match-anything-123456789/ if pattern.strip.empty?

  # Escape first so regex metacharacters in the rule are literal, then turn
  # the (now escaped) "*" wildcard back into ".*".
  escaped = Regexp.escape(pattern).gsub(Regexp.escape("*"), ".*")

  # \A anchors to the start of the string (not of each line), and nothing
  # may follow the interpolation: a trailing space here would become part
  # of the expression and stop every rule from matching.
  Regexp.compile("\\A#{escaped}")
end
99
122
end
100
-
123
+
101
124
# Fetches "/robots.txt" from the site that +uri+ points at.
#
# The request is made through open-uri with +user_agent+ sent as the
# User-Agent header, and is bounded by Robots.timeout seconds.
#
# @param uri [#to_s] any URI on the target site; only used as the base for
#   joining "/robots.txt"
# @param user_agent [String] value for the User-Agent request header
# @return [Object, nil] whatever open-uri's #open returns on success; nil
#   when the request raises a StandardError or exceeds the timeout
def self.get_robots_txt(uri, user_agent)
  Timeout.timeout(Robots.timeout) do
    begin
      URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent)
    rescue StandardError
      # Best effort: any fetch failure is reported to the caller as nil
      # rather than propagated.
      nil
    end
  end
rescue Timeout::Error
  # dlog is expected to come from rex/logging -- TODO confirm it is reachable
  # from this class-level context.
  dlog("robots.txt request timed out")
  # Return nil explicitly so the result never depends on dlog's return
  # value; callers test the result for truthiness before using it.
  nil
end
110
-
111
# Sets the robots.txt fetch timeout, in seconds, for the whole class.
#
# This has to be a class-level writer: the value is stored in the class's
# own @timeout, which is what the class-level reader consults. A plain
# `attr_writer :timeout` in the class body would only define an instance
# writer and leave `Robots.timeout = n` undefined.
#
# @param t [Numeric, nil] timeout in seconds; nil restores the default
# @return [Numeric, nil] the value that was assigned
def self.timeout=(t)
  @timeout = t
end
139
+
115
140
# Returns the timeout, in seconds, applied to robots.txt requests.
#
# @return [Numeric] the class-level @timeout when one has been set to a
#   truthy value, otherwise DEFAULT_TIMEOUT
def self.timeout
  @timeout || DEFAULT_TIMEOUT
end
118
-
143
+
119
144
# Creates a checker that identifies itself as +user_agent+.
#
# @param user_agent [String] user agent passed along when robots.txt is
#   fetched and when rules are evaluated
def initialize(user_agent)
  @user_agent = user_agent
  # Per-host cache of parsed robots.txt data; starts empty and is filled
  # on demand.
  @parsed = {}
end
123
-
148
+
124
149
# Reports whether this checker's user agent may fetch +uri+ according to
# the target host's robots.txt.
#
# The parsed robots.txt is created on first use for a host and then reused
# from @parsed; the cache is keyed by host name only.
#
# @param uri [URI, #to_s] the address to check; anything that is not
#   already a URI is parsed from its string form
# @return [Object] whatever ParsedRobots#allowed? returns for this URI and
#   user agent
def allowed?(uri)
  uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
  host = uri.host
  @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
  @parsed[host].allowed?(uri, @user_agent)
end
130
-
155
+
131
156
def other_values ( uri )
132
157
uri = URI . parse ( uri . to_s ) unless uri . is_a? ( URI )
133
158
host = uri . host
0 commit comments