class Recluse::Profile
A profile is an atomic unit of rules for link checking.
Attributes
blacklist: Array of URL patterns to check. Optional. Defaults to empty array.
email: Used in the user-agent to identify who is running the crawler, so that if there's a problem with your spidering, you will be contacted rather than the author of Recluse. Required.
internal_only: Don't check external URLs. Optional. Defaults to false.
name: Identifier of the profile. Make sure that it is filename friendly. Required.
redirect: When enabled, will follow redirects and report only the status code for the page that is landed upon. When disabled, will report the redirect status code. Defaults to false.
results: Hash of resulting +HashTree+s.
roots: Array of URLs to start spidering. Required.
scheme_squash: HTTP and HTTPS schemed URLs are treated as equal. Optional. Defaults to false.
tasks: Hash of the tests that have been run.
whitelist: Array of exceptions to the blacklist. Optional. Defaults to empty array.
Public Class Methods
Loads profile by name.
# File lib/recluse/profile.rb, line 151
# Loads a previously saved profile by name from the '.recluse' user config.
#
# Reads "<profile>.yaml" and copies over each optional setting (blacklist,
# whitelist, internal_only, scheme_squash, redirect) that is present and
# non-nil. A missing or nil 'roots' entry becomes an empty array and a
# missing or nil 'email' entry becomes an empty string before being handed
# to Profile.new.
#
# Raises ProfileError when "<profile>.yaml" does not exist.
def self.load(profile)
  config = UserConfig.new '.recluse'
  yaml_name = "#{profile}.yaml"
  raise ProfileError, "Profile '#{profile}' doesn't exist" unless config.exist?(yaml_name)

  stored = config[yaml_name]
  # A setting counts only when the key exists and its value is not nil.
  present = ->(key) { stored.key?(key) && !stored[key].nil? }

  optional = %i[blacklist whitelist internal_only scheme_squash redirect].each_with_object({}) do |setting, picked|
    key = setting.to_s
    picked[setting] = stored[key] if present.call(key)
  end

  Profile.new(
    profile,
    present.call('roots') ? stored['roots'] : [],
    present.call('email') ? stored['email'] : '',
    **optional
  )
end
Create a profile.
# File lib/recluse/profile.rb, line 63
# Builds a profile from its name, starting URLs and contact email.
#
# Every root that is not already a Link is wrapped as Link.new(root, :root).
# The keyword arguments are stored as given; the task and result tables
# start out empty.
#
# Raises ProfileError when +roots+ is empty.
def initialize(
  name,
  roots,
  email,
  blacklist: [],
  whitelist: [],
  internal_only: false,
  scheme_squash: false,
  redirect: false
)
  raise ProfileError, 'Profile needs roots for starting point' if roots.empty?

  @name = name
  @email = email
  @roots = roots.map { |root| root.instance_of?(Link) ? root : Link.new(root, :root) }
  @blacklist = blacklist
  @whitelist = whitelist
  @internal_only = internal_only
  @scheme_squash = scheme_squash
  @redirect = redirect
  @tasks = {}
  @results = {}
end
Public Instance Methods
Test if profiles share the same configuration options.
# File lib/recluse/profile.rb, line 140
# Tests whether two profiles share the same configuration.
#
# Returns false when +other+ is not exactly the same class. Otherwise every
# instance variable of the receiver must match the one on +other+, except:
# * @results is ignored;
# * @roots also matches when both sides produce the same strings via +to_s+.
#
# NOTE(review): @tasks is still compared with plain ==; confirm that is
# intended for profiles that have already run tests.
def ==(other)
  return false if other.class != self.class

  instance_variables.all? do |ivar|
    # instance_variables yields Symbols, so compare against Symbols here;
    # comparing with the String '@roots' can never match.
    next true if ivar == :@results

    mine = instance_variable_get(ivar)
    theirs = other.instance_variable_get(ivar)

    if ivar == :@roots && mine.respond_to?(:map) && theirs.respond_to?(:map)
      # Roots are equal when their string forms are equal.
      next true if mine.map(&:to_s) == theirs.map(&:to_s)
    end

    mine == theirs
  end
end
Create a Mechanize agent.
# File lib/recluse/profile.rb, line 94
# Builds a Mechanize agent configured for this profile.
#
# Redirect handling follows @redirect, and the user-agent string carries the
# Recluse version, the Recluse URL and this profile's email.
#
# NOTE(review): the agent is pinned to 'TLSv1' and certificate verification
# is switched off (VERIFY_NONE). Confirm both are still wanted.
def create_agent
  Mechanize.new do |agent|
    agent.ssl_version = 'TLSv1'
    agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
    agent.max_history = nil
    agent.follow_meta_refresh = true
    agent.keep_alive = false
    agent.redirect_ok = @redirect
    identity = "Mozilla/5.0 (compatible; recluse/#{Recluse::VERSION}; +#{Recluse::URL}) #{@email}"
    agent.user_agent = identity
  end
end
Saves profile to ~/.recluse/NAME.yaml.
# File lib/recluse/profile.rb, line 123
# Writes this profile's settings to "<name>.yaml" in the '.recluse' user
# config and saves that file. Roots are stored as strings via +to_s+.
def save
  store = UserConfig.new '.recluse'
  profile_file = store["#{@name}.yaml"]

  # Settings are assigned in the same order as they are listed here.
  {
    'name' => @name,
    'roots' => @roots.map(&:to_s),
    'email' => @email,
    'blacklist' => @blacklist,
    'whitelist' => @whitelist,
    'internal_only' => @internal_only,
    'scheme_squash' => @scheme_squash,
    'redirect' => @redirect
  }.each { |setting, value| profile_file[setting] = value }

  profile_file.save
end
Runs test.
# File lib/recluse/profile.rb, line 108
# Runs the test registered under +key+ and returns its result tree.
#
# A Recluse::HashTree is created for +key+ unless @results already holds one.
# The task class is looked up with Recluse::Tasks.get(key), built with this
# profile and +options+ merged with results: <the tree>, stored in @tasks,
# and run.
def test(key, options = {})
  tree_ready = @results.key?(key) && @results[key].instance_of?(Recluse::HashTree)
  unless tree_ready
    @results[key] = Recluse::HashTree.new do |url1, url2|
      # Order the pair so the comparison only has to handle one direction.
      longer, shorter = url2.length > url1.length ? [url2, url1] : [url1, url2]
      # Two URLs also match when they differ only by one trailing slash.
      slash_variant = longer.length == (shorter.length + 1) &&
                      longer[-1] == '/' &&
                      shorter[-1] != '/' &&
                      longer[0...-1] == shorter
      longer == shorter || slash_variant
    end
  end

  task = Recluse::Tasks.get(key).new(self, options.merge(results: @results[key]))
  @tasks[key] = task
  task.run
  @results[key]
end