class Spidr::Page
Represents a requested page from a website.
Constants
- RESERVED_COOKIE_NAMES
Reserved names used within Cookie strings
Attributes
- headers: Headers returned with the body
- response: HTTP Response
- url: URL of the page
Public Class Methods
Creates a new Page object.
@param [URI::HTTP] url
The URL of the page.
@param [Net::HTTPResponse] response
The response from the request for the page.
# File lib/spidr/page.rb, line 27
#
# Builds a page from the URL that was requested and the response received.
#
# @param [URI::HTTP] url
#   The URL of the page.
#
# @param [Net::HTTPResponse] response
#   The response from the request for the page.
def initialize(url, response)
  @url, @response = url, response
  @headers = response.to_hash
  # the parsed document starts out unset
  @doc = nil
end
Public Instance Methods
Searches for the first occurrence of an XPath or CSS Path expression.
@return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
The first matched node. Returns `nil` if no nodes could be matched, or if the page is not an HTML or XML document.
@example
page.at('//title')
@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
# File lib/spidr/page.rb, line 110
#
# Forwards the given expression(s) to `doc.at` when a document is available.
#
# @return [Object, nil]
#   Whatever `doc.at` returns, or `nil` when `doc` is falsy.
def at(*arguments)
  doc.at(*arguments) if doc
end
Determines if the page is an Atom feed.
@return [Boolean]
Specifies whether the page is an Atom feed.
# File lib/spidr/page/content_types.rb, line 193
#
# @return [Boolean]
#   Whether the page's Content-Type is `application/atom+xml`.
def atom?
  is_content_type?('application/atom+xml')
end
Determines if the response code is `400`.
@return [Boolean]
Specifies whether the response code is `400`.
# File lib/spidr/page/status_codes.rb, line 33
#
# @return [Boolean]
#   Whether the response code is `400`.
def bad_request?
  code == 400
end
The body of the response.
@return [String]
The body of the response.
# File lib/spidr/page.rb, line 40
#
# @return [String]
#   The response body, or an empty String when the response has no body.
def body
  response.body || ''
end
The response code from the page.
@return [Integer]
Response code from the page.
# File lib/spidr/page/status_codes.rb, line 11
#
# @return [Integer]
#   The response's status code, converted with `to_i`.
def code
  status = @response.code
  status.to_i
end
The charset included in the Content-Type.
@return [String, nil]
The charset of the content.
@since 0.4.0
# File lib/spidr/page/content_types.rb, line 35
#
# Extracts the `charset` parameter from the Content-Type header values.
#
# The parameter name is matched case-insensitively and a value wrapped in
# double quotes (`charset="utf-8"`) is returned without the quotes.
#
# @return [String, nil]
#   The first charset found, or `nil` if no Content-Type value carries one.
#
# @since 0.4.0
def content_charset
  content_types.each do |value|
    # only values with parameters (separated by ';') can carry a charset
    next unless value.include?(';')

    value.split(';').each do |param|
      name, charset = param.strip.split('=', 2)
      next unless charset && name.strip.downcase == 'charset'

      charset = charset.strip
      if charset.length >= 2 && charset.start_with?('"') && charset.end_with?('"')
        charset = charset[1..-2]
      end

      return charset
    end
  end

  nil
end
The Content-Type of the page.
@return [String]
The Content-Type of the page.
# File lib/spidr/page/content_types.rb, line 11
#
# @return [String]
#   The `Content-Type` header of the response, or an empty String when the
#   header is absent.
def content_type
  header = @response['Content-Type']
  header || ''
end
The content types of the page.
@return [Array<String>]
The values within the Content-Type header.
@since 0.2.2
# File lib/spidr/page/content_types.rb, line 23
#
# @return [Array<String>]
#   The values of the `content-type` header, or an empty Array when the
#   response has none.
#
# @since 0.2.2
def content_types
  fields = @response.get_fields('content-type')
  fields || []
end
Determines if the page is a CSS stylesheet.
@return [Boolean]
Specifies whether the page is a CSS stylesheet.
# File lib/spidr/page/content_types.rb, line 172
#
# @return [Boolean]
#   Whether the page's Content-Type is `text/css`.
def css?
  is_content_type?('text/css')
end
Determines if the page is a Directory Listing.
@return [Boolean]
Specifies whether the page is a Directory Listing.
@since 0.3.0
# File lib/spidr/page/content_types.rb, line 108
#
# @return [Boolean]
#   Whether the page's Content-Type is `text/directory`.
#
# @since 0.3.0
def directory?
  is_content_type?('text/directory')
end
Returns a parsed document object for HTML, XML, RSS and Atom pages.
@return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
The document that represents HTML or XML pages. Returns `nil` if the page is not HTML, XML, RSS or Atom, or if the page could not be parsed properly.
@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
@see nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
# File lib/spidr/page.rb, line 57
#
# Parses the body with the Nokogiri document class matching the page's
# content type and memoizes the result in `@doc`.
#
# @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
#   The parsed document; `nil` when the body is empty, when the content type
#   is not HTML/RSS/Atom/XML/XSL, or when parsing raised an error.
def doc
  return if body.empty?

  parser =
    if html?
      Nokogiri::HTML::Document
    elsif rss? || atom? || xml? || xsl?
      Nokogiri::XML::Document
    end
  return unless parser

  begin
    @doc ||= parser.parse(body, @url.to_s, content_charset)
  rescue
    # parsing is best-effort: any StandardError results in nil
  end
end
Enumerates over every link in the page.
@yield [link]
The given block will be passed every non-empty link in the page.
@yieldparam [String] link
A link in the page.
@return [Enumerator]
If no block is given, an enumerator object will be returned.
@since 0.3.0
# File lib/spidr/page/html.rb, line 183
#
# Yields redirect targets (when the page is a redirect) followed by the
# `href`/`src` values of `a`, `frame`, `iframe`, `link` and `script`
# elements, in that order. Elements are only searched when the page is HTML
# and has a parsed document.
#
# @yieldparam [String] link
#   A link in the page.
#
# @return [Enumerator]
#   If no block is given, an enumerator object will be returned.
#
# @since 0.3.0
def each_link(&block)
  return enum_for(__method__) unless block_given?

  each_redirect(&block) if is_redirect?
  return unless html? && doc

  doc.search('//a[@href[string()]]').each { |anchor| yield anchor.get_attribute('href') }
  doc.search('//frame[@src[string()]]').each { |frame| yield frame.get_attribute('src') }
  doc.search('//iframe[@src[string()]]').each { |iframe| yield iframe.get_attribute('src') }
  doc.search('//link[@href[string()]]').each { |link| yield link.get_attribute('href') }
  doc.search('//script[@src[string()]]').each { |script| yield script.get_attribute('src') }
end
Enumerates over every `mailto:` link in the page.
@yield [link]
The given block will be passed every `mailto:` link from the page.
@yieldparam [String] link
A `mailto:` link from the page.
@return [Enumerator]
If no block is given, an enumerator object will be returned.
@since 0.5.0
# File lib/spidr/page/html.rb, line 147
#
# Yields the address part (everything after the 7-character `mailto:`
# prefix) of every `a` element whose `href` starts with `mailto:`.
#
# @yieldparam [String] link
#   A `mailto:` link from the page, without the `mailto:` prefix.
#
# @return [Enumerator]
#   If no block is given, an enumerator object will be returned.
#
# @since 0.5.0
def each_mailto
  return enum_for(__method__) unless block_given?
  return unless html? && doc

  doc.search('//a[starts-with(@href,"mailto:")]').each do |anchor|
    href = anchor.get_attribute('href')
    yield href[7..-1]
  end
end
Enumerates over the meta-redirect links in the page.
@yield [link]
If a block is given, it will be passed every meta-redirect link from the page.
@yieldparam [String] link
A meta-redirect link from the page.
@return [Enumerator]
If no block is given, an enumerator object will be returned.
@since 0.3.0
# File lib/spidr/page/html.rb, line 38
#
# Yields the target of every `<meta http-equiv="refresh" content="...">`
# element whose `content` ends in `url=<target>`.
#
# The `url=` keyword is matched case-insensitively, so the common
# `content="0; URL=http://..."` spelling is recognised as well.
#
# @yieldparam [String] link
#   A meta-redirect link from the page.
#
# @return [Enumerator]
#   If no block is given, an enumerator object will be returned.
#
# @since 0.3.0
def each_meta_redirect
  return enum_for(__method__) unless block_given?
  return unless html? && doc

  search('//meta[@http-equiv and @content]').each do |node|
    next unless node.get_attribute('http-equiv') =~ /refresh/i

    content = node.get_attribute('content')

    if (redirect = content.match(/url=(\S+)$/i))
      yield redirect[1]
    end
  end
end
Enumerates over every HTTP or meta-redirect link in the page.
@yield [link]
The given block will be passed every redirection link from the page.
@yieldparam [String] link
An HTTP or meta-redirect link from the page.
@return [Enumerator]
If no block is given, an enumerator object will be returned.
@since 0.3.0
# File lib/spidr/page/html.rb, line 108
#
# Yields every `Location` header value; when the response has none, falls
# back to the page-level meta redirects.
#
# @yieldparam [String] link
#   An HTTP or meta-redirect link from the page.
#
# @return [Enumerator]
#   If no block is given, an enumerator object will be returned.
#
# @since 0.3.0
def each_redirect(&block)
  return enum_for(__method__) unless block

  locations = @response.get_fields('Location')

  if locations.nil? || locations.empty?
    # check page-level meta redirects if there isn't a location header
    each_meta_redirect(&block)
  else
    # Location headers override any meta-refresh redirects in the HTML
    locations.each(&block)
  end
end
Enumerates over every absolute URL in the page.
@yield [url]
The given block will be passed every URL in the page.
@yieldparam [URI::HTTP] url
An absolute URL in the page.
@return [Enumerator]
If no block is given, an enumerator object will be returned.
@since 0.3.0
# File lib/spidr/page/html.rb, line 236
#
# Converts every link from `each_link` with `to_absolute` and yields the
# results, skipping links for which `to_absolute` returns a falsy value.
#
# @yieldparam [URI::HTTP] url
#   An absolute URL in the page.
#
# @return [Enumerator]
#   If no block is given, an enumerator object will be returned.
#
# @since 0.3.0
def each_url
  return enum_for(__method__) unless block_given?

  each_link do |link|
    absolute = to_absolute(link)
    yield absolute if absolute
  end
end
Determines if the page is a GIF image.
@return [Boolean]
Specifies whether the page is a GIF image.
@since 0.7.0
# File lib/spidr/page/content_types.rb, line 247
#
# @return [Boolean]
#   Whether the page's Content-Type is `image/gif`.
#
# @since 0.7.0
def gif?
  is_content_type?('image/gif')
end
Determines if the response code is `500`.
@return [Boolean]
Specifies whether the response code is `500`.
# File lib/spidr/page/status_codes.rb, line 91
#
# @return [Boolean]
#   Whether the response code is `500`.
def had_internal_server_error?
  code == 500
end
Determines if the page is an HTML document.
@return [Boolean]
Specifies whether the page is an HTML document.
# File lib/spidr/page/content_types.rb, line 118
#
# @return [Boolean]
#   Whether the page's Content-Type is `text/html`.
def html?
  is_content_type?('text/html')
end
Determines if the page is an ICO image.
@return [Boolean]
Specifies whether the page is an ICO image.
@since 0.7.0
# File lib/spidr/page/content_types.rb, line 271
#
# @return [Boolean]
#   Whether the page's Content-Type is `image/x-icon` or
#   `image/vnd.microsoft.icon`.
#
# @since 0.7.0
def ico?
  is_content_type?('image/x-icon') ||
    is_content_type?('image/vnd.microsoft.icon')
end
Determines if any of the content-types of the page include a given type.
@param [String] type
The content-type to test for.
@return [Boolean]
Specifies whether the page includes the given content-type.
@example Match the Content-Type
page.is_content_type?('application/json')
@example Match the sub-type of the Content-Type
page.is_content_type?('json')
@since 0.4.0
# File lib/spidr/page/content_types.rb, line 69
#
# Determines if any of the Content-Type values of the page match a type.
#
# Header parameters (everything after the first `;`) are ignored, and the
# comparison ignores letter case and surrounding whitespace, so
# `Text/HTML ; charset=utf-8` matches `text/html`.
#
# @param [String] type
#   A full media type (`application/json`) or just a sub-type (`json`).
#
# @return [Boolean]
#   Whether the page includes the given content-type.
#
# @example Match the Content-Type
#   page.is_content_type?('application/json')
#
# @example Match the sub-type of the Content-Type
#   page.is_content_type?('json')
#
# @since 0.4.0
def is_content_type?(type)
  wanted    = type.downcase
  full_type = wanted.include?('/')

  content_types.any? do |value|
    # drop any parameters; `to_s` guards against an empty header value
    media_type = value.split(';', 2).first.to_s.strip.downcase

    if full_type
      # a type containing '/' must match the whole media type
      media_type == wanted
    else
      # otherwise only match the sub-type
      media_type.split('/', 2).last == wanted
    end
  end
end
Determines if the response code is `403`.
@return [Boolean]
Specifies whether the response code is `403`.
# File lib/spidr/page/status_codes.rb, line 55
#
# @return [Boolean]
#   Whether the response code is `403`.
def is_forbidden?
  code == 403
end
Determines if the response code is `404`.
@return [Boolean]
Specifies whether the response code is `404`.
# File lib/spidr/page/status_codes.rb, line 67
#
# @return [Boolean]
#   Whether the response code is `404`.
def is_missing?
  code == 404
end
Determines if the response code is `200`.
@return [Boolean]
Specifies whether the response code is `200`.
# File lib/spidr/page/status_codes.rb, line 21
#
# @return [Boolean]
#   Whether the response code is `200`.
def is_ok?
  code == 200
end
Determines if the response code is `300`, `301`, `302`, `303` or `307`. Also checks for “soft” redirects added at the page level by a meta refresh tag.
@return [Boolean]
Specifies whether the response code is a HTTP Redirect code.
# File lib/spidr/page/status_codes.rb, line 103
#
# Treats `300`-`303` and `307` as redirects. A `200` response counts as a
# redirect only when `meta_redirect?` says the page has a meta redirect.
#
# @return [Boolean]
#   Whether the page redirects elsewhere.
def is_redirect?
  status = code

  return true if (300..303).cover?(status) || status == 307
  return meta_redirect? if status == 200

  false
end
Determines if the response code is `408`.
@return [Boolean]
Specifies whether the response code is `408`.
# File lib/spidr/page/status_codes.rb, line 79
#
# @return [Boolean]
#   Whether the response code is `408`.
def is_timedout?
  code == 408
end
Determines if the page is JavaScript.
@return [Boolean]
Specifies whether the page is JavaScript.
# File lib/spidr/page/content_types.rb, line 149
#
# @return [Boolean]
#   Whether the page's Content-Type is `text/javascript` or
#   `application/javascript`.
def javascript?
  is_content_type?('text/javascript') ||
    is_content_type?('application/javascript')
end
Determines if the page is a JPEG image.
@return [Boolean]
Specifies whether the page is a JPEG image.
@since 0.7.0
# File lib/spidr/page/content_types.rb, line 259
#
# @return [Boolean]
#   Whether the page's Content-Type is `image/jpeg`.
#
# @since 0.7.0
def jpeg?
  is_content_type?('image/jpeg')
end
Determines if the page is JSON.
@return [Boolean]
Specifies whether the page is JSON.
@since 0.3.0
# File lib/spidr/page/content_types.rb, line 162
#
# @return [Boolean]
#   Whether the page's Content-Type is `application/json`.
#
# @since 0.3.0
def json?
  is_content_type?('application/json')
end
The links from within the page.
@return [Array<String>]
All links within the HTML page, `frame`/`iframe` source URLs and any links in the `Location` header.
# File lib/spidr/page/html.rb, line 218
#
# @return [Array<String>]
#   Everything `each_link` enumerates, collected into an Array.
def links
  each_link.to_a
end
`mailto:` links in the page.
@return [Array<String>]
The `mailto:` links found within the page.
@since 0.5.0
# File lib/spidr/page/html.rb, line 165
#
# @return [Array<String>]
#   Everything `each_mailto` enumerates, collected into an Array.
#
# @since 0.5.0
def mailtos
  each_mailto.to_a
end
The meta-redirect links of the page.
@return [Array<String>]
All meta-redirect links in the page.
@deprecated
Deprecated in 0.3.0 and will be removed in 0.4.0. Use {#meta_redirects} instead.
# File lib/spidr/page/html.rb, line 87
#
# Deprecated alias for {#meta_redirects}; prints deprecation warnings and
# then delegates.
#
# The warning names 0.4.0 as the removal version, matching the deprecation
# notice (deprecated in 0.3.0, to be removed in 0.4.0).
#
# @return [Array<String>]
#   All meta-redirect links in the page.
#
# @deprecated
#   Deprecated in 0.3.0 and will be removed in 0.4.0.
#   Use {#meta_redirects} instead.
def meta_redirect
  warn 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.4.0'
  warn 'DEPRECATION: Use Spidr::Page#meta_redirects instead'

  meta_redirects
end
Returns a boolean indicating whether or not page-level meta redirects are present in this page.
@return [Boolean]
Specifies whether the page includes page-level redirects.
# File lib/spidr/page/html.rb, line 61
#
# @return [Boolean]
#   Whether `each_meta_redirect` yields at least one non-nil link.
def meta_redirect?
  first_redirect = each_meta_redirect.first
  !first_redirect.nil?
end
The meta-redirect links of the page.
@return [Array<String>]
All meta-redirect links in the page.
@since 0.3.0
# File lib/spidr/page/html.rb, line 73
#
# @return [Array<String>]
#   Everything `each_meta_redirect` enumerates, collected into an Array.
#
# @since 0.3.0
def meta_redirects
  each_meta_redirect.to_a
end
Determines if the page is a MS Word document.
@return [Boolean]
Specifies whether the page is a MS Word document.
# File lib/spidr/page/content_types.rb, line 203
#
# @return [Boolean]
#   Whether the page's Content-Type is `application/msword`.
def ms_word?
  is_content_type?('application/msword')
end
Determines if the page is a PDF document.
@return [Boolean]
Specifies whether the page is a PDF document.
# File lib/spidr/page/content_types.rb, line 213
#
# @return [Boolean]
#   Whether the page's Content-Type is `application/pdf`.
def pdf?
  is_content_type?('application/pdf')
end
Determines if the page is plain-text.
@return [Boolean]
Specifies whether the page is plain-text.
# File lib/spidr/page/content_types.rb, line 94
#
# @return [Boolean]
#   Whether the page's Content-Type is `text/plain`.
def plain_text?
  is_content_type?('text/plain')
end
Determines if the page is a PNG image.
@return [Boolean]
Specifies whether the page is a PNG image.
@since 0.7.0
# File lib/spidr/page/content_types.rb, line 235
#
# @return [Boolean]
#   Whether the page's Content-Type is `image/png`.
#
# @since 0.7.0
def png?
  is_content_type?('image/png')
end
URLs that this document redirects to.
@return [Array<String>]
The links that this page redirects to (usually found in a location header or by way of a page-level meta redirect).
# File lib/spidr/page/html.rb, line 129
#
# @return [Array<String>]
#   Everything `each_redirect` enumerates, collected into an Array.
def redirects_to
  each_redirect.to_a
end
Determines if the page is an RSS feed.
@return [Boolean]
Specifies whether the page is an RSS feed.
# File lib/spidr/page/content_types.rb, line 182
#
# @return [Boolean]
#   Whether the page's Content-Type is `application/rss+xml` or
#   `application/rdf+xml`.
def rss?
  is_content_type?('application/rss+xml') ||
    is_content_type?('application/rdf+xml')
end
Searches the document for XPath or CSS Path paths.
@param [Array<String>] paths
CSS or XPath expressions to search the document with.
@return [Array]
The matched nodes from the document. Returns an empty Array if no nodes were matched, or if the page is not an HTML or XML document.
@example
page.search('//a[@href]')
@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
# File lib/spidr/page.rb, line 90
#
# Forwards the given paths to `doc.search` when a document is available.
#
# @param [Array<String>] paths
#   CSS or XPath expressions to search the document with.
#
# @return [Array]
#   Whatever `doc.search` returns, or an empty Array when `doc` is falsy.
def search(*paths)
  doc ? doc.search(*paths) : []
end
The title of the HTML page.
@return [String, nil]
The inner-text of the title element of the page, or `nil` if no title element could be found.
# File lib/spidr/page/html.rb, line 17
#
# @return [String, nil]
#   The inner text of the first node matching `//title`, or `nil` when
#   `at` finds no such node.
def title
  node = at('//title')
  node.inner_text if node
end
Normalizes and expands a given link into a proper URI.
@param [String] link
The link to normalize and expand.
@return [URI::HTTP, nil]
The normalized URI, or `nil` if the link could not be merged into a valid URI.
# File lib/spidr/page/html.rb, line 267
#
# Merges a link with the page's URL and normalizes the resulting path.
#
# @param [String] link
#   The link to normalize and expand (converted with `to_s`).
#
# @return [URI::Generic, nil]
#   The merged URI, or `nil` when merging raised a `URI::Error`.
def to_absolute(link)
  begin
    absolute = url.merge(link.to_s)
  rescue URI::Error
    return
  end

  # opaque URIs (e.g. mailto:) have no hierarchical path to normalize
  path = absolute.path unless absolute.opaque

  if path
    # ensure that paths begin with a leading '/' for URI::FTP
    path.insert(0, '/') if absolute.scheme == 'ftp' && !path.start_with?('/')

    # make sure the path does not contain any .. or . directories,
    # since URI::Generic#merge cannot normalize paths such as
    # "/stuff/../"
    # NOTE(review): URI.expand_path is not part of Ruby's standard URI
    # module; presumably it is defined elsewhere in the project.
    absolute.path = URI.expand_path(path)
  end

  absolute
end
Absolute URIs from within the page.
@return [Array<URI::HTTP>]
The links from within the page, converted to absolute URIs.
# File lib/spidr/page/html.rb, line 254
#
# @return [Array<URI::HTTP>]
#   Everything `each_url` enumerates, collected into an Array.
def urls
  each_url.to_a
end
Determines if the page is an XML document.
@return [Boolean]
Specifies whether the page is an XML document.
# File lib/spidr/page/content_types.rb, line 128
#
# @return [Boolean]
#   Whether the page's Content-Type is `text/xml` or `application/xml`.
def xml?
  is_content_type?('text/xml') ||
    is_content_type?('application/xml')
end
Determines if the page is an XML Stylesheet (XSL).
@return [Boolean]
Specifies whether the page is an XML Stylesheet (XSL).
# File lib/spidr/page/content_types.rb, line 139
#
# @return [Boolean]
#   Whether the page's Content-Type is `text/xsl`.
def xsl?
  is_content_type?('text/xsl')
end
Determines if the page is a ZIP archive.
@return [Boolean]
Specifies whether the page is a ZIP archive.
# File lib/spidr/page/content_types.rb, line 223
#
# @return [Boolean]
#   Whether the page's Content-Type is `application/zip`.
def zip?
  is_content_type?('application/zip')
end
Protected Instance Methods
Provides transparent access to the values in {#headers}.
@param [Symbol] name
The name of the missing method.
@param [Array] arguments
Additional arguments for the missing method.
@return [String]
The missing method mapped to a header in {#headers}.
@raise [NoMethodError]
The missing method did not map to a header in {#headers}.
# File lib/spidr/page.rb, line 136
#
# Provides transparent access to response headers: a call with no arguments
# and no block whose name (with `_` turned into `-`) is a header present in
# the response returns that header's value.
#
# @param [Symbol] name
#   The name of the missing method.
#
# @param [Array] arguments
#   Additional arguments for the missing method.
#
# @return [String]
#   The value of the header the missing method mapped to.
#
# @raise [NoMethodError]
#   The missing method did not map to a header of the response.
def method_missing(name, *arguments, &block)
  if arguments.empty? && block.nil?
    header_name = name.to_s.tr('_', '-')

    return @response[header_name] if @response.key?(header_name)
  end

  super(name, *arguments, &block)
end

# Keeps `respond_to?` consistent with {#method_missing}: a name that maps to
# a header present in the response is reported as callable.
#
# @param [Symbol] name
#   The method name being queried.
#
# @param [Boolean] include_private
#   Passed through to the default implementation.
#
# @return [Boolean]
#   Whether the name maps to a response header (or is otherwise handled).
def respond_to_missing?(name, include_private = false)
  @response.key?(name.to_s.tr('_', '-')) || super
end