module PostRank::URI

Constants

C14N
URIREGEX
VERSION

Public Instance Methods

c14n(uri, opts = {}) click to toggle source
# File lib/postrank-uri.rb, line 163
# Canonicalizes +uri+: resolves an embedded target URL, strips the query
# parameters listed in C14N, and applies the twitter.com and tumblr.com
# rewrites below.
#
# uri  - anything parse accepts.
# opts - options hash forwarded to parse.
#
# Returns the URI object produced by parse/embedded, modified in place.
def c14n(uri, opts = {})
  target = embedded(parse(uri, opts))

  params = target.query_values(Array)
  if params
    # Drop a parameter when it is listed globally, or listed for a host
    # pattern that matches this URI's host.
    params.delete_if do |key, _value|
      C14N[:global].include?(key) ||
        C14N[:hosts].find { |pattern, names| target.host =~ pattern && names.include?(key) }
    end
  end
  # Assigned even when there were no parameters (params is nil then).
  target.query_values = params

  # Twitter "#!/path" fragments: promote the part after "!" to the path.
  hashbang = nil
  if target.host =~ /^(mobile\.)?twitter\.com$/ && target.fragment
    hashbang = target.fragment.match(/^!(.*)/)
  end
  if hashbang
    target.fragment = nil
    target.path = hashbang[1]
  end

  # Tumblr post URLs: remove the trailing segment after /post/<digits>/.
  if target.host =~ /tumblr\.com$/ && target.path =~ /\/post\/\d+\//
    target.path = target.path.gsub(/[^\/]+$/, '')
  end

  target
end
clean(uri, opts = {}) click to toggle source
# File lib/postrank-uri.rb, line 145
# Runs +uri+ through unescape, c14n and normalize, in that order.
#
# uri  - the URI to clean.
# opts - options hash forwarded to c14n; a truthy :raw returns the URI
#        object itself rather than its string form.
#
# Returns the result of normalize when opts[:raw] is truthy, otherwise
# that result converted with to_s.
def clean(uri, opts = {})
  cleaned = normalize(c14n(unescape(uri), opts))
  return cleaned if opts[:raw]

  cleaned.to_s
end
embedded(uri) click to toggle source
# File lib/postrank-uri.rb, line 185
# Resolves redirect-style URLs that carry their real target in a query
# parameter.
#
# The target is read from the "url" parameter for news.google.com/news/url
# and xfruits.com, and from the "u" parameter for myspace.com paths that
# contain "PostTo".
#
# uri - a parsed URI object responding to host, path and query_values.
#
# Returns the cleaned target (clean called with :raw => true) when one is
# present; otherwise returns +uri+ unchanged.
def embedded(uri)
  param =
    if (uri.host == 'news.google.com' && uri.path == '/news/url') || uri.host == 'xfruits.com'
      'url'
    elsif uri.host =~ /myspace\.com/ && uri.path =~ /PostTo/
      'u'
    end
  return uri unless param

  # query_values is nil when the URI has no query string; fall back to an
  # empty hash so such URIs pass through instead of raising NoMethodError.
  target = (uri.query_values || {})[param]
  target ? clean(target, :raw => true) : uri
end
escape(uri) click to toggle source
# File lib/postrank-uri.rb, line 127
# Percent-encodes +uri+.
#
# Every run captured by the first group of URIREGEX[:escape] is replaced by
# its bytes written as upper-case %XX escapes; afterwards each remaining
# space becomes %20.
#
# uri - the String to escape.
#
# Returns a new escaped String.
def escape(uri)
  encoded = uri.gsub(URIREGEX[:escape]) do
    run = Regexp.last_match(1)
    # The unpack template must be sized in bytes, not characters; otherwise
    # the trailing bytes of multibyte characters are silently dropped.
    '%' + run.unpack('H2' * run.bytesize).join('%').upcase
  end
  encoded.gsub(' ', '%20')
end
extract(text) click to toggle source
# File lib/postrank-uri.rb, line 97
# Collects cleaned URLs found in +text+.
#
# text - any object; nil/false yields an empty result, everything else is
#        converted with to_s and scanned with URIREGEX[:valid_url].
#
# Returns an Array of cleaned URL strings, keeping only matches whose
# domain capture passes PublicSuffix.valid?.
def extract(text)
  return [] unless text

  matches = text.to_s.scan(URIREGEX[:valid_url])
  found = matches.each_with_object([]) do |(_all, _before, url, _protocol, domain, _path, _query), acc|
    # Only keep the URL if the domain is valid
    next unless PublicSuffix.valid?(domain, default_rule: nil)

    acc.push(clean(url).to_s)
  end

  found.compact
end
extract_href(text, host = nil) click to toggle source
# File lib/postrank-uri.rb, line 111
# Collects the links of every <a> element in an HTML document.
#
# text - HTML handed to Nokogiri.HTML.
# host - optional host passed to clean as the :host option.
#
# Returns an Array of [url_string, anchor_text] pairs. Anchors whose cleaned
# URL is not absolute are skipped, as are anchors for which cleaning raises
# (best-effort extraction).
def extract_href(text, host = nil)
  Nokogiri.HTML(text).search('a').each_with_object([]) do |anchor, links|
    begin
      link = clean(anchor.attr('href'), :raw => true, :host => host)
      links.push([link.to_s, anchor.text]) if link.absolute?
    rescue
      # Skip anchors that cannot be processed.
      next
    end
  end
end
hash(uri, opts = {}) click to toggle source
# File lib/postrank-uri.rb, line 150
# Computes the MD5 hex digest of +uri+.
#
# uri  - the String to digest.
# opts - options hash; the URI is passed through clean first only when
#        opts[:clean] == true (other truthy values do not trigger cleaning).
#
# Returns the hex digest String.
def hash(uri, opts = {})
  input = uri
  input = clean(uri) if opts[:clean] == true
  Digest::MD5.hexdigest(input)
end
normalize(uri, opts = {}) click to toggle source
# File lib/postrank-uri.rb, line 154
# Normalizes the path, query and fragment of +uri+.
#
# uri  - anything parse accepts.
# opts - options hash forwarded to parse.
#
# Returns the URI object from parse, modified in place: matches of
# URIREGEX[:double_slash_outside_scheme] in the path are replaced by "/",
# a trailing "/" is removed unless the path is a single character, an
# empty query becomes nil, and the fragment is always cleared.
def normalize(uri, opts = {})
  parse(uri, opts).tap do |parsed|
    parsed.path = parsed.path.gsub(URIREGEX[:double_slash_outside_scheme], '/')
    parsed.path = parsed.path.chomp('/') unless parsed.path.size == 1

    query = parsed.query
    parsed.query = nil if query && query.empty?

    parsed.fragment = nil
  end
end
parse(uri, opts = {}) click to toggle source
# File lib/postrank-uri.rb, line 198
# Coerces +uri+ into an Addressable::URI.
#
# uri  - a String (or other input Addressable::URI.parse accepts). An
#        Addressable::URI instance is returned as-is, without normalizing.
# opts - options hash; :host supplies the host for host-less input.
#
# When the parsed URI has no host and its scheme is not exactly
# javascript, mailto or xmpp:
# * if a scheme was detected, the input is re-parsed with "http://"
#   prepended;
# * otherwise the host comes from opts[:host], or from the first path
#   segment when that segment matches URIREGEX[:valid_domain].
#
# Returns the result of Addressable::URI#normalize!, with the scheme
# defaulted to "http" whenever a host is present.
def parse(uri, opts = {})
  return uri if uri.is_a? Addressable::URI

  uri = Addressable::URI.parse(uri)

  # The alternation is grouped and anchored to the whole string. Ungrouped,
  # /^javascript|mailto|xmpp$/ also matches schemes that merely start with
  # "javascript", contain "mailto", or end in "xmpp".
  if !uri.host && uri.scheme !~ /\A(?:javascript|mailto|xmpp)\z/
    if uri.scheme
      # A scheme but no host: the parser mis-split the input, so retry
      # with an explicit http:// prefix.
      return parse("http://#{uri}", opts)
    end

    if opts[:host]
      uri.host = opts[:host]
    else
      # Try to promote the leading path segment to the host.
      parts = uri.path.to_s.split(/[\/:]/)
      if parts.first =~ URIREGEX[:valid_domain]
        host = parts.shift
        uri.path = '/' + parts.join('/')
        uri.host = host
      end
    end
  end

  uri.scheme = 'http' if uri.host && !uri.scheme
  uri.normalize!
end
unescape(uri) click to toggle source
# File lib/postrank-uri.rb, line 133
# Decodes percent-escapes in +uri+ and turns "+" in the query into spaces.
#
# uri - anything parse accepts. The query is rewritten on the object parse
#       returns, before that object is converted to a String.
#
# Returns a String in which every match of URIREGEX[:unescape] is replaced
# by its decoded bytes, except matches that contain a sequence matching
# URIREGEX[:reserved_characters], which are left encoded.
def unescape(uri)
  parsed = parse(uri)
  parsed.query = parsed.query.tr('+', ' ') if parsed.query

  parsed.to_s.gsub(URIREGEX[:unescape]) do |encoded|
    if encoded.match(URIREGEX[:reserved_characters])
      encoded
    else
      # Strip the "%" signs and pack the remaining hex digits into bytes.
      [encoded.delete('%')].pack('H*')
    end
  end
end
valid?(uri) click to toggle source
# File lib/postrank-uri.rb, line 225
# Reports whether +uri+ is a usable URI with a recognized public suffix.
#
# uri - the URI to check; nil is never valid.
#
# Returns false for nil or when the cleaned URI has no host; otherwise
# returns the result of PublicSuffix.valid? for the host after conversion
# with Addressable::IDNA.to_unicode.
def valid?(uri)
  return false if uri.nil?

  host = clean(uri, :raw => true).host
  return false unless host

  PublicSuffix.valid?(Addressable::IDNA.to_unicode(host), default_rule: nil)
end