class LevenshteinComparator
Constants
- ASCII_REGEXP_MAPPING
- STOP_WORDS
Attributes
cleanified_strings[RW]
Public Class Methods
clean(s)
click to toggle source
# File lib/levenshtein_comparator.rb, line 125
# Normalizes a raw string by running it through the class-level cleaning
# steps in this order: decode_html_entities, remove_parenthesis,
# remove_featuring, unaccent. Returns the result of the last step.
def self.clean(s)
  decoded = self.decode_html_entities(s)
  without_brackets = self.remove_parenthesis(decoded)
  without_featuring = self.remove_featuring(without_brackets)
  self.unaccent(without_featuring)
end
decode_html_entities(s)
click to toggle source
# File lib/levenshtein_comparator.rb, line 101
# Returns +s+ as decoded by a freshly created HTMLEntities decoder.
def self.decode_html_entities(s)
  decoder = HTMLEntities.new
  decoder.decode(s)
end
new(s)
click to toggle source
# File lib/levenshtein_comparator.rb, line 76
# Builds a comparator for the string +s+: the string is converted with the
# class-level to_array and the result is stored in cleanified_strings.
def initialize(s)
  words = self.class.to_array(s)
  self.cleanified_strings = words
end
remove_featuring(s)
click to toggle source
# File lib/levenshtein_comparator.rb, line 85
# Drops a trailing "feat. ..." / "featuring ..." credit from +s+ and strips
# surrounding whitespace from what is left.
#
# Everything from the marker ("feat." or "featuring", with a lower- or
# upper-case initial F, followed by a space) to the end of the string is
# removed. The marker must start at a word boundary so that a word which
# merely ends in "feat." (e.g. "Defeat. ") is left untouched.
#
# Returns a new string; +s+ itself is not modified.
def self.remove_featuring(s)
  s.gsub(/\b[fF]eat(?:\.|uring) .*/, '').strip
end
remove_parenthesis(s)
click to toggle source
# File lib/levenshtein_comparator.rb, line 80
# Removes every parenthesized or square-bracketed group from +s+ and strips
# surrounding whitespace from what is left.
#
# Each group is removed on its own, so text standing between two groups is
# kept ("(Intro) Song [Remix]" gives "Song"). Nested groups are peeled from
# the inside out. As before, an opener may be closed by either kind of
# closer.
#
# Returns a new string; +s+ itself is not modified.
def self.remove_parenthesis(s)
  res = s.dup
  # The pattern only matches groups with no bracket inside them, i.e. the
  # innermost ones. gsub! returns nil once nothing was replaced, which ends
  # the loop after the outermost level has been removed.
  nil while res.gsub!(/[(\[][^()\[\]]*[)\]]/, '')
  res.strip
end
remove_stop_words(a)
click to toggle source
# File lib/levenshtein_comparator.rb, line 105
# Returns a new array holding the elements of +a+ that are not listed in
# STOP_WORDS. +a+ itself is left unchanged.
def self.remove_stop_words(a)
  a.reject { |word| STOP_WORDS.include?(word) }
end
to_array(s)
click to toggle source
Cuts the string into an array of words. Two words separated by a dash (-) are considered to be 1 word if the first or the second word is only 1 character long, and 2 words otherwise.
# File lib/levenshtein_comparator.rb, line 113
# Cuts the string +s+ into an array of normalized words.
#
# Steps:
# 1. +s+ is passed through clean.
# 2. A dash joining two words of at least 2 characters each becomes a
#    space, so both sides count as separate words. When either side is a
#    single character the dash stays and the two sides end up as one word.
# 3. The text is split on whitespace; every character outside A-Z, a-z and
#    0-9 is removed from each word and the word is downcased.
# 4. Words shorter than 2 characters are dropped unless they contain a digit.
# 5. The result is passed through remove_stop_words and returned.
def self.to_array(s)
  cleaned = self.clean(s)

  # The right-hand word is only looked at (lookahead), not consumed, so it
  # can serve as the left-hand word of the next dash. This lets chains such
  # as "rock-and-roll" split at every dash instead of only the first one.
  spaced = cleaned.gsub(/\b(\w{2,})-(?=\w{2,}\b)/, '\1 ')

  words = spaced.split.map { |w| w.gsub(/[^A-Za-z0-9]/, '').downcase }
  words = words.reject { |w| w.length < 2 && w !~ /\d/ }

  self.remove_stop_words(words)
end
unaccent(s)
click to toggle source
# File lib/levenshtein_comparator.rb, line 97
# Non-destructive counterpart of unaccent!: works on a duplicate of +s+ and
# returns whatever unaccent! returns for that duplicate.
def self.unaccent(s)
  copy = s.dup
  self.unaccent!(copy)
end
unaccent!(s)
click to toggle source
# File lib/levenshtein_comparator.rb, line 90
# Applies every entry of ASCII_REGEXP_MAPPING to +s+ in place: each entry's
# first element is the pattern to look for, the second its replacement.
# Returns the same (mutated) string object.
def self.unaccent!(s)
  ASCII_REGEXP_MAPPING.each_with_object(s) do |(pattern, replacement), str|
    str.gsub!(pattern, replacement)
  end
end
Public Instance Methods
compare(pattern)
click to toggle source
# File lib/levenshtein_comparator.rb, line 135
# Checks the guess +pattern+ against the words still stored in
# cleanified_strings and removes every stored word that the guess matches.
#
# +pattern+ is first converted with the class-level to_array. A stored word
# and a guessed word match when:
# * the stored word contains a digit, or either word is at most 2
#   characters long: the two are equal;
# * both words are longer than 4 characters: their Levenshtein distance is
#   at most 2;
# * otherwise (both longer than 2 characters): their Levenshtein distance is
#   at most 1.
#
# Each guessed word can account for only one stored word.
#
# Note that cleanified_strings is modified in place, so matched words stay
# removed for later calls.
#
# Returns :ko when no stored word was removed by this call, :ok when at
# least one was removed and none are left, and :almost when at least one was
# removed but some are left.
def compare(pattern)
  guesses = self.class.to_array(pattern)
  initial_count = cleanified_strings.size

  cleanified_strings.delete_if do |word|
    hit = guesses.find do |guess|
      shortest = [guess.length, word.length].min

      if word =~ /\d+/ || shortest <= 2
        guess == word
      elsif shortest > 4
        Levenshtein.distance(guess, word) <= 2
      else
        Levenshtein.distance(guess, word) <= 1
      end
    end

    # Use up the guessed word (first occurrence only) so it cannot match a
    # second stored word. The removed element is truthy, which tells
    # delete_if to drop the stored word; nil keeps it.
    guesses.delete_at(guesses.index(hit)) if hit
  end

  return :ko if initial_count == cleanified_strings.size

  cleanified_strings.empty? ? :ok : :almost
end