class Scylla::Classifier

Attributes

input[RW]
limit[RW]
ngrams[RW]
threshold[RW]

Public Class Methods

new(limit = 10, ngrams = 400, threshold = 1.04) click to toggle source

limit: The maximum number of matching language results to display.
ngrams: The total number of ngrams that are stored for each language.
threshold: The threshold score for matches.

# File lib/scylla/classifier.rb, line 8
# Builds a classifier and stores its three tuning values.
#
# limit     - maximum number of matching language results (default 10)
# ngrams    - number of ngrams per language; also read by get_score (default 400)
# threshold - multiplier applied to the best score in classify (default 1.04)
def initialize(limit = 10, ngrams = 400, threshold = 1.04)
  @limit, @ngrams, @threshold = limit, ngrams, threshold
end

Public Instance Methods

classify() click to toggle source

Classifies @input to a list of languages in order of best match

# File lib/scylla/classifier.rb, line 28
# Classifies @input against every loaded language profile.
#
# Each language from Scylla::Loader.languages is scored with get_score
# against the profile that Scylla::Generator#create_lm builds from @input.
# Scores are sorted ascending, so the lowest score is treated as the best
# match.
#
# Returns an Array of language keys: the best match first, followed by each
# subsequent language whose score is strictly below @threshold times the best
# score. Collection stops at the first language that misses that cutoff.
# Returns nil (after printing a hint to stdout) when no languages are loaded.
def classify
  languages = Scylla::Loader.languages
  if languages.empty?
    puts "No languages (.lm files) found in #{Scylla::Loader.dir}. " \
         "Please run rake scylla:train after placing your training texts in the source_texts directory."
    return
  end

  unknown = Scylla::Generator.new.create_lm(@input)
  ranked = languages.each_key.map { |name| [name, get_score(unknown, languages[name])] }
  ranked = ranked.sort { |a, b| a[1] <=> b[1] }

  best_name, best_score = ranked.first
  cutoff = @threshold * best_score

  # The best match is always reported; runners-up only while they stay under the cutoff.
  runners_up = ranked.drop(1).take_while { |_name, score| score < cutoff }
  [best_name] + runners_up.map { |name, _score| name }
end
classify_file(path) click to toggle source

Classifies a file to a list of languages in order of best match

# File lib/scylla/classifier.rb, line 21
# Classifies the contents of the file at +path+.
#
# Every line is stripped of surrounding whitespace and appended to @input
# preceded by a single space, so the text handed to classify is one
# space-separated string (with a leading space when the file is non-empty).
#
# Returns whatever classify returns.
def classify_file(path)
  # String.new yields a mutable copy, so in-place appends are safe and
  # the text is built in linear time instead of re-allocating per line.
  @input = String.new("")
  File.foreach(path) { |line| @input << " " << line.strip }
  classify
end
classify_string(text) click to toggle source

Classifies a string to a list of languages in order of best match

# File lib/scylla/classifier.rb, line 15
# Classifies +text+ directly.
#
# Stores +text+ in @input unchanged and delegates to classify.
#
# Returns whatever classify returns.
def classify_string(text)
  @input = text
  classify
end
get_score(unknown, ngram) click to toggle source

Gets the score of the text in question compared to a particular language

# File lib/scylla/classifier.rb, line 51
# Scores the unknown text's profile against one language profile.
#
# unknown - indexable collection of ngrams; the position of an ngram is
#           used as its rank
# ngram   - language profile looked up by ngram; a hit yields that ngram's
#           rank in the language, a miss yields nil/false
#
# For each of the first @ngrams entries of +unknown+, adds the absolute
# difference between its position and its rank in +ngram+, or @ngrams when
# the language profile does not contain it. Smaller totals mean the two
# profiles agree more closely.
#
# Returns the accumulated score.
def get_score(unknown, ngram)
  # Cap by the configured profile size so the cap and the miss penalty agree.
  examined = [unknown.size, @ngrams].min

  (0...examined).inject(0) do |total, rank|
    known_rank = ngram[unknown[rank]]
    total + (known_rank ? (known_rank - rank).abs : @ngrams)
  end
end