class Scylla::Generator

Constants

NONLATIN

Attributes

delimiter[RW]
dirlm[RW]
dirtext[RW]
minsize[RW]

Public Class Methods

new(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false, delimiter = "[[classifier_delimiter]]") click to toggle source

dirtext: The location of the source training text files. minsize: The minimum size of the ngrams that you would like to store.

# File lib/scylla/generator.rb, line 12
# Builds a generator and records where training text is read from, where
# language maps are written to, the n-gram size threshold and the marker
# string used by #clean to truncate input.
#
# dirtext   - directory holding the source training text files
# dirlm     - directory that receives the generated .lm files
# minsize   - n-gram size threshold used when building language maps
# silent    - accepted for interface compatibility; not stored by this method
# delimiter - marker string; #clean discards everything from it onwards
def initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false, delimiter = "[[classifier_delimiter]]")
  @dirtext, @dirlm = dirtext, dirlm
  @minsize, @delimiter = minsize, delimiter
end

Public Instance Methods

clean(string) click to toggle source
# File lib/scylla/generator.rb, line 83
# Normalises a piece of raw text before n-gram extraction.
#
# Steps, in order:
#   1. drop everything from the first occurrence of @delimiter onwards
#   2. pass the text through Sanitize.clean (presumably strips HTML markup),
#      decode HTML entities and lowercase it
#   3. remove URLs and e-mail addresses
#   4. replace punctuation, digits and line breaks with spaces
#   5. when the text contains both a-z letters and other letters, and the
#      other letters number more than half the a-z letters, remove all
#      ASCII letters
#   6. trim and collapse whitespace runs into single spaces
#
# string - the raw text to normalise
#
# Returns the normalised String.
def clean(string)
  cut = string.index(@delimiter)
  string = string[0, cut] if cut

  text = Unicode::downcase(CGI.unescapeHTML(Sanitize.clean(string)))
  text = text
    .gsub(/(?:http|https):\/\/[a-z0-9]+(?:[\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(?:(?::[0-9]{1,5})?\/[^\s]*)?/, "")
    .gsub(/[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}/, "")
    .gsub(/[\*\^><!\"#\$%&\'\(\)\*\+:;,._\/=\?@\{\}\[\]|\-\n\r0-9]/," ")

  latin_count = text.scan(/[a-z]/).size
  other_count = text.scan(/[\p{L}&&[^a-z]]/).size
  # Mixed-script text dominated by non-Latin letters: the Latin part is treated as noise.
  mostly_nonlatin = latin_count > 0 && other_count > 0 && other_count / (latin_count * 1.0) > 0.5
  text = text.gsub(/[a-zA-Z]/, "") if mostly_nonlatin

  text.strip.split(" ").join(" ")
end
create_lm(input, frequencies = false) click to toggle source

Creates a language map for a given input string. The frequencies boolean specifies whether the method should return the frequencies of the ngrams, or simply an array of ngrams in sorted order.

# File lib/scylla/generator.rb, line 100
# Builds a language map (n-gram ranking) from the given text.
#
# The input is normalised with #clean and split into words on digits,
# whitespace and square brackets. Each word is padded with "_" on both sides
# and every 1-, 2- and 3-character slice that fits inside the padded word is
# counted. N-grams whose length is less than or equal to @minsize are then
# discarded and the remainder is ordered by descending count.
#
# input       - the raw text to analyse
# frequencies - when true, return [ngram, count] pairs; otherwise only the
#               n-grams, in ranked order
#
# Returns an Array of [String, Integer] pairs, or an Array of Strings.
def create_lm(input, frequencies = false)
  counts = {}
  clean(input).split(/[\d\s\[\]]/).each do |token|
    padded = "_" + token + "_"
    (0..padded.size).each do |start|
      remaining = padded.size - start
      1.upto(3) do |width|
        gram = padded[start, width]
        # A slice cut short by the end of the word is registered but not counted.
        counts[gram] = counts.fetch(gram, 0) + (remaining >= width ? 1 : 0)
      end
    end
  end

  counts.delete_if { |gram, _| gram.size <= @minsize }
  ranked = counts.sort { |left, right| right[1] <=> left[1] }
  frequencies ? ranked : ranked.map { |pair| pair[0] }
end
get_wiki(locale,article) click to toggle source
# File lib/scylla/generator.rb, line 47
# Fetches a Wikipedia article for the given locale and reduces it to
# normalised plain text.
#
# The Wikipedia client is pointed at "<locale>.wikipedia.org", the article
# name is printed, and the text of the first revision of the first returned
# page is taken. Characters with an invalid encoding are dropped, wiki markup
# enclosed in braces or square brackets is removed, and the result is passed
# through Sanitize.clean. For locales listed in NONLATIN all ASCII letters
# are removed as well. The text is finally normalised with #clean.
#
# locale  - Wikipedia subdomain, e.g. a language code
# article - title of the article to fetch
#
# Returns the cleaned article text as a String.
def get_wiki(locale,article)
  Wikipedia.Configure do
    domain "#{locale}.wikipedia.org"
    path   'w/api.php'
  end
  p article

  first_page = Wikipedia.find(article).raw_data['query']['pages'].values.first
  markup = first_page['revisions'].first.fetch('*').force_encoding("UTF-8")
  text = markup.each_char.select(&:valid_encoding?).join

  # Applied in this order: single-line templates first, then the multi-line variants.
  markup_patterns = [
    /\{\{(.*?)\}\}/,
    /\[\[(.+?)\]\]/m,
    /\{\{(.+?)\}\}/m,
    /\{(.+?)\}/m,
    /\[(.+?)\]/m
  ]
  text = markup_patterns.inject(text) { |current, pattern| current.gsub(pattern, "") }

  text = Sanitize.clean(text)
  text = text.gsub(/[a-zA-Z]/,"") if NONLATIN.include?(locale)
  clean(text)
end
get_wikis() click to toggle source
# File lib/scylla/generator.rb, line 36
# Downloads the training article for every configured locale and stores it
# as "<key>.txt" inside @dirtext, replacing any file already there.
#
# Each entry of Scylla::Resources.locales is read as
# key => [wikipedia locale, article title].
#
# Returns the locales collection that was iterated.
def get_wikis
  # Loaded here rather than at file level, so the gem is only needed when fetching.
  require 'wikipedia'
  Scylla::Resources.locales.each do |key, value|
    text = get_wiki(value[0], value[1])
    textname = File.join(@dirtext, "#{key}.txt")
    # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
    File.delete(textname) if File.exist?(textname)
    File.open(textname, 'w') { |f| f.write(text) }
  end
end
train() click to toggle source

Loads all the .txt files in the specified source training text folder

and creates language maps using ngram frequencies. The maps are stored in lib/scylla/lms as .lm files

# File lib/scylla/generator.rb, line 22
# Regenerates every language map.
#
# Existing .lm files in @dirlm are deleted, fresh training text is fetched
# with #get_wikis, and for each locale key the matching "<key>.txt" file in
# @dirtext is read, handed to #write_lm and then deleted.
#
# Returns the locales collection that was iterated.
def train
  Dir.glob(File.join(@dirlm, "*.lm")).each { |stale| File.delete(stale) }
  locales = Scylla::Resources.locales
  get_wikis
  locales.each do |key, _value|
    path = File.join(@dirtext, "#{key}.txt")
    # Every line is prefixed with a space. The block form closes the file,
    # so no handle is still open when the file is deleted below.
    text = "".dup
    File.foreach(path) { |line| text << " " << line }
    write_lm(text, key)
    File.delete(path)
  end
end
write_lm(text, language) click to toggle source

Builds a language map from the given text and writes it as a <language>.lm file in the dirlm directory

# File lib/scylla/generator.rb, line 68
# Builds the language map for the given text and writes it to
# "<language>.lm" inside @dirlm, replacing any file already there.
#
# Only the first 400 entries returned by #create_lm are written, one per
# line, as "<ngram><TAB><count>".
#
# text     - training text for the language
# language - name used for the output file and the progress message
def write_lm(text, language)
  p "Creating language map for #{language}"
  lm = create_lm(text, true)
  lmname = File.join(@dirlm, "#{language}.lm")
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  File.delete(lmname) if File.exist?(lmname)
  File.open(lmname, 'w') do |f|
    lm.first(400).each do |freq|
      f.write(freq[0] + "\t" + freq[1].to_s + "\n")
    end
  end
end