class Mitie::Document
Attributes
model[R]
text[R]
Public Class Methods
new(model, text)
click to toggle source
# File lib/mitie/document.rb, line 5 def initialize(model, text) @model = model @text = text end
Private Class Methods
finalize(pointer)
click to toggle source
# File lib/mitie/document.rb, line 105 def self.finalize(pointer) # must use proc instead of stabby lambda proc { FFI.mitie_free(pointer) } end
finalize_ptr(pointer)
click to toggle source
# File lib/mitie/document.rb, line 110 def self.finalize_ptr(pointer) # must use proc instead of stabby lambda proc { FFI.mitie_free(pointer.ptr) } end
Public Instance Methods
entities()
click to toggle source
# File lib/mitie/document.rb, line 34 def entities @entities ||= begin begin entities = [] tokens = tokens_with_offset detections = FFI.mitie_extract_entities(pointer, tokens_ptr) num_detections = FFI.mitie_ner_get_num_detections(detections) num_detections.times do |i| pos = FFI.mitie_ner_get_detection_position(detections, i) len = FFI.mitie_ner_get_detection_length(detections, i) tag = FFI.mitie_ner_get_detection_tagstr(detections, i).to_s score = FFI.mitie_ner_get_detection_score(detections, i) tok = tokens[pos, len] offset = tok[0][1] entity = {} if offset finish = tok[-1][1] + tok[-1][0].bytesize entity[:text] = text.byteslice(offset...finish) else entity[:text] = tok.map(&:first) end entity[:tag] = tag entity[:score] = score entity[:offset] = offset if offset entity[:token_index] = pos entity[:token_length] = len entities << entity end entities ensure FFI.mitie_free(detections) if detections end end end
tokens()
click to toggle source
# File lib/mitie/document.rb, line 10 def tokens @tokens ||= tokens_with_offset.map(&:first) end
tokens_with_offset()
click to toggle source
# File lib/mitie/document.rb, line 14 def tokens_with_offset @tokens_with_offset ||= begin if text.is_a?(Array) # offsets are unknown when given tokens text.map { |v| [v, nil] } else i = 0 tokens = [] loop do token = (tokens_ptr + i * Fiddle::SIZEOF_VOIDP).ptr break if token.null? offset = (offsets_ptr.ptr + i * Fiddle::SIZEOF_LONG).to_s(Fiddle::SIZEOF_LONG).unpack1("L!") tokens << [token.to_s.force_encoding(text.encoding), offset] i += 1 end tokens end end end
Private Instance Methods
offsets_ptr()
click to toggle source
# File lib/mitie/document.rb, line 80 def offsets_ptr tokenize[1] end
pointer()
click to toggle source
# File lib/mitie/document.rb, line 72 def pointer model.pointer end
tokenize()
click to toggle source
# File lib/mitie/document.rb, line 84 def tokenize @tokenize ||= begin if text.is_a?(Array) # malloc uses memset to set all bytes to 0 tokens_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP * (text.size + 1)) text.size.times do |i| tokens_ptr[i * Fiddle::SIZEOF_VOIDP, Fiddle::SIZEOF_VOIDP] = Fiddle::Pointer.to_ptr(text[i]).ref end [tokens_ptr, nil] else offsets_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP) tokens_ptr = FFI.mitie_tokenize_with_offsets(text, offsets_ptr) ObjectSpace.define_finalizer(tokens_ptr, self.class.finalize(tokens_ptr)) ObjectSpace.define_finalizer(offsets_ptr, self.class.finalize_ptr(offsets_ptr)) [tokens_ptr, offsets_ptr] end end end
tokens_ptr()
click to toggle source
# File lib/mitie/document.rb, line 76 def tokens_ptr tokenize[0] end