class AIPP::PDF

PDF to text reader with support for pages and fencing

@example

pdf = AIPP::PDF.new("/path/to/file.pdf")
pdf.file   # => #<Pathname:/path/to/file.pdf>
pdf.from(100).to(200).each_line_with_position do |line, page, last|
  line   # => line content (e.g. "first line")
  page   # => page number (e.g. 1)
  last   # => last line boolean (true for last line, false otherwise)
end

Attributes

file[R]

Public Class Methods

new(file, cache: true) click to toggle source
   # File lib/aipp/pdf.rb
# Load the text and page end offsets of a PDF file and fence the whole text.
#
# @param file [Pathname, String] path to the PDF file; anything that is not
#   already a Pathname is converted with +Pathname()+
# @param cache [Boolean] true to load via +read_cache+, false to call
#   +read+ directly
def initialize(file, cache: true)
  @file = if file.is_a?(Pathname)
    file
  else
    Pathname(file)
  end
  loaded = cache ? read_cache : read
  @text, @page_ranges = loaded
  # The fence initially covers the text from its first to its last index.
  @last = @text.length - 1
  @from = 0
  @to = @last
end

Public Instance Methods

each()
Alias for: each_line
each_line() { |line, page, last| ... } click to toggle source

Executes the block for every line and passes the line content, page number and end of document boolean.

If no block is given, an enumerator is returned instead.

@yieldparam line [String] content of the line
@yieldparam page [Integer] page number the line is found on within the PDF
@yieldparam last [Boolean] true for the last line, false otherwise
@return [Enumerator]

   # File lib/aipp/pdf.rb
# Execute the block for every line of the fenced text.
#
# If no block is given, an enumerator is returned instead.
#
# @yieldparam line [String] content of the line
# @yieldparam page [Integer] result of +page_for+ for the offset at which
#   the line starts
# @yieldparam last [Boolean] true for the last line, false otherwise
# @return [Enumerator] when called without a block
def each_line
  return enum_for(:each) unless block_given?
  # +lines+ re-splits the fenced text on every call, so split only once.
  all_lines = lines
  last_line_index = all_lines.length - 1
  # Offsets are counted from the start of the fence.
  offset = @from
  all_lines.each_with_index do |line, line_index|
    yield(line, page_for(index: offset), line_index == last_line_index)
    offset += line.length
  end
end
Also aliased as: each
from(index) click to toggle source

Fence the PDF beginning with this index

@param index [Integer, Symbol] either an integer position within the +text+ string or +:begin+ to indicate "first existing position"

@return [self]

   # File lib/aipp/pdf.rb
# Move the beginning of the fence to the given index.
#
# @param index [Integer, Symbol] position to start at, or +:begin+ for
#   position 0
# @return [self]
# @raise [ArgumentError] if the position is not within 0 and the current
#   end of the fence
def from(index)
  start = index == :begin ? 0 : index
  raise ArgumentError unless (0..@to).include?(start)
  @from = start
  self
end
inspect() click to toggle source

@return [String]

   # File lib/aipp/pdf.rb
24 def inspect
25   %Q(#<#{self.class} file=#{@file} range=#{range}>)
26 end
lines() click to toggle source

Text split to individual lines

@return [Array] lines

   # File lib/aipp/pdf.rb
# Fenced text split into individual lines.
#
# The split happens right after every newline or form feed, so each line
# keeps its terminating character.
#
# @return [Array<String>] lines
def lines
  line_boundary = /(?<=[\n\f])/
  text.split(line_boundary)
end
range() click to toggle source

Get the current fencing range

@return [Range<Integer>]

   # File lib/aipp/pdf.rb
# Current fence as an inclusive range from the begin to the end index.
#
# @return [Range<Integer>]
def range
  Range.new(@from, @to)
end
text() click to toggle source

Text string of the PDF with fencing applied

@return [String] PDF converted to string

   # File lib/aipp/pdf.rb
# Part of the PDF text that lies within the current fence.
#
# @return [String] PDF converted to string, cut down to +range+
def text
  fence = range
  @text.slice(fence)
end
to(index) click to toggle source

Fence the PDF ending with this index

@param index [Integer, Symbol] either an integer position within the +text+ string or +:end+ to indicate "last existing position"

@return [self]

   # File lib/aipp/pdf.rb
# Move the end of the fence to the given index.
#
# @param index [Integer, Symbol] position to end at, or +:end+ for the
#   last position of the unfenced text
# @return [self]
# @raise [ArgumentError] if the position is not within the current begin
#   of the fence and the last position of the unfenced text
def to(index)
  stop = index == :end ? @last : index
  raise ArgumentError unless (@from..@last).include?(stop)
  @to = stop
  self
end

Private Instance Methods

page_for(index:) click to toggle source
    # File lib/aipp/pdf.rb
# Find the page number a position within the unfenced text belongs to.
#
# +@page_ranges+ holds one ascending end index per page; the page is the
# first one whose end index is not smaller than the given position.
#
# @param index [Integer] position within the unfenced text
# @return [Integer] page number, starting at 1
# @raise [IndexError] if the position lies beyond the end index of the
#   last page
def page_for(index:)
  # A single binary search yields the position within the array directly.
  page_index = @page_ranges.bsearch_index { |page_end| page_end >= index }
  fail IndexError, "index #{index} is beyond the last page" unless page_index
  page_index + 1
end
page_ranges_for(pages) click to toggle source
    # File lib/aipp/pdf.rb
# Compute, for every page, the index within the joined text at which the
# page ends.
#
# +read+ joins the page texts with a single "\f", so every entry but the
# last is the index of the form feed following that page; the entry of the
# last page equals the length of the joined text.
#
# @param pages [Array<#text>] pages in document order
# @return [Array<Integer>] ascending end index per page
def page_ranges_for(pages)
  # Start one before the first character so that the first page ends at
  # its own text length.
  page_end = -1
  pages.map do |page|
    # Advance by the page text plus exactly one separator. Adding the
    # page's position in the list instead lets the boundaries drift
    # further out with every additional page.
    page_end += page.text.length + 1
  end
end
read() click to toggle source
   # File lib/aipp/pdf.rb
# Read the PDF with PDF::Reader.
#
# @return [Array] two elements: the text of all pages joined with "\f",
#   and the result of +page_ranges_for+ for those pages
def read
  reader = ::PDF::Reader.new(@file)
  pages = reader.pages
  joined_text = pages.map(&:text).join("\f")
  [joined_text, page_ranges_for(pages)]
end
read_cache() click to toggle source
    # File lib/aipp/pdf.rb
 # Return what +read+ returns, going through a JSON file next to the PDF.
 #
 # The cache file is named after the PDF with ".json" appended. It is used
 # if it exists and its mtime differs from the mtime of the PDF by less
 # than one second. Otherwise +read+ is called, its result is written to
 # the cache file and the mtime of the cache file is set to the mtime of
 # the PDF.
 #
 # NOTE(review): the cache is parsed with JSON.load - confirm the cache file
 # can always be trusted.
 #
 # @return [Object] parsed cache content or the result of +read+
 def read_cache
   cache_file = Pathname.new("#{@file}.json")
   fresh = cache_file.exist? && (@file.stat.mtime - cache_file.stat.mtime).abs < 1
   return JSON.load(cache_file) if fresh
   data = read
   cache_file.write(data.to_json)
   FileUtils.touch(cache_file, mtime: @file.stat.mtime)
   data
 end