class RelatonNist::DataFetcher

Constants

RELATION_TYPES
URL

Public Class Methods

fetch(output: "data", format: "yaml") click to toggle source

Fetch all the documents from the dataset

@param [String] output folder name to save the documents @param [String] format format to save the documents

# File lib/relaton_nist/data_fetcher.rb, line 217
# Convenience entry point: builds a fetcher and runs it.
#
# @param output [String] folder name passed to the new instance
# @param format [String] format name passed to the new instance
# @return whatever the instance-level #fetch returns
def self.fetch(output: "data", format: "yaml")
  fetcher = new(output, format)
  fetcher.fetch
end
new(output, format) click to toggle source
# File lib/relaton_nist/data_fetcher.rb, line 23
# Stores the settings used by the rest of the fetcher.
#
# @param output [String] kept in @output (directory used when writing files)
# @param format [String] kept in @format (file extension / serialization)
def initialize(output, format)
  @output, @format = output, format
end

Public Instance Methods

fetch() click to toggle source

Fetch all the documents from the dataset

# File lib/relaton_nist/data_fetcher.rb, line 194
# Download the dataset at URL, parse it as XML and hand every
# report-paper_metadata element to #parse_doc.
#
# Before parsing, the output directory is created if needed and files in it
# matching "*.<format>" are removed. Start time, stop time and elapsed seconds
# are printed to stdout. Any StandardError raised along the way is reported
# with `warn` and not re-raised.
def fetch # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  t1 = Time.now
  puts "Started at: #{t1}"

  docs = Nokogiri::XML OpenURI.open_uri URL
  # mkdir_p also creates missing parent directories and is a no-op when the
  # directory already exists, so no separate existence check is needed.
  FileUtils.mkdir_p @output
  FileUtils.rm Dir[File.join(@output, "*.#{@format}")]
  docs.xpath("/body/query/doi_record/report-paper/report-paper_metadata")
    .each { |doc| parse_doc doc }

  t2 = Time.now
  puts "Stopped at: #{t2}"
  puts "Done in: #{(t2 - t1).round} sec."
rescue StandardError => e
  warn e.message
end
fetch_abstract(doc) click to toggle source

@param doc [Nokogiri::XML::Element] @return [Array<RelatonBib::FormattedString>]

# File lib/relaton_nist/data_fetcher.rb, line 103
# Collect the JATS abstract paragraphs of a document.
#
# @param doc [Nokogiri::XML::Element]
# @return [Array<RelatonBib::FormattedString>] one entry per jats:p found
#   under jats:abstract, using the document's "language" attribute
def fetch_abstract(doc)
  jats_uri = "http://www.ncbi.nlm.nih.gov/JATS1"
  paragraphs = doc.xpath("jats:abstract/jats:p", "jats" => jats_uri)
  paragraphs.map do |paragraph|
    RelatonBib::FormattedString.new(
      content: paragraph.text, language: doc["language"], script: "Latn",
    )
  end
end
fetch_contributor(doc) click to toggle source

@param doc [Nokogiri::XML::Element] @return [Array<Hash>]

# File lib/relaton_nist/data_fetcher.rb, line 111
# Build the contributor list: one person entry per contributors/person_name
# element followed by one organization entry per publisher element.
#
# Each whitespace-separated token of given_name that is a single word
# character, optionally followed by a period, is stored as an initial (without
# the period); every other token is stored as a forename.
#
# @param doc [Nokogiri::XML::Element]
# @return [Array<Hash>] hashes of the form { entity:, role: [{ type: }] }
def fetch_contributor(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  lang = doc["language"]
  persons = doc.xpath("contributors/person_name").map do |person|
    forename = []
    initial = []
    person.at("given_name")&.text&.split&.each do |part|
      # The regexp literal must stay on the left of =~ so that the named
      # capture `init` becomes a local variable.
      if /^(?<init>\w)\.?$/ =~ part
        initial << RelatonBib::LocalizedString.new(init, lang, "Latn")
      else
        forename << RelatonBib::LocalizedString.new(part, lang, "Latn")
      end
    end
    surname = RelatonBib::LocalizedString.new person.at("surname").text, lang, "Latn"
    ident = person.xpath("ORCID").map do |orcid|
      RelatonBib::PersonIdentifier.new "orcid", orcid.text
    end
    # `initial` is passed on as collected above; it must not be reset here.
    fullname = RelatonBib::FullName.new(
      surname: surname, forename: forename, initial: initial, identifier: ident,
    )
    entity = RelatonBib::Person.new name: fullname
    { entity: entity, role: [{ type: person["contributor_role"] }] }
  end

  publishers = doc.xpath("publisher").map do |publisher|
    abbr = publisher.at("../institution/institution_acronym")&.text
    org = RelatonBib::Organization.new(
      name: publisher.at("publisher_name").text, abbreviation: abbr,
    )
    { entity: org, role: [{ type: "publisher" }] }
  end

  persons + publishers
end
fetch_date(doc) click to toggle source

@param doc [Nokogiri::XML::Element] @return [Array<RelatonBib::BibliographicDate>]

# File lib/relaton_nist/data_fetcher.rb, line 60
# Convert publication_date / approval_date elements into bibliographic dates.
#
# The date string is "year", "year-month" or "year-month-day"; the day is
# only consulted when a month element is present. publication_date maps to
# type "published", anything else to "confirmed".
#
# @param doc [Nokogiri::XML::Element]
# @return [Array<RelatonBib::BibliographicDate>]
def fetch_date(doc)
  doc.xpath("publication_date|approval_date").map do |node|
    parts = [node.at("year").text]
    month = node.at("month")
    if month
      parts << month.text
      day = node.at("day")
      parts << day.text if day
    end
    kind = node.name == "publication_date" ? "published" : "confirmed"
    RelatonBib::BibliographicDate.new(type: kind, on: parts.join("-"))
  end
end
fetch_docid(doc) click to toggle source

@param doc [Nokogiri::XML::Element] @return [Array<RelatonBib::DocumentIdentifier>]

# File lib/relaton_nist/data_fetcher.rb, line 43
# Wrap each identifier hash produced by #parse_docid in a
# RelatonBib::DocumentIdentifier.
#
# @param doc [Nokogiri::XML::Element]
# @return [Array<RelatonBib::DocumentIdentifier>]
def fetch_docid(doc)
  identifiers = parse_docid(doc)
  identifiers.map do |entry|
    RelatonBib::DocumentIdentifier.new(type: entry[:type], id: entry[:id])
  end
end
fetch_edition(doc) click to toggle source

@param doc [Nokogiri::XML::Element] @return [String]

# File lib/relaton_nist/data_fetcher.rb, line 75
# Read the text of the edition_number element.
#
# @param doc [Nokogiri::XML::Element]
# @return [String, nil] nil when no edition_number element is found
def fetch_edition(doc)
  node = doc.at("edition_number")
  return nil if node.nil?

  node.text
end
fetch_place(doc) click to toggle source

@param doc [Nokogiri::XML::Element] @return [Array<String>]

# File lib/relaton_nist/data_fetcher.rb, line 143
# List the text of every institution/institution_place element.
#
# @param doc [Nokogiri::XML::Element]
# @return [Array<String>]
def fetch_place(doc)
  places = doc.xpath("institution/institution_place")
  places.map { |place| place.text }
end
fetch_relation(doc) click to toggle source

@param doc [Nokogiri::XML::Element] @return [Array<Hash>]

# File lib/relaton_nist/data_fetcher.rb, line 81
# Build relation hashes from the Crossref relations program of a document.
#
# For every related_item, the intra_work_relation or inter_work_relation
# element supplies the reference text (used as the formatted reference) and
# the "relationship-type" attribute, which is looked up in RELATION_TYPES.
#
# @param doc [Nokogiri::XML::Element]
# @return [Array<Hash>] hashes of the form { type:, bibitem: }
def fetch_relation(doc)
  uri = "http://www.crossref.org/relations.xsd"
  related_items = doc.xpath("./ns:program/ns:related_item", ns: uri)
  related_items.map do |item|
    target = item.at_xpath("ns:intra_work_relation|ns:inter_work_relation", ns: uri)
    reference = RelatonBib::FormattedRef.new(content: target.text)
    {
      type: RELATION_TYPES[target["relationship-type"]],
      bibitem: RelatonBib::BibliographicItem.new(formattedref: reference),
    }
  end
end
fetch_title(doc) click to toggle source

@param doc [Nokogiri::XML::Element] @return [RelatonBib::TypedTitleStringCollection, Array]

# File lib/relaton_nist/data_fetcher.rb, line 51
# Join the text of all title and subtitle elements with single spaces and
# turn the result into typed titles (language "en", script "Latn").
#
# @param doc [Nokogiri::XML::Element]
# @return [RelatonBib::TypedTitleStringCollection, Array] an empty array when
#   the document has neither title nor subtitle
def fetch_title(doc)
  nodes = doc.xpath("titles/title|titles/subtitle")
  if nodes.any?
    combined = nodes.map { |node| node.text }.join(" ")
    RelatonBib::TypedTitleString.from_string(combined, "en", "Latn")
  else
    []
  end
end
parse_doc(doc) click to toggle source

Create a document instance and save it.

@param doc [Nokogiri::XML::Element]

@raise [StandardError]

# File lib/relaton_nist/data_fetcher.rb, line 174
# Build a NistBibliographicItem from one metadata element and write it out
# with #write_file.
#
# On failure the document's DOI (when one can be found) and the error message
# are reported with `warn`, and the original error is re-raised.
#
# @param doc [Nokogiri::XML::Element]
# @raise [StandardError] whatever building or writing the item raised
def parse_doc(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
  item = RelatonNist::NistBibliographicItem.new(
    type: "standard", docid: fetch_docid(doc), title: fetch_title(doc),
    link: fetch_link(doc), abstract: fetch_abstract(doc),
    date: fetch_date(doc), edition: fetch_edition(doc),
    contributor: fetch_contributor(doc), relation: fetch_relation(doc),
    place: fetch_place(doc),
    language: [doc["language"]], script: ["Latn"], doctype: "standard"
  )
  write_file item
rescue StandardError => e
  # Safe navigation: a record without a doi element must not raise inside
  # this handler, which would hide the error being reported.
  warn "Document: #{doc.at('doi')&.text}"
  warn e.message
  raise
end
parse_docid(doc) click to toggle source
# File lib/relaton_nist/data_fetcher.rb, line 28
# Extract the NIST identifier and the DOI of a document.
#
# The NIST identifier is the text of publisher_item/item_number or
# publisher_item/identifier with a leading "/" removed. For a handful of DOIs
# a fixed substitution is applied to the identifier (first match only).
#
# @param doc [Nokogiri::XML::Element]
# @return [Array<Hash>] [{ type: "NIST", id: }, { type: "DOI", id: }]
def parse_docid(doc)
  doi = doc.at("doi_data/doi").text
  raw = doc.at("publisher_item/item_number", "publisher_item/identifier").text
  id = raw.sub(%r{^/}, "")

  # DOI => [pattern, replacement] applied to the identifier.
  corrections = {
    "10.6028/NBS.CIRC.12e2revjune" => ["13e", "12e"],
    "10.6028/NBS.CIRC.36e2" => ["46e", "36e"],
    "10.6028/NBS.HB.67suppJune1967" => ["1965", "1967"],
    "10.6028/NBS.HB.105-1r1990" => ["105-1-1990", "105-1r1990"],
    "10.6028/NIST.HB.150-10-1995" => [/150-10$/, "150-10-1995"],
  }
  pattern, replacement = corrections[doi]
  id = id.sub(pattern, replacement) if pattern

  [{ type: "NIST", id: id }, { type: "DOI", id: doi }]
end
write_file(bib) click to toggle source

Save document

@param bib [RelatonNist::NistBibliographicItem]

# File lib/relaton_nist/data_fetcher.rb, line 152
# Serialize a bibliographic item into the output directory.
#
# The file name is the first document identifier with "/", whitespace, ":"
# and "." replaced by "_", upcased, and a leading "NIST_IR" turned into
# "NISTIR"; the extension is the configured format. An existing file is left
# untouched and reported with `warn`. "yaml" writes the item's hash as YAML,
# "xml" writes `to_xml bibdata: true`.
#
# @param bib [RelatonNist::NistBibliographicItem]
def write_file(bib) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  basename = bib.docidentifier[0].id.gsub(%r{[/\s:.]}, "_").upcase.sub(/^NIST_IR/, "NISTIR")
  path = File.join(@output, "#{basename}.#{@format}")
  if File.exist? path
    warn "File #{path} exists. Docid: #{bib.docidentifier[0].id}"
    return
  end

  content = if @format == "yaml"
              bib.to_hash.to_yaml
            elsif @format == "xml"
              bib.to_xml bibdata: true
            end
  File.write path, content, encoding: "UTF-8"
end