#!/usr/bin/ruby
# frozen_string_literal: true

# Streams the records of a data file to a block, one Hash per record, keyed by
# lower-cased field name.
#
# The format is chosen from the file name: "*.xml" is read with LibXML's pull
# reader, "*.txt" is read as comma-separated text whose first row is the heading.
class ParseVolc1
  # Patterns applied to the pieces of a line after a plain split on commas.
  # Inside each of them a doubled quote ("") stands for one literal quote.
  CSV_QUOTED = /\A"((?:[^"]|"")*)"\z/  # whole cell, quoted
  CSV_PLAIN = /\A((?:[^"]|"")*)\z/     # whole unquoted cell, or the middle of an open quoted cell
  CSV_OPENING = /\A"((?:[^"]|"")*)\z/  # start of a quoted cell that contains a comma or line break
  CSV_CLOSING = /\A((?:[^"]|"")*)"\z/  # end of an open quoted cell

  # Opens +input+ and selects the reader matching its suffix.
  #
  # @param input [String] path ending in ".xml" or ".txt" (case-insensitive)
  # @raise [RuntimeError] when the suffix is neither of those
  def initialize(input)
    @fp = File.open(input)
    @each =
      case input
      when /\.xml$/i then :each_xml
      when /\.txt$/i then :each_csv
      else
        # Do not leave the handle open behind the exception.
        @fp.close
        raise "unknown suffix #{input}"
      end
    @block = nil
  end

  # Closes the underlying file.
  def close
    @fp.close
  end

  # Calls +block+ once per record with a Hash of field name => text.
  #
  # @return [nil]
  def each(&block)
    @block = block
    send(@each)
    nil
  ensure
    # Never keep a stale callback around, even when iteration raised.
    @block = nil
  end

  # XML reader: every element at depth 2 is a record, and the text of each of
  # its child elements (depth 3) becomes a field keyed by the lower-cased tag name.
  # libxml is loaded lazily so that CSV-only use does not need the gem.
  def each_xml
    require 'rubygems'
    require 'libxml'
    reader = LibXML::XML::Reader.io(@fp)
    tagstack = []
    record = nil
    # Shared by real end tags and self-closing elements: leaving depth 2
    # finishes the current record.
    leave_element = lambda do
      if tagstack.size == 2
        @block.call(record)
        record = nil
      end
      tagstack.pop
    end
    while reader.read
      case reader.node_type
      when LibXML::XML::Reader::TYPE_ELEMENT
        tagstack.push(reader.name)
        record = {} if tagstack.size == 2
        # The reader reports no end-element node for <tag/>, so close it here;
        # otherwise the depth would be off for the rest of the document.
        leave_element.call if reader.empty_element?
      when LibXML::XML::Reader::TYPE_END_ELEMENT
        leave_element.call
      when LibXML::XML::Reader::TYPE_TEXT
        record[tagstack.last.downcase] = reader.value if tagstack.size == 3
      end
    end
  end

  # CSV reader: the first row supplies the (lower-cased) keys, every later row
  # is passed to the block as a Hash. Quoted cells may contain commas, doubled
  # quotes and line breaks.
  #
  # @raise [RuntimeError] on a piece with unbalanced quotes; the message is the
  #   inspected [open cell, piece, row so far]
  def each_csv
    heading = nil
    row = []
    open_cell = nil # text gathered so far for a quoted cell that is not closed yet
    @fp.each_line do |line|
      words = line.sub(/\r?\n\z/, '').split(',', -1)
      # split returns [] for an empty line, but inside an open quoted cell that
      # empty line is still part of the cell's content.
      words = [''] if words.empty? && open_cell
      # The first piece of a line continues an open cell across a line break
      # (restored as "\n"); every later piece continues it across a comma.
      glue = "\n"
      words.each do |word|
        if open_cell.nil?
          case word
          when CSV_QUOTED, CSV_PLAIN
            row.push(unescape_csv(Regexp.last_match(1)))
          when CSV_OPENING
            open_cell = unescape_csv(Regexp.last_match(1))
          else
            raise [open_cell, word, row].inspect
          end
        else
          case word
          when CSV_PLAIN
            open_cell = "#{open_cell}#{glue}#{unescape_csv(Regexp.last_match(1))}"
          when CSV_CLOSING
            row.push("#{open_cell}#{glue}#{unescape_csv(Regexp.last_match(1))}")
            open_cell = nil
          else
            raise [open_cell, word, row].inspect
          end
        end
        glue = ','
      end
      # A cell is still open: the row goes on with the next physical line.
      # (A cell left open at end of file means its row is never emitted.)
      next if open_cell

      if heading.nil?
        heading = row.map(&:downcase)
      else
        @block.call(Hash[heading.zip(row)])
      end
      row = []
    end
  end

  # Opens +input+, passes every record to +block+ and closes the file again,
  # also when the block or the parser raises.
  #
  # @param input [String] path ending in ".xml" or ".txt"
  # @return [nil]
  def self.read(input, &block)
    parser = new(input)
    begin
      parser.each(&block)
    ensure
      parser.close
    end
    nil
  end

  private

  # Turns the CSV escape for a quote ("") back into a single quote.
  def unescape_csv(text)
    text.gsub('""', '"')
  end
end

# Command-line use: print every record of every file named on the command line.
if __FILE__ == $PROGRAM_NAME
  ARGV.each do |arg|
    ParseVolc1.read(arg) { |row| p row }
  end
end