#!/usr/local/bin/ruby

require 'net/http'
require 'openssl'
require 'uri'
require 'gdbm'
require 'syslog'
require 'rubygems'
require 'tarwriter'

# Small HTTPS client that keeps a single persistent Net::HTTP session,
# remembers the most recent response, and counts result codes so that a
# one-line summary can be written to syslog by #close.
#
# Side effects: #initialize sets the globals $logger (the opened Syslog
# channel) and $onset (start time); #close logs the summary and closes syslog.
class WGet

  def initialize
    @conn = nil
    @resp = nil
    @ca = nil
    $logger = Syslog.open('syndl', Syslog::LOG_PID, Syslog::LOG_NEWS)
    $onset = Time.now
    # Statistics: response code / event name => count (default 0).
    @n = Hash.new(0)
    @n['-w'] = !!($VERBOSE)
  end

  # Sets the CA location used to verify the server certificate.
  # A value ending in "/" is used as a CA directory, anything else as a
  # CA file (see #connect).
  def ca= val
    @ca = val
  end

  # Makes sure @conn is a session for uri.host / uri.port.
  # Returns 0 when the current session is reused, otherwise the result of
  # Net::HTTP#start. TLS is always enabled, whatever the URI scheme is.
  def connect(uri)
    if @conn then
      return 0 if @conn.address == uri.host and @conn.port == uri.port
      # Net::HTTP#finish raises IOError on a session that is not started
      # (for instance when an earlier #start failed). Without this guard one
      # failed connection would make every later host switch fail as well.
      @conn.finish if @conn.started?
      @conn = nil
    end
    STDERR.puts "#CONNECT #{uri.host}:#{uri.port}" if $VERBOSE
    @conn = Net::HTTP.new(uri.host, uri.port, :ENV)
    @conn.use_ssl = true
    if @ca
      if /\/$/ === @ca then
        @conn.ca_path = @ca
      else
        @conn.ca_file = @ca
      end
    else
      # NOTE(review): without a CA the peer certificate is deliberately not
      # verified; the warning below is the only protection.
      STDERR.puts "Warning: server certificate not verified"
      @conn.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
    @conn.start
  end

  # Performs a GET for +uri+ (an URI::HTTP-like object).
  #
  # lmt::  optional value for the If-Modified-Since request header
  # etag:: optional value for the If-None-Match request header
  #
  # Returns the response code as a String. Any ordinary error (network,
  # TLS, bad URI object, timeout) is logged to syslog and reported as '500';
  # in that case #body, #lmt and #etag return nil.
  def get(uri, lmt = nil, etag = nil)
    # Drop the previous response first: if this request fails, the accessors
    # must not hand out data that belongs to an earlier request.
    @resp = nil
    begin
      connect(uri)
      hdr = {}
      path = uri.request_uri
      STDERR.puts "#GET #{path}" if $VERBOSE
      hdr['if-modified-since'] = lmt if lmt
      hdr['If-None-Match'] = etag if etag
      STDERR.puts "# #{hdr.inspect}" if $VERBOSE
      @resp = @conn.get2(path, hdr)
      STDERR.puts "#--> #{@resp.code}" if $VERBOSE
      rc = @resp.code
    rescue StandardError, Timeout::Error => e
      # Not "rescue Exception": Interrupt and SystemExit must still be able
      # to stop the program. Timeout::Error is named explicitly because old
      # Ruby versions do not derive it from StandardError.
      @resp = nil
      rc = '500'
      # Values go in as arguments; syslog treats the first one as a format.
      $logger.err('%s rescue=%s', rc, e.class.to_s)
    end
    @n[rc] += 1
    rc
  end

  # Body of the last successful response, or nil if the last request failed.
  def body
    @resp ? @resp.body : nil
  end

  # Last-Modified header of the last response, or nil.
  def lmt
    @resp ? @resp['last-modified'] : nil
  end

  # ETag header of the last response, or nil.
  def etag
    @resp ? @resp['etag'] : nil
  end

  # Stores a free-form tag that is reported with the statistics.
  def tag s
    @n['tag'] = s
  end

  # Marks in the statistics that EAGAIN was hit.
  def eagain
    @n['EAGAIN'] = 1
  end

  # Counts one event under the statistics key "wait<s>".
  def waitok s
    @n["wait#{s}"] += 1
  end

  # Ends the HTTP session if one is started, writes the elapsed time and the
  # statistics to syslog, and closes syslog.
  def close
    @conn.finish if @conn and @conn.started?
    $logger.info('elapsed %g wget %s', Time.now - $onset, @n.inspect)
    $logger.close
  end

end

# Downloads the messages listed in one or more feeds and hands them to a
# TarWriter::Folder.
#
# Command line: rtdb logdb feedurl-or-option ...
#   rtdb   - GDBM file holding "lmt/<feed>" and "etag/<feed>" entries
#   logdb  - GDBM file holding one entry per stored message id, plus
#            "dup:<time>" / "err:<code>:<time>" records per feed fetch
#   others - feed URLs and --match= / --reject= / --ca= / --tar= /
#            --chdir= / --tag= options, processed strictly in order
class SynDL

  # strftime pattern used for every timestamp written to the log db.
  TIMESTAMP = '%Y-%m-%dT%H%M%SZ'

  # Prints the usage line and terminates with exit status 1.
  def help
    puts "#$0 rtdb logdb feedurl ..."
    exit 1
  end

  # argv:: command line array; it is consumed (shifted) by this method.
  def initialize argv
    @rtdb = argv.shift
    @logdb = argv.shift
    @feeds = argv.dup
    help if @feeds.empty?
    # @folder and @wget must be close()-ed properly
    @folder = TarWriter::Folder.new(nil, 'a')
    @wget = WGet.new
    @pfilter = {}
  end

  # Reads the stored Last-Modified and ETag values for +feed+.
  # Returns [lmt, etag] (either may be nil), or nil when the database file
  # does not exist yet.
  def getlmt(feed)
    lmt = etag = nil
    mode = GDBM::NOLOCK | GDBM::READER
    GDBM.open(@rtdb, 0644, mode) {|rtdb|
      lmt = rtdb["lmt/#{feed}"]
      etag = rtdb["etag/#{feed}"]
    }
    return [lmt, etag]
  rescue Errno::ENOENT
    nil
  end

  # Stores whichever of +lmt+ / +etag+ is non-nil for +feed+.
  # Does nothing (and does not open the database) when both are nil.
  def setlmt(feed, lmt, etag)
    return unless lmt or etag
    GDBM.open(@rtdb, 0644, GDBM::WRCREAT) {|rtdb|
      rtdb["lmt/#{feed}"] = lmt if lmt
      rtdb["etag/#{feed}"] = etag if etag
    }
  end

  # Fetches +feed+ (conditionally, using the stored lmt/etag) and then every
  # message URL it lists, one URL per line.
  #
  # ldb:: hash-like log database; ids already present are skipped
  #
  # * 304: a "dup:" record is written and 0 is returned.
  # * any code other than 200/304: an "err:" record is written and the
  #   process exits with status 16.
  # * 200: each listed id passing the --match/--reject filters is fetched.
  #   Only a '200' answer is stored; '404' ids are retried up to five times
  #   at 11 second intervals; every other outcome is logged and skipped so
  #   that a failed request is never archived or marked as done.
  #
  # The feed's lmt/etag are saved at the end even when some ids could not be
  # fetched, as the original flow already did for unresolved 404s.
  def getfeed(ldb, feed)
    lmt, etag = getlmt(feed)
    ufeed = URI.parse(feed)
    STDERR.puts "##{ufeed.inspect}" if $VERBOSE
    code = @wget.get(ufeed, lmt, etag)
    case code
    when '304' then
      STDERR.puts "#unchanged" if $VERBOSE
      ldb['dup:' + stamp(Time.now)] = feed
      return 0
    when '200' then
      :do_nothing
    else
      ldb["err:#{code}:" + stamp(Time.now)] = feed
      exit 16
    end
    fbdy = @wget.body
    lmt2 = @wget.lmt
    etag2 = @wget.etag
    STDERR.puts "#ETag: #{etag2}" if $VERBOSE
    # @wget can be reused now
    defer = Hash.new
    fbdy.each_line { |line|
      id = line.chomp
      # A blank line is not a message id; fetching it can only fail.
      next if id.strip.empty?
      next if @pfilter[:match] and not (@pfilter[:match] =~ id)
      next if @pfilter[:reject] and @pfilter[:reject] =~ id
      if ldb[id] then
        STDERR.puts "#dup skip #{id}" if $VERBOSE
        next
      end
      defer[id] = true if fetch_message(ldb, id) == :missing
    }
    5.times {|i|
      break if defer.empty?
      sleep(11)
      # Iterate over a snapshot of the keys because entries are deleted.
      defer.keys.each {|id|
        case fetch_message(ldb, id)
        when :stored
          defer.delete(id)
          @wget.waitok(i * 11)
        when :failed
          # Hard failure: stop retrying, it is not a "still missing" case.
          defer.delete(id)
        end
      }
    }
    # Ids still missing after all retries are counted under "waitX".
    defer.keys.each { @wget.waitok('X') }
    setlmt(feed, lmt2, etag2)
  end

  # Opens the log database and processes the command line arguments in
  # order: options change the state used by later feeds, everything else is
  # fetched as a feed. Always closes @wget and @folder on the way out.
  def run
    GDBM.open(@logdb, 0644, GDBM::WRCREAT) {|ldb|
      @feeds.each {|feed|
        case feed
        when /^--match=/
          arg = $'
          if arg.empty? then @pfilter.delete(:match)
          else @pfilter[:match] = Regexp.new(arg)
          end
        when /^--reject=/
          arg = $'
          if arg.empty? then @pfilter.delete(:reject)
          else @pfilter[:reject] = Regexp.new(arg)
          end
        when /^--ca=/
          @wget.ca = $'
        when /^--tar=/
          tar = $'
          @folder.close
          @folder = TarWriter::Folder.new(tar, 'a')
        when /^--chdir=/
          dir = $'
          @folder.close
          Dir.chdir(dir)
          @folder = TarWriter::Folder.new(nil, 'a')
        when /^--tag=/
          @wget.tag($')
        else
          STDERR.puts "getfeed #{feed}" if $VERBOSE
          getfeed(ldb, feed)
          GC.start
        end
      }
    }
  rescue Errno::EAGAIN
    $logger.err("db #{@logdb} busy - possibly multiple runs")
    @wget.eagain
  ensure
    # Nested so that an exception from @wget.close cannot prevent the
    # folder from being closed.
    begin
      @wget.close
    ensure
      @folder.close
    end
  end

  private

  # Formats +time+ in UTC with the log db timestamp pattern.
  def stamp(time)
    time.utc.strftime(TIMESTAMP)
  end

  # Fetches the message +id+ and, on a '200' answer, adds it to @folder and
  # records it in +ldb+.
  # Returns :stored, :missing (404, worth retrying) or :failed (id is not a
  # parseable URI, or any other result code).
  def fetch_message(ldb, id)
    begin
      umsg = URI.parse(id)
    rescue URI::InvalidURIError
      # One malformed line must not abort the whole run.
      $logger.err('%s', "bad uri #{id}")
      return :failed
    end
    code = @wget.get(umsg)
    case code
    when '200'
      body = @wget.body
      STDERR.puts "#size #{body.size}" if $VERBOSE
      fnam = File.basename(id).gsub(/[^A-Za-z_0-9.]/, '_')
      t = Time.now.utc
      @folder.add(fnam, body, t)
      ldb[id] = stamp(t)
      :stored
    when '404'
      :missing
    else
      # ids may contain '%', so pass the text as an argument, not as format.
      $logger.err('%s', "#{code} #{id}")
      :failed
    end
  end

end

# Script entry point: build the downloader from the command line and run it.
downloader = SynDL.new(ARGV)
downloader.run