I’m writing a program that computes the minimum, maximum, and average of each row
across several CSV files.
I was using Hash(String, Array(Float64)) or Hash(String, Array(Int32)) for the dataset, but fetching and updating the values is somewhat complicated, so I wrote a generic method.
require "csv"
# Running statistics, keyed by column name; each array is indexed by CSV row number.
minimum = {} of String => Array(Float64)
maximum = {} of String => Array(Float64)
average = {} of String => Array(Float64)
number = {} of String => Array(Int32)
# Stores `value` (cast to the hash's element type `T`) for `key` in `set`.
#
# * No array under `key` yet: a new one-element array is created.
# * The array already has an entry at `index`: that entry is overwritten.
# * Otherwise: the value is appended to the end of the array.
def update(set : Hash(String, Array(T)), index, key, value) forall T
  item = value.as(T)
  column = set[key]?
  if column.nil?
    set[key] = [item]
  elsif column[index]?
    column[index] = item
  else
    column << item
  end
end
# Looks up the current value at `index` under `key` in every hash in `sets`
# (nil where the key or the index is missing), yields those values to the
# block, and writes the block's results back, one per hash, via `update`.
#
# The block is expected to return one result per hash, in the order of `sets`.
def compute(key, index, *sets)
  current = sets.map do |set|
    set[key]?.try { |column| column[index]? }
  end
  results = yield *current
  sets.zip(results) { |set, result| update(set, index, key, result) }
end
# Fold every CSV file into the running statistics, row by row: row N of each
# file is combined with whatever earlier files stored at index N.
csv_files = %w[foo.csv bar.csv hoge.csv fuga.csv]
csv_files.each do |file|
  File.open(file, "r") do |fp|
    index = 0
    CSV.new(fp, headers: true) do |csv|
      val = csv.row["value of foo"].to_f64
      # TODO: `num` is read but not used until the average below is implemented.
      num = csv.row["number of foo"].to_i32
      # omin/omax are nil for a row index not seen before; compact drops them.
      compute("foo", index, minimum, maximum) do |omin, omax|
        [omin, omax, val].compact.minmax
      end
      compute("foo", index, average, number) do |oave, onum|
        # not yet implemented, but something like
        { 1.0, 0 }
      end
      index += 1
    end
  end
end
I’m planning to refactor the dataset into a type, something like this:
# Per-key statistics; each array is meant to hold one entry per row index.
#
# Every array starts out empty, so `A.new` works without arguments.
#
# NOTE(review): A is a struct, so reading it back from a Hash yields a copy.
# In-place array mutations (`<<`, `[]=`) are still visible through the hash
# because the arrays themselves are reference types, but reassigning a
# property on the copy is not. Consider `class A` if `add` ever needs to
# replace an array.
struct A
  property min : Array(Float64) = [] of Float64
  property max : Array(Float64) = [] of Float64
  property ave : Array(Float64) = [] of Float64
  property num : Array(Int32) = [] of Int32

  # Records `value` and `number` for the row at `index` (body not written yet).
  def add(index, value, number)
     # ...
  end
end
# The default block creates the entry on first access; a plain
# `Hash(String, A).new` would raise KeyError on `dataset["foo"]` below.
dataset = Hash(String, A).new { |hash, key| hash[key] = A.new }
csv_files = %w[foo.csv bar.csv hoge.csv fuga.csv]
csv_files.each do |file|
  File.open(file, "r") do |fp|
    index = 0
    CSV.new(fp, headers: true) do |csv|
      val = csv.row["value of foo"].to_f64
      num = csv.row["number of foo"].to_i32
      dataset["foo"].add(index, val, num)
      index += 1
    end
  end
end
Thanks.