Module: Rumale::Dataset
- Defined in:
- rumale-core/lib/rumale/dataset.rb
Overview
Module for loading and saving a dataset file.
Class Method Summary collapse
-
.dump_libsvm_file(data, labels, filename, zero_based: false) ⇒ Object
Dump the dataset with the libsvm file format.
-
.load_libsvm_file(filename, n_features: nil, zero_based: false, dtype: Numo::DFloat) ⇒ Array<Numo::NArray>
Load a dataset with the libsvm file format into Numo::NArray.
-
.make_blobs(n_samples = 1000, n_features = 2, centers: nil, cluster_std: 1.0, center_box: [-10, 10], shuffle: true, random_seed: nil) ⇒ Object
Generate Gaussian blobs.
-
.make_circles(n_samples, shuffle: true, noise: nil, factor: 0.8, random_seed: nil) ⇒ Object
Generate a two-dimensional data set consisting of an inner circle and an outer circle.
-
.make_moons(n_samples, shuffle: true, noise: nil, random_seed: nil) ⇒ Object
Generate a two-dimensional data set consisting of two half circles that are shifted so they interleave.
Class Method Details
.dump_libsvm_file(data, labels, filename, zero_based: false) ⇒ Object
Dump the dataset with the libsvm file format.
43 44 45 46 47 48 49 50 51 52 53 54 55 |
# File 'rumale-core/lib/rumale/dataset.rb', line 43

# Dump the dataset with the libsvm file format.
#
# One line is written per sample. When `data` and `labels` disagree on the
# number of rows, only the first min(rows of data, rows of labels) samples
# are written.
#
# @param data [Numo::NArray] feature values; row n is read as data[n, true]
#   (presumably shaped [n_samples, n_features] -- confirm with callers).
# @param labels [Numo::NArray] one label per sample when it has a single axis,
#   otherwise row n is converted to an Array of labels for sample n.
# @param filename [String] path of the file to (over)write.
# @param zero_based [Boolean] forwarded to dump_libsvm_line
#   (presumably selects 0-based feature indices -- confirm in that helper).
# @return [Object] whatever the File.open block yields (the row count from Integer#times).
def dump_libsvm_file(data, labels, filename, zero_based: false)
  row_count = [data.shape[0], labels.shape[0]].min
  # A missing second axis means there is exactly one label per sample.
  multi_label = !labels.shape[1].nil?
  label_format = detect_dtype(labels)
  value_format = detect_dtype(data)

  File.open(filename, 'w') do |out|
    row_count.times do |row|
      target = if multi_label
                 labels[row, true].to_a
               else
                 labels[row]
               end
      features = data[row, true]
      out.puts(dump_libsvm_line(target, features, label_format, value_format, zero_based))
    end
  end
end
.load_libsvm_file(filename, n_features: nil, zero_based: false, dtype: Numo::DFloat) ⇒ Array<Numo::NArray>
Load a dataset with the libsvm file format into Numo::NArray.
22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'rumale-core/lib/rumale/dataset.rb', line 22

# Load a dataset with the libsvm file format into Numo::NArray.
#
# The file is read record by record with CSV, using a single space as the
# column separator; each record is decoded by parse_libsvm_line.
#
# @param filename [String] path of the libsvm file to read.
# @param n_features [Integer, nil] requested number of features. The width
#   actually used is never smaller than the largest index found in the file;
#   when nil, the detected width is used.
# @param zero_based [Boolean] forwarded to parse_libsvm_line
#   (presumably whether feature indices in the file start at 0 -- confirm in that helper).
# @param dtype [Class] forwarded to convert_to_matrix
#   (presumably the Numo class of the returned feature matrix -- confirm).
# @return [Array<Numo::NArray>] two elements: the feature matrix built by
#   convert_to_matrix, then the labels wrapped with Numo::NArray.asarray.
def load_libsvm_file(filename, n_features: nil, zero_based: false, dtype: Numo::DFloat)
  targets = []
  sparse_rows = []
  widest = 0

  CSV.foreach(filename, col_sep: "\s", headers: false) do |fields|
    target, sparse_row, max_idx = parse_libsvm_line(fields, zero_based)
    targets << target
    sparse_rows << sparse_row
    # Track the largest feature index seen so far.
    widest = max_idx if widest < max_idx
  end

  # An explicit n_features can widen the matrix but never truncate the data.
  width = n_features ? [n_features, widest].max : widest

  samples = convert_to_matrix(sparse_rows, width, dtype)
  [samples, Numo::NArray.asarray(targets)]
end
.make_blobs(n_samples = 1000, n_features = 2, centers: nil, cluster_std: 1.0, center_box: [-10, 10], shuffle: true, random_seed: nil) ⇒ Object
Generate Gaussian blobs.
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
# File 'rumale-core/lib/rumale/dataset.rb', line 134

# Generate Gaussian blobs.
#
# @param n_samples [Integer] total number of samples; they are split as evenly
#   as possible over the clusters, the first clusters receiving the remainder.
# @param n_features [Integer] number of features per sample. Overridden by
#   centers.shape[1] when `centers` is a Numo::DFloat.
# @param centers [Numo::DFloat, Integer, nil] a Numo::DFloat is used as the
#   cluster centers as-is (one row per center); an Integer is the number of
#   random centers to draw; anything else means 3 random centers.
# @param cluster_std [Float] forwarded to Rumale::Utils.rand_normal as its last
#   argument (presumably the standard deviation of each cluster -- confirm).
# @param center_box [Array] its first and last elements bound the random
#   centers: drawn centers are min-max rescaled per feature onto that interval.
# @param shuffle [Boolean] whether to randomly permute the samples.
# @param random_seed [Integer, nil] seed for the local generator; when nil the
#   value returned by Kernel#srand is used.
# @return [Array] [x, y]: the sample matrix and the Numo::Int32 cluster index
#   (0...n_centers) of every sample.
def make_blobs(n_samples = 1000, n_features = 2,
               centers: nil, cluster_std: 1.0, center_box: [-10, 10], shuffle: true, random_seed: nil)
  # Set up the random number generator. srand is only invoked when no seed is given.
  seed = random_seed || srand
  rng = Random.new(seed)

  # Settle the cluster centers.
  if centers.is_a?(Numo::DFloat)
    n_centers, n_features = centers.shape[0], centers.shape[1]
  else
    n_centers = centers.is_a?(Integer) ? centers : 3
    box_low = center_box.first
    box_high = center_box.last
    centers = ::Rumale::Utils.rand_uniform([n_centers, n_features], rng)
    lowest = centers.min(0)
    spread = centers.max(0) - lowest
    # Avoid dividing by zero for features on which every center coincides.
    spread[spread.eq(0)] = 1.0
    unit_centers = (centers - lowest.tile(n_centers, 1)) / spread.tile(n_centers, 1)
    centers = unit_centers * (box_high - box_low) + box_low
  end

  # Work out how many samples each cluster receives.
  base_size = n_samples / n_centers
  leftover = n_samples % n_centers
  cluster_sizes = Array.new(n_centers, base_size)
  leftover.times { |k| cluster_sizes[k] += 1 }

  # Draw each cluster around its center and stack them in cluster order.
  draw_cluster = lambda do |k|
    ::Rumale::Utils.rand_normal([cluster_sizes[k], n_features], rng, 0.0, cluster_std) + centers[k, true]
  end
  x = draw_cluster.call(0)
  y = Numo::Int32.zeros(cluster_sizes[0])
  1.upto(n_centers - 1) do |k|
    x = Numo::DFloat.vstack([x, draw_cluster.call(k)])
    y = y.concatenate(Numo::Int32.zeros(cluster_sizes[k]) + k)
  end

  # Optionally permute samples and labels with the same random order.
  if shuffle
    order = (0...n_samples).to_a.shuffle(random: rng.dup)
    x = x[order, true].dup
    y = y[order].dup
  end

  [x, y]
end
.make_circles(n_samples, shuffle: true, noise: nil, factor: 0.8, random_seed: nil) ⇒ Object
Generate a two-dimensional data set consisting of an inner circle and an outer circle.
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
# File 'rumale-core/lib/rumale/dataset.rb', line 65

# Generate a two-dimensional data set consisting of an inner circle and an outer circle.
#
# @param n_samples [Integer] total number of points; half (rounded down) go on
#   the outer circle and the rest on the inner circle.
# @param shuffle [Boolean] whether to randomly permute the points.
# @param noise [Float, nil] when not nil, forwarded to Rumale::Utils.rand_normal
#   to add noise to every coordinate (presumably a standard deviation -- confirm).
# @param factor [Float] multiplier applied to the unit circle to obtain the inner circle.
# @param random_seed [Integer, nil] seed for the local generator; when nil the
#   value returned by Kernel#srand is used.
# @return [Array] [x, y]: the point matrix and Numo::Int32 labels
#   (0 for the outer circle, 1 for the inner circle).
def make_circles(n_samples, shuffle: true, noise: nil, factor: 0.8, random_seed: nil)
  # Set up the generator and split the sample budget between the two circles.
  seed = random_seed || srand
  rng = Random.new(seed)
  outer_count = n_samples.fdiv(2).to_i
  inner_count = n_samples - outer_count

  # Points evenly spaced on the unit circle, one row per point.
  unit_circle = lambda do |count|
    angles = Numo::DFloat.linspace(0, 2 * Math::PI, count)
    Numo::DFloat[Numo::NMath.cos(angles), Numo::NMath.sin(angles)].transpose
  end
  outer = unit_circle.call(outer_count)
  inner = unit_circle.call(inner_count)

  x = Numo::DFloat.vstack([outer, factor * inner])
  y = Numo::Int32.hstack([Numo::Int32.zeros(outer_count), Numo::Int32.ones(inner_count)])

  # Optionally permute points and labels with the same random order.
  if shuffle
    order = (0...n_samples).to_a.shuffle(random: rng.dup)
    x = x[order, true].dup
    y = y[order].dup
  end

  # Optionally perturb the points with Gaussian noise.
  unless noise.nil?
    x += ::Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise)
  end

  [x, y]
end
.make_moons(n_samples, shuffle: true, noise: nil, random_seed: nil) ⇒ Object
Generate a two-dimensional data set consisting of two half circles that are shifted so they interleave.
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'rumale-core/lib/rumale/dataset.rb', line 97

# Generate a two-dimensional data set consisting of two half circles shifted.
#
# @param n_samples [Integer] total number of points; half (rounded down) go on
#   the first half circle and the rest on the second.
# @param shuffle [Boolean] whether to randomly permute the points.
# @param noise [Float, nil] when not nil, forwarded to Rumale::Utils.rand_normal
#   to add noise to every coordinate (presumably a standard deviation -- confirm).
# @param random_seed [Integer, nil] seed for the local generator; when nil the
#   value returned by Kernel#srand is used.
# @return [Array] [x, y]: the point matrix and Numo::Int32 labels
#   (0 for the first half circle, 1 for the shifted one).
def make_moons(n_samples, shuffle: true, noise: nil, random_seed: nil)
  # Set up the generator and split the sample budget between the two moons.
  seed = random_seed || srand
  rng = Random.new(seed)
  upper_count = n_samples.fdiv(2).to_i
  lower_count = n_samples - upper_count

  # First moon: the half of the unit circle spanning angles 0..PI.
  upper_angles = Numo::DFloat.linspace(0, Math::PI, upper_count)
  upper_moon = Numo::DFloat[Numo::NMath.cos(upper_angles), Numo::NMath.sin(upper_angles)].transpose

  # Second moon: the same arc mirrored and shifted.
  lower_angles = Numo::DFloat.linspace(0, Math::PI, lower_count)
  lower_moon = Numo::DFloat[
    1 - Numo::NMath.cos(lower_angles),
    1 - Numo::NMath.sin(lower_angles) - 0.5
  ].transpose

  x = Numo::DFloat.vstack([upper_moon, lower_moon])
  y = Numo::Int32.hstack([Numo::Int32.zeros(upper_count), Numo::Int32.ones(lower_count)])

  # Optionally permute points and labels with the same random order.
  if shuffle
    order = (0...n_samples).to_a.shuffle(random: rng.dup)
    x = x[order, true].dup
    y = y[order].dup
  end

  # Optionally perturb the points with Gaussian noise.
  unless noise.nil?
    x += ::Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise)
  end

  [x, y]
end