vocabtree  0.0.1
dataset.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include "image.hpp"
4 #include <memory>
5 #include <boost/bimap.hpp>
6 #include <sstream>
7 #include <iomanip>
8 
9 /// The Dataset class is an abstract wrapper describing a dataset. A dataset consiste of the actual
10 /// data, plus a way to convert the images, or frames of a video into an integer index. The dataset
11 /// should at minimum provide an easy way to map image paths to unique integers. For a sample implementation
12 /// of a Dataset see the SimpleDataset class.
13 ///
14 /// Combined with the Image class implementation, a Dataset + Image provides a way to find relevant paths
15 /// for features and images. Note that the implementation of a Dataset or Image class should implement a relative
16 /// path to the Image data, with the absolute path being interchangebale.
17 class Dataset {
18 
19 public:
20  /// Constructs a dataset given a base location. An example base location might be
21  /// /c/data/. Given this base location, an implementation of the Dataset should
22  /// find all the data and construct a mapping between the data and the id, for example
23  /// by searching through base_location + /images/.
24  Dataset(const std::string &base_location);
25 
26  /// Loads a dataset from the db_data_location. The base_location provides the absolute
27  /// path of data.
28  Dataset(const std::string &base_location, const std::string &db_data_location);
29 
30  virtual ~Dataset();
31 
32  /// Writes the dataset mapping to the input data location. Returns true if successful, false
33  /// otherwise.
34  virtual bool write(const std::string &db_data_location) = 0;
35 
36  /// Reads the dataset mapping from the input data location. Returns true if successful, false
37  /// otherwise.
38  virtual bool read (const std::string &db_data_location) = 0;
39 
40  /// Given a unique integer ID, returns an Image associated with that ID.
41  virtual std::shared_ptr<Image> image(uint64_t id) const = 0;
42 
43  /// Returns the number of images in the dataset.
44  virtual uint64_t num_images() const = 0;
45 
46  /// Returns the absolute path of the data directory
47  std::string location() const;
48 
49  /// Returns the absolute path of the file (appends the file path to the database path).
50  std::string location(const std::string &relative_path) const;
51 
52  /// Adds the given image to the database, if there is an id collision, will not add the image and
53  /// return false, otherwise returns true.
54  virtual bool add_image(const std::shared_ptr<const Image> &image) = 0 ;
55 
56  /// Returns a vector of all images in the dataset.
57  std::vector< std::shared_ptr< const Image> > all_images() const;
58 
59  /// Returns a vector of random images in the dataset of size count.
60  std::vector< std::shared_ptr< const Image> > random_images(size_t count) const;
61 
62  /// @TODO: Shards the dataset to the new input locations, and returns the sharded datasets
63  std::vector<Dataset> shard(const std::vector<std::string> &new_locations);
64 
65 protected:
66  std::string data_directory; /// Holds the absolute path of the data.
67 };
68 
69 /// Prints out information about the dataset.
70 std::ostream& operator<< (std::ostream &out, const Dataset &dataset);
71 
72 /// SimpleDataset is a sample implementation of a Dataset, where the data is stored as JPEG
73 /// images in a single folder called images/ and features are stored in a folder feats/<feat_name>.
74 /// For example, given a base absolute path of /c/data/. Image data is found in /c/data/images and
75 /// sift features are found in /c/data/feats/sift/.
76 class SimpleDataset : public Dataset {
77 
78 public:
79 
80  /// SimpleImage class used with the SimpleDataset class. Features are stored in
81  /// <data_dir>/<feats>/<feat_name>
82  class SimpleImage : public Image {
83  public:
84  /// Constructs a SimpleImage given the Image relative path in the Dataset directory,
85  /// and a corresponding unique image ID.
86  SimpleImage(const std::string &path, uint64_t imageid);
87 
88  /// Returns the corresponding feature path given a feature name (ex. "sift").
89  std::string feature_path(const std::string &feat_name) const;
90 
91  /// Returns the image location relative to the database data directory.
92  std::string location() const;
93 
94  protected:
95  std::string image_path; /// Stores the relative image path.
96  };
97 
98  /// Creates a simple dataset from the images in base_location/images. It is recommended
99  /// to then call write(...) to save the dataset so that it does not have to traverse the HDD
100  /// everytime we load the dataset.
101  SimpleDataset(const std::string &base_location);
102 
103  /// If a dataset file is location at db_data_location, will load that file from. Otherwise,
104  /// this will create the dataset from base_location/images and call write(db_data_location).
105  SimpleDataset(const std::string &base_location, const std::string &db_data_location);
106  ~SimpleDataset();
107 
108  /// Writes the SimpleDataset out to the specified file. If the containing directory does not
109  /// exist, it will be automatically created. The Dataset data is stored in a binary format
110  /// with num_images() entries of the form uint64_t, uint16_t, char * corresponding to an
111  /// image id, string length, and the image location string respectively. Returns true if
112  /// success, fail otherwise (checks the ofstream error bit).
113  bool write(const std::string &db_data_location);
114 
115  /// Reads the specified SimpleDataset. See write(const std::string &db_data_location) for
116  /// more information about the binary format. Returns true if success, false otherwise.
117  /// (checks the ifstream error bit).
118  bool read(const std::string &db_data_location);
119 
120  /// Given a unique integer ID, returns an Image associated with that ID.
121  std::shared_ptr<Image> image(uint64_t id) const;
122 
123  /// Adds the given image to the database, if there is an id collision, will not add the image and
124  /// return false, otherwise returns true.
125  bool add_image(const std::shared_ptr<const Image> &image);
126 
127  /// Returns the number of images in the dataset.
128  uint64_t num_images() const;
129 
130 private:
131 
132  /// Constructs the dataset an fills in the image id map.
133  void construct_dataset();
134 
135  boost::bimap<std::string, uint64_t> id_image_map; /// Map which holds the image path and id
136 
137 };