vocabtree  0.0.1
inverted_index.cxx
Go to the documentation of this file.
1 #include "inverted_index.hpp"
2 
3 #include <config.hpp>
4 
5 #include <utils/filesystem.hpp>
6 #include <utils/numerics.hpp>
7 
8 #include <iostream>
9 #include <fstream>
10 
12 
13 }
14 
15 InvertedIndex::InvertedIndex(const std::string &file_name) : SearchBase(file_name) {
16  if(!filesystem::file_exists(file_name)) {
17  std::cerr << "Error reading index from " << file_name << std::endl;
18  return;
19  }
20  if(!this->load(file_name)) {
21  std::cerr << "Error reading index from " << file_name << std::endl;
22  }
23 }
24 
25 bool InvertedIndex::load (const std::string &file_path) {
26  std::cout << "Reading inverted index from " << file_path << "..." << std::endl;
27 
28  std::ifstream ifs(file_path, std::ios::binary);
29  uint32_t num_clusters;
30  ifs.read((char *)&num_clusters, sizeof(uint32_t));
31  inverted_index.resize(num_clusters);
32  idf_weights.resize(num_clusters);
33  ifs.read((char *)&idf_weights[0], sizeof(float) * num_clusters);
34  for(uint32_t i=0; i<num_clusters; i++) {
35  uint64_t num_entries;
36  ifs.read((char *)&num_entries, sizeof(uint64_t));
37  inverted_index[i].resize(num_entries);
38  ifs.read((char *)&inverted_index[i][0], sizeof(uint64_t) * num_entries);
39  }
40 
41  std::cout << "Done reading inverted index." << std::endl;
42 
43  return (ifs.rdstate() & std::ifstream::failbit) == 0;
44 }
45 
46 
47 bool InvertedIndex::save (const std::string &file_path) const {
48  std::cout << "Writing inverted index to " << file_path << "..." << std::endl;
49 
50  std::ofstream ofs(file_path, std::ios::binary | std::ios::trunc);
51 
52  uint32_t num_clusters = inverted_index.size();
53  ofs.write((const char *)&num_clusters, sizeof(uint32_t));
54  ofs.write((const char *)&idf_weights[0], sizeof(float) * num_clusters);
55  for(uint32_t i=0; i<num_clusters; i++) {
56  uint64_t num_entries = inverted_index[i].size();
57  ofs.write((const char *)&num_entries, sizeof(uint64_t));
58  ofs.write((const char *)&inverted_index[i][0], sizeof(uint64_t) * num_entries);
59  }
60 
61  std::cout << "Done writing inverted index." << std::endl;
62 
63  return (ofs.rdstate() & std::ofstream::failbit) == 0;
64 }
65 
66 bool InvertedIndex::train(Dataset &dataset, const std::shared_ptr<const TrainParamsBase> &params, const std::vector< std::shared_ptr<const Image > > &examples) {
67  const std::shared_ptr<const TrainParams> &ii_params = std::static_pointer_cast<const TrainParams>(params);
68 
69  const std::shared_ptr<BagOfWords> &bag_of_words = ii_params->bag_of_words;
70 
71  if(bag_of_words == nullptr) return false;
72 
73  inverted_index.resize(bag_of_words->num_clusters());
74  idf_weights.resize(bag_of_words->num_clusters(), 0.f);
75 
76  for (size_t i = 0; i < examples.size(); i++) {
77  const std::shared_ptr<const Image> &image = examples[i];
78  const std::string &bow_descriptors_location = dataset.location(image->feature_path("bow_descriptors"));
79 
80  if (!filesystem::file_exists(bow_descriptors_location)) continue;
81 
82  numerics::sparse_vector_t bow_descriptors;
83  if(!filesystem::load_sparse_vector(bow_descriptors_location, bow_descriptors)) continue;
84 
85  for(size_t j=0; j<bow_descriptors.size(); j++) {
86  inverted_index[bow_descriptors[j].first].push_back(image->id);
87  }
88  }
89 
90  for(size_t i=0; i<idf_weights.size(); i++) {
91  idf_weights[i] = logf(
92  (float)examples.size() /
93  (float)inverted_index[i].size());
94  }
95 
96  return true;
97 }
98 
99 std::shared_ptr<MatchResultsBase> InvertedIndex::search(Dataset &dataset, const std::shared_ptr<const SearchParamsBase> &params, const std::shared_ptr<const Image > &example) {
100 
101  // const std::shared_ptr<const SearchParams> &ii_params = std::static_pointer_cast<const SearchParams>(params);
102 
103  std::shared_ptr<MatchResults> match_result = std::make_shared<MatchResults>();
104 
105  const std::string &example_bow_descriptors_location = dataset.location(example->feature_path("bow_descriptors"));
106  if(!filesystem::file_exists(example_bow_descriptors_location)) {
107  std::cerr << "No file at load " << example_bow_descriptors_location << std::endl;
108  return nullptr;
109  }
110  numerics::sparse_vector_t example_bow_descriptors;
111  if(!filesystem::load_sparse_vector(example_bow_descriptors_location, example_bow_descriptors)) {
112  std::cerr << "Failed to load " << example_bow_descriptors_location << std::endl;
113  return nullptr;
114  }
115 
116  std::vector<uint64_t> candidates(dataset.num_images(), 0);
117  uint64_t num_candidates = 0;
118  for(size_t i=0; i<example_bow_descriptors.size(); i++) {
119  uint32_t cluster = example_bow_descriptors[i].first;
120  for(size_t j=0; j<inverted_index[cluster].size(); j++) {
121  if(!candidates[inverted_index[cluster][j]])
122  candidates[inverted_index[cluster][j]] = ++num_candidates;
123  }
124  }
125 
126  std::vector< std::pair<float, uint64_t> > candidate_scores(num_candidates);
127 
128 #if ENABLE_MULTITHREADING && ENABLE_OPENMP
129 #pragma omp parallel for schedule(dynamic)
130 #endif
131  for(int64_t i=0; i<(int64_t)candidates.size(); i++) {
132  if(!candidates[i]) continue;
133 
134  const std::string &bow_descriptors_location = dataset.location(dataset.image(i)->feature_path("bow_descriptors"));
135  if (!filesystem::file_exists(bow_descriptors_location)) continue;
136  numerics::sparse_vector_t bow_descriptors;
137  if(!filesystem::load_sparse_vector(bow_descriptors_location, bow_descriptors)) continue;
138 
139  float sim = numerics::min_hist(example_bow_descriptors, bow_descriptors, idf_weights);
140  candidate_scores[candidates[i]-1] = std::pair<float, uint64_t>(sim, i);
141  }
142 
143  std::sort(candidate_scores.begin(), candidate_scores.end(),
144  boost::bind(&std::pair<float, uint64_t>::first, _1) >
145  boost::bind(&std::pair<float, uint64_t>::first, _2));
146 
147  match_result->tfidf_scores.resize(candidate_scores.size());
148  match_result->matches.resize(candidate_scores.size());
149 
150  for(int64_t i=0; i<(int64_t)candidate_scores.size(); i++) {
151  match_result->tfidf_scores[i] = candidate_scores[i].first;
152  match_result->matches[i] = candidate_scores[i].second;
153  }
154 
155  return std::static_pointer_cast<MatchResultsBase>(match_result);
156 }
157 
158 uint32_t InvertedIndex::num_clusters() const {
159  return idf_weights.size();
160 }
161 
162 std::ostream& operator<< (std::ostream &out, const InvertedIndex::MatchResults &match_results) {
163  out << "[ ";
164  for(uint32_t i=0; i<MIN(8, match_results.matches.size()); i++) {
165  out << "[ " << match_results.matches[i] << ", " << match_results.tfidf_scores[i] << " ] ";
166  }
167  out << "]";
168  return out;
169 }