btllib
bloom_filter.hpp
1#ifndef BTLLIB_BLOOM_FILTER_HPP
2#define BTLLIB_BLOOM_FILTER_HPP
3
4#include "btllib/nthash.hpp"
5
6#include "cpptoml.h"
7
8#include <atomic>
9#include <climits>
10#include <cstdint>
11#include <fstream>
12#include <memory>
13#include <string>
14#include <vector>
15
16namespace btllib {
17
/**
 * Single-bit masks for the bits of one byte, least-significant bit first:
 * BIT_MASKS[i] has only bit i set (0x01 for i == 0 up to 0x80 for i == 7).
 */
static const uint8_t BIT_MASKS[CHAR_BIT] = {
  // NOLINT
  1U << 0U, 1U << 1U, 1U << 2U, 1U << 3U, // NOLINT
  1U << 4U, 1U << 5U, 1U << 6U, 1U << 7U  // NOLINT
};
23
24static const char* const BLOOM_FILTER_SIGNATURE = "[BTLBloomFilter_v6]";
25static const char* const KMER_BLOOM_FILTER_SIGNATURE =
26 "[BTLKmerBloomFilter_v6]";
27static const char* const SEED_BLOOM_FILTER_SIGNATURE =
28 "[BTLSeedBloomFilter_v6]";
29static const char* const HASH_FN = NTHASH_FN_NAME;
30
31static const unsigned MAX_HASH_VALUES = 1024;
32static const unsigned PLACEHOLDER_NEWLINES = 50;
33
35class BloomFilterInitializer
36{
37
38public:
39 BloomFilterInitializer(const std::string& path, const std::string& signature)
40 : ifs(path)
41 , table(parse_header(ifs, signature))
42 {}
43
44 static bool check_file_signature(std::ifstream& ifs,
45 std::string& file_signature,
46 const std::string& expected_signature);
47
50 static std::shared_ptr<cpptoml::table> parse_header(
51 std::ifstream& file,
52 const std::string& signature);
53
54 std::ifstream ifs;
55 std::shared_ptr<cpptoml::table> table;
56
57 BloomFilterInitializer(const BloomFilterInitializer&) = delete;
58 BloomFilterInitializer(BloomFilterInitializer&&) = default;
59
60 BloomFilterInitializer& operator=(const BloomFilterInitializer&) = delete;
61 BloomFilterInitializer& operator=(BloomFilterInitializer&&) = default;
62};
64
66{
67
68public:
71
79 BloomFilter(size_t bytes, unsigned hash_num, std::string hash_fn = "");
80
86 explicit BloomFilter(const std::string& path);
87
88 BloomFilter(const BloomFilter&) = delete;
89 BloomFilter(BloomFilter&&) = delete;
90
91 BloomFilter& operator=(const BloomFilter&) = delete;
92 BloomFilter& operator=(BloomFilter&&) = delete;
93
100 void insert(const uint64_t* hashes);
101
107 void insert(const std::vector<uint64_t>& hashes) { insert(hashes.data()); }
108
117 bool contains(const uint64_t* hashes) const;
118
126 bool contains(const std::vector<uint64_t>& hashes) const
127 {
128 return contains(hashes.data());
129 }
130
139 bool contains_insert(const uint64_t* hashes);
140
148 bool contains_insert(const std::vector<uint64_t>& hashes)
149 {
150 return contains_insert(hashes.data());
151 }
152
154 size_t get_bytes() const { return bytes; }
156 uint64_t get_pop_cnt() const;
158 double get_occupancy() const;
160 unsigned get_hash_num() const { return hash_num; }
162 double get_fpr() const;
164 const std::string& get_hash_fn() const { return hash_fn; }
165
171 void save(const std::string& path);
172
173 static void save(const std::string& path,
174 const cpptoml::table& table,
175 const char* data,
176 size_t n);
177
183 static bool is_bloom_file(const std::string& path)
184 {
185 return check_file_signature(path, BLOOM_FILTER_SIGNATURE);
186 }
187
188 static bool check_file_signature(const std::string& path,
189 const std::string& signature);
190
191private:
192 BloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
193
194 friend class KmerBloomFilter;
195 friend class SeedBloomFilter;
196
197 size_t bytes = 0;
198 size_t array_size =
199 0; // Should be equal to bytes, but not guaranteed by standard
200 size_t array_bits = 0;
201 unsigned hash_num = 0;
202 std::string hash_fn;
203 std::unique_ptr<std::atomic<uint8_t>[]> array;
204};
205
210{
211
212public:
215
223 KmerBloomFilter(size_t bytes, unsigned hash_num, unsigned k);
224
230 explicit KmerBloomFilter(const std::string& path);
231
232 KmerBloomFilter(const KmerBloomFilter&) = delete;
234
235 KmerBloomFilter& operator=(const KmerBloomFilter&) = delete;
236 KmerBloomFilter& operator=(KmerBloomFilter&&) = delete;
237
244 void insert(const char* seq, size_t seq_len);
245
251 void insert(const std::string& seq) { insert(seq.c_str(), seq.size()); }
252
259 void insert(const uint64_t* hashes) { bloom_filter.insert(hashes); }
260
266 void insert(const std::vector<uint64_t>& hashes)
267 {
268 bloom_filter.insert(hashes);
269 }
270
279 unsigned contains(const char* seq, size_t seq_len) const;
280
288 unsigned contains(const std::string& seq) const
289 {
290 return contains(seq.c_str(), seq.size());
291 }
292
299 bool contains(const uint64_t* hashes) const
300 {
301 return bloom_filter.contains(hashes);
302 }
303
309 bool contains(const std::vector<uint64_t>& hashes) const
310 {
311 return bloom_filter.contains(hashes);
312 }
313
322 unsigned contains_insert(const char* seq, size_t seq_len);
323
331 unsigned contains_insert(const std::string& seq)
332 {
333 return contains_insert(seq.c_str(), seq.size());
334 }
335
344 bool contains_insert(const uint64_t* hashes)
345 {
346 return bloom_filter.contains_insert(hashes);
347 }
348
356 bool contains_insert(const std::vector<uint64_t>& hashes)
357 {
358 return bloom_filter.contains_insert(hashes);
359 }
360
362 size_t get_bytes() const { return bloom_filter.get_bytes(); }
364 uint64_t get_pop_cnt() const { return bloom_filter.get_pop_cnt(); }
366 double get_occupancy() const { return bloom_filter.get_occupancy(); }
368 unsigned get_hash_num() const { return bloom_filter.get_hash_num(); }
370 double get_fpr() const { return bloom_filter.get_fpr(); }
372 unsigned get_k() const { return k; }
374 const std::string& get_hash_fn() const { return bloom_filter.get_hash_fn(); }
376 BloomFilter& get_bloom_filter() { return bloom_filter; }
377
383 void save(const std::string& path);
384
390 static bool is_bloom_file(const std::string& path)
391 {
392 return btllib::BloomFilter::check_file_signature(
393 path, KMER_BLOOM_FILTER_SIGNATURE);
394 }
395
396private:
397 KmerBloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
398
399 friend class SeedBloomFilter;
400
401 unsigned k = 0;
402 BloomFilter bloom_filter;
403};
404
409{
410
411public:
414
423 SeedBloomFilter(size_t bytes,
424 unsigned k,
425 const std::vector<std::string>& seeds,
426 unsigned hash_num_per_seed);
427
433 explicit SeedBloomFilter(const std::string& path);
434
435 SeedBloomFilter(const SeedBloomFilter&) = delete;
437
438 SeedBloomFilter& operator=(const SeedBloomFilter&) = delete;
439 SeedBloomFilter& operator=(SeedBloomFilter&&) = delete;
440
447 void insert(const char* seq, size_t seq_len);
448
454 void insert(const std::string& seq) { insert(seq.c_str(), seq.size()); }
455
462 void insert(const uint64_t* hashes) { kmer_bloom_filter.insert(hashes); }
463
469 void insert(const std::vector<uint64_t>& hashes)
470 {
471 kmer_bloom_filter.insert(hashes);
472 }
473
484 std::vector<std::vector<unsigned>> contains(const char* seq,
485 size_t seq_len) const;
486
496 std::vector<std::vector<unsigned>> contains(const std::string& seq) const
497 {
498 return contains(seq.c_str(), seq.size());
499 }
500
508 bool contains(const uint64_t* hashes) const
509 {
510 return kmer_bloom_filter.contains(hashes);
511 }
512
519 bool contains(const std::vector<uint64_t>& hashes) const
520 {
521 return kmer_bloom_filter.contains(hashes);
522 }
523
535 std::vector<std::vector<unsigned>> contains_insert(const char* seq,
536 size_t seq_len);
537
548 std::vector<std::vector<unsigned>> contains_insert(const std::string& seq)
549 {
550 return contains_insert(seq.c_str(), seq.size());
551 }
552
562 bool contains_insert(const uint64_t* hashes)
563 {
564 return kmer_bloom_filter.contains_insert(hashes);
565 }
566
575 bool contains_insert(const std::vector<uint64_t>& hashes)
576 {
577 return kmer_bloom_filter.contains_insert(hashes);
578 }
579
581 size_t get_bytes() const { return kmer_bloom_filter.get_bytes(); }
583 uint64_t get_pop_cnt() const { return kmer_bloom_filter.get_pop_cnt(); }
585 double get_occupancy() const { return kmer_bloom_filter.get_occupancy(); }
588 unsigned get_total_hash_num() const
589 {
590 return get_hash_num_per_seed() * get_seeds().size();
591 }
594 double get_fpr() const;
596 unsigned get_k() const { return kmer_bloom_filter.get_k(); }
598 const std::vector<std::string>& get_seeds() const { return seeds; }
601 const std::vector<SpacedSeed>& get_parsed_seeds() const
602 {
603 return parsed_seeds;
604 }
606 unsigned get_hash_num_per_seed() const
607 {
608 return kmer_bloom_filter.get_hash_num();
609 }
611 unsigned get_hash_num() const { return get_hash_num_per_seed(); }
613 const std::string& get_hash_fn() const
614 {
615 return kmer_bloom_filter.get_hash_fn();
616 }
618 KmerBloomFilter& get_kmer_bloom_filter() { return kmer_bloom_filter; }
619
625 void save(const std::string& path);
626
632 static bool is_bloom_file(const std::string& path)
633 {
634 return btllib::BloomFilter::check_file_signature(
635 path, SEED_BLOOM_FILTER_SIGNATURE);
636 }
637
638private:
639 SeedBloomFilter(const std::shared_ptr<BloomFilterInitializer>& bfi);
640
641 std::vector<std::string> seeds;
642 std::vector<SpacedSeed> parsed_seeds;
643 KmerBloomFilter kmer_bloom_filter;
644};
645
646} // namespace btllib
647
648#endif
Definition: bloom_filter.hpp:66
bool contains(const uint64_t *hashes) const
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:126
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:107
void insert(const uint64_t *hashes)
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:183
double get_fpr() const
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:164
unsigned get_hash_num() const
Definition: bloom_filter.hpp:160
BloomFilter(size_t bytes, unsigned hash_num, std::string hash_fn="")
void save(const std::string &path)
size_t get_bytes() const
Definition: bloom_filter.hpp:154
double get_occupancy() const
bool contains_insert(const uint64_t *hashes)
BloomFilter()
Definition: bloom_filter.hpp:70
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:148
uint64_t get_pop_cnt() const
BloomFilter(const std::string &path)
Definition: bloom_filter.hpp:210
void insert(const char *seq, size_t seq_len)
unsigned contains_insert(const char *seq, size_t seq_len)
double get_fpr() const
Definition: bloom_filter.hpp:370
BloomFilter & get_bloom_filter()
Definition: bloom_filter.hpp:376
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:266
void insert(const std::string &seq)
Definition: bloom_filter.hpp:251
unsigned get_hash_num() const
Definition: bloom_filter.hpp:368
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:390
unsigned contains(const char *seq, size_t seq_len) const
unsigned contains_insert(const std::string &seq)
Definition: bloom_filter.hpp:331
KmerBloomFilter(const std::string &path)
uint64_t get_pop_cnt() const
Definition: bloom_filter.hpp:364
bool contains_insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:344
KmerBloomFilter()
Definition: bloom_filter.hpp:214
bool contains(const uint64_t *hashes) const
Definition: bloom_filter.hpp:299
void insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:259
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:374
void save(const std::string &path)
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:356
size_t get_bytes() const
Definition: bloom_filter.hpp:362
double get_occupancy() const
Definition: bloom_filter.hpp:366
unsigned contains(const std::string &seq) const
Definition: bloom_filter.hpp:288
unsigned get_k() const
Definition: bloom_filter.hpp:372
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:309
KmerBloomFilter(size_t bytes, unsigned hash_num, unsigned k)
Definition: bloom_filter.hpp:409
unsigned get_total_hash_num() const
Definition: bloom_filter.hpp:588
double get_occupancy() const
Definition: bloom_filter.hpp:585
bool contains(const uint64_t *hashes) const
Definition: bloom_filter.hpp:508
std::vector< std::vector< unsigned > > contains_insert(const std::string &seq)
Definition: bloom_filter.hpp:548
void insert(const char *seq, size_t seq_len)
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:519
std::vector< std::vector< unsigned > > contains_insert(const char *seq, size_t seq_len)
void save(const std::string &path)
SeedBloomFilter(size_t bytes, unsigned k, const std::vector< std::string > &seeds, unsigned hash_num_per_seed)
const std::vector< SpacedSeed > & get_parsed_seeds() const
Definition: bloom_filter.hpp:601
KmerBloomFilter & get_kmer_bloom_filter()
Definition: bloom_filter.hpp:618
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:469
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:575
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:632
unsigned get_hash_num_per_seed() const
Definition: bloom_filter.hpp:606
SeedBloomFilter(const std::string &path)
double get_fpr() const
uint64_t get_pop_cnt() const
Definition: bloom_filter.hpp:583
size_t get_bytes() const
Definition: bloom_filter.hpp:581
unsigned get_k() const
Definition: bloom_filter.hpp:596
void insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:462
const std::vector< std::string > & get_seeds() const
Definition: bloom_filter.hpp:598
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:613
std::vector< std::vector< unsigned > > contains(const char *seq, size_t seq_len) const
SeedBloomFilter()
Definition: bloom_filter.hpp:413
unsigned get_hash_num() const
Definition: bloom_filter.hpp:611
std::vector< std::vector< unsigned > > contains(const std::string &seq) const
Definition: bloom_filter.hpp:496
void insert(const std::string &seq)
Definition: bloom_filter.hpp:454
bool contains_insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:562
Definition: bloom_filter.hpp:16