1#ifndef BTLLIB_BLOOM_FILTER_HPP
2#define BTLLIB_BLOOM_FILTER_HPP
4#include "btllib/nthash.hpp"
18static const uint8_t BIT_MASKS[CHAR_BIT] = {
20 0x01, 0x02, 0x04, 0x08,
21 0x10, 0x20, 0x40, 0x80
/// Signature string for plain Bloom filter files; this is the value handed to
/// check_file_signature() when testing whether a path is such a file.
24static const char*
const BLOOM_FILTER_SIGNATURE =
"[BTLBloomFilter_v6]";
/// Signature string for k-mer Bloom filter files (passed to
/// BloomFilter::check_file_signature()).
25static const char*
const KMER_BLOOM_FILTER_SIGNATURE =
26 "[BTLKmerBloomFilter_v6]";
/// Signature string for spaced-seed Bloom filter files (passed to
/// BloomFilter::check_file_signature()).
27static const char*
const SEED_BLOOM_FILTER_SIGNATURE =
28 "[BTLSeedBloomFilter_v6]";
/// Hash function name, aliased from NTHASH_FN_NAME.
/// NOTE(review): NTHASH_FN_NAME is presumably declared in btllib/nthash.hpp -- confirm.
29static const char*
const HASH_FN = NTHASH_FN_NAME;
/// NOTE(review): presumably an upper bound on the number of hash values per
/// element; its use is not visible in this excerpt -- confirm.
31static const unsigned MAX_HASH_VALUES = 1024;
/// NOTE(review): presumably the count of placeholder newlines reserved in a
/// saved file's header; its use is not visible in this excerpt -- confirm.
32static const unsigned PLACEHOLDER_NEWLINES = 50;
35class BloomFilterInitializer
39 BloomFilterInitializer(
const std::string& path,
const std::string& signature)
41 , table(parse_header(ifs, signature))
/**
 * Signature check on an already-opened input stream (declaration only; the
 * definition is not in view).
 *
 * @param ifs Input file stream to examine.
 * @param file_signature Non-const reference; presumably receives the
 * signature actually found in the stream -- confirm against the definition.
 * @param expected_signature Signature the file is expected to carry.
 * @return bool; presumably true when the signatures match -- confirm.
 */
44 static bool check_file_signature(std::ifstream& ifs,
45 std::string& file_signature,
46 const std::string& expected_signature);
50 static std::shared_ptr<cpptoml::table> parse_header(
52 const std::string& signature);
/// Header table; initialised in the constructor from
/// parse_header(ifs, signature).
55 std::shared_ptr<cpptoml::table> table;
/// Copy construction is disabled.
57 BloomFilterInitializer(
const BloomFilterInitializer&) =
delete;
/// Move construction uses the compiler-generated implementation.
58 BloomFilterInitializer(BloomFilterInitializer&&) =
default;
/// Copy assignment is disabled.
60 BloomFilterInitializer& operator=(
const BloomFilterInitializer&) =
delete;
/// Move assignment uses the compiler-generated implementation.
61 BloomFilterInitializer& operator=(BloomFilterInitializer&&) =
default;
/**
 * Construct a Bloom filter (declaration only; the definition is not in view).
 *
 * @param bytes NOTE(review): presumably the filter size in bytes -- confirm.
 * @param hash_num NOTE(review): presumably the number of hash values used
 * per element -- confirm.
 * @param hash_fn Hash function name; defaults to the empty string.
 */
79 BloomFilter(
size_t bytes,
unsigned hash_num, std::string hash_fn =
"");
107 void insert(
const std::vector<uint64_t>& hashes) {
insert(hashes.data()); }
126 bool contains(
const std::vector<uint64_t>& hashes)
const
/**
 * Declaration only; the definition is not in view.
 * NOTE(review): presumably writes this filter to the file at `path` -- confirm.
 *
 * @param path Destination path.
 */
171 void save(
const std::string& path);
173 static void save(
const std::string& path,
174 const cpptoml::table& table,
185 return check_file_signature(path, BLOOM_FILTER_SIGNATURE);
/**
 * Signature check by file path (declaration only; the definition is not in
 * view). Called with BLOOM_FILTER_SIGNATURE, KMER_BLOOM_FILTER_SIGNATURE and
 * SEED_BLOOM_FILTER_SIGNATURE elsewhere in this header.
 *
 * @param path Path of the file to examine.
 * @param signature Signature the file is expected to carry.
 * @return bool; presumably true when the file carries `signature` -- confirm.
 */
188 static bool check_file_signature(
const std::string& path,
189 const std::string& signature);
/**
 * Construct from a shared BloomFilterInitializer (declaration only).
 * NOTE(review): presumably the delegation target of the path-based
 * constructor once the file header has been parsed -- confirm.
 *
 * @param bfi Initializer object, passed by const reference to shared_ptr.
 */
192 BloomFilter(
const std::shared_ptr<BloomFilterInitializer>& bfi);
/// Defaults to 0. NOTE(review): presumably the number of bits in `array` -- confirm.
200 size_t array_bits = 0;
/// Defaults to 0. NOTE(review): presumably the number of hash values per element -- confirm.
201 unsigned hash_num = 0;
/// Uniquely-owned array of atomic bytes.
/// NOTE(review): presumably the filter's bit storage -- confirm.
203 std::unique_ptr<std::atomic<uint8_t>[]> array;
/**
 * Insert from a character sequence (declaration only; the definition is not
 * in view). The std::string overload forwards to this one.
 *
 * @param seq Pointer to the sequence characters.
 * @param seq_len Length of the sequence.
 */
244 void insert(
const char* seq,
size_t seq_len);
251 void insert(
const std::string& seq) {
insert(seq.c_str(), seq.size()); }
266 void insert(
const std::vector<uint64_t>& hashes)
268 bloom_filter.
insert(hashes);
/**
 * Query with a character sequence (declaration only; the definition is not
 * in view). The std::string overload forwards to this one.
 *
 * @param seq Pointer to the sequence characters.
 * @param seq_len Length of the sequence.
 * @return unsigned; NOTE(review): presumably the number of k-mers of the
 * sequence found in the filter -- confirm.
 */
279 unsigned contains(
const char* seq,
size_t seq_len)
const;
290 return contains(seq.c_str(), seq.size());
301 return bloom_filter.
contains(hashes);
309 bool contains(
const std::vector<uint64_t>& hashes)
const
311 return bloom_filter.
contains(hashes);
372 unsigned get_k()
const {
return k; }
/**
 * Declaration only; the definition is not in view.
 * NOTE(review): presumably writes this filter to the file at `path` -- confirm.
 *
 * @param path Destination path.
 */
383 void save(
const std::string& path);
392 return btllib::BloomFilter::check_file_signature(
393 path, KMER_BLOOM_FILTER_SIGNATURE);
425 const std::vector<std::string>& seeds,
426 unsigned hash_num_per_seed);
/**
 * Insert from a character sequence (declaration only; the definition is not
 * in view). The std::string overload forwards to this one.
 *
 * @param seq Pointer to the sequence characters.
 * @param seq_len Length of the sequence.
 */
447 void insert(
const char* seq,
size_t seq_len);
454 void insert(
const std::string& seq) {
insert(seq.c_str(), seq.size()); }
462 void insert(
const uint64_t* hashes) { kmer_bloom_filter.
insert(hashes); }
469 void insert(
const std::vector<uint64_t>& hashes)
471 kmer_bloom_filter.
insert(hashes);
/**
 * Query with a character sequence (declaration only; the definition is not
 * in view). The std::string overload forwards to this one.
 *
 * @param seq Pointer to the sequence characters.
 * @param seq_len Length of the sequence.
 * @return Nested vector of unsigned values; NOTE(review): presumably one
 * inner vector per sequence position listing the matching seeds -- confirm.
 */
484 std::vector<std::vector<unsigned>>
contains(
const char* seq,
485 size_t seq_len)
const;
496 std::vector<std::vector<unsigned>>
contains(
const std::string& seq)
const
498 return contains(seq.c_str(), seq.size());
510 return kmer_bloom_filter.
contains(hashes);
519 bool contains(
const std::vector<uint64_t>& hashes)
const
521 return kmer_bloom_filter.
contains(hashes);
598 const std::vector<std::string>&
get_seeds()
const {
return seeds; }
/**
 * Declaration only; the definition is not in view.
 * NOTE(review): presumably writes this filter to the file at `path` -- confirm.
 *
 * @param path Destination path.
 */
625 void save(
const std::string& path);
634 return btllib::BloomFilter::check_file_signature(
635 path, SEED_BLOOM_FILTER_SIGNATURE);
/// Seed strings; exposed read-only through get_seeds().
641 std::vector<std::string> seeds;
/// NOTE(review): presumably the parsed form of `seeds` (one SpacedSeed per
/// entry); the code that fills it is not visible here -- confirm.
642 std::vector<SpacedSeed> parsed_seeds;
Definition: bloom_filter.hpp:66
bool contains(const uint64_t *hashes) const
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:126
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:107
void insert(const uint64_t *hashes)
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:183
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:164
unsigned get_hash_num() const
Definition: bloom_filter.hpp:160
BloomFilter(size_t bytes, unsigned hash_num, std::string hash_fn="")
void save(const std::string &path)
size_t get_bytes() const
Definition: bloom_filter.hpp:154
double get_occupancy() const
bool contains_insert(const uint64_t *hashes)
BloomFilter()
Definition: bloom_filter.hpp:70
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:148
uint64_t get_pop_cnt() const
BloomFilter(const std::string &path)
Definition: bloom_filter.hpp:210
void insert(const char *seq, size_t seq_len)
unsigned contains_insert(const char *seq, size_t seq_len)
double get_fpr() const
Definition: bloom_filter.hpp:370
BloomFilter & get_bloom_filter()
Definition: bloom_filter.hpp:376
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:266
void insert(const std::string &seq)
Definition: bloom_filter.hpp:251
unsigned get_hash_num() const
Definition: bloom_filter.hpp:368
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:390
unsigned contains(const char *seq, size_t seq_len) const
unsigned contains_insert(const std::string &seq)
Definition: bloom_filter.hpp:331
KmerBloomFilter(const std::string &path)
uint64_t get_pop_cnt() const
Definition: bloom_filter.hpp:364
bool contains_insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:344
KmerBloomFilter()
Definition: bloom_filter.hpp:214
bool contains(const uint64_t *hashes) const
Definition: bloom_filter.hpp:299
void insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:259
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:374
void save(const std::string &path)
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:356
size_t get_bytes() const
Definition: bloom_filter.hpp:362
double get_occupancy() const
Definition: bloom_filter.hpp:366
unsigned contains(const std::string &seq) const
Definition: bloom_filter.hpp:288
unsigned get_k() const
Definition: bloom_filter.hpp:372
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:309
KmerBloomFilter(size_t bytes, unsigned hash_num, unsigned k)
Definition: bloom_filter.hpp:409
unsigned get_total_hash_num() const
Definition: bloom_filter.hpp:588
double get_occupancy() const
Definition: bloom_filter.hpp:585
bool contains(const uint64_t *hashes) const
Definition: bloom_filter.hpp:508
std::vector< std::vector< unsigned > > contains_insert(const std::string &seq)
Definition: bloom_filter.hpp:548
void insert(const char *seq, size_t seq_len)
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:519
std::vector< std::vector< unsigned > > contains_insert(const char *seq, size_t seq_len)
void save(const std::string &path)
SeedBloomFilter(size_t bytes, unsigned k, const std::vector< std::string > &seeds, unsigned hash_num_per_seed)
const std::vector< SpacedSeed > & get_parsed_seeds() const
Definition: bloom_filter.hpp:601
KmerBloomFilter & get_kmer_bloom_filter()
Definition: bloom_filter.hpp:618
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:469
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:575
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:632
unsigned get_hash_num_per_seed() const
Definition: bloom_filter.hpp:606
SeedBloomFilter(const std::string &path)
uint64_t get_pop_cnt() const
Definition: bloom_filter.hpp:583
size_t get_bytes() const
Definition: bloom_filter.hpp:581
unsigned get_k() const
Definition: bloom_filter.hpp:596
void insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:462
const std::vector< std::string > & get_seeds() const
Definition: bloom_filter.hpp:598
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:613
std::vector< std::vector< unsigned > > contains(const char *seq, size_t seq_len) const
SeedBloomFilter()
Definition: bloom_filter.hpp:413
unsigned get_hash_num() const
Definition: bloom_filter.hpp:611
std::vector< std::vector< unsigned > > contains(const std::string &seq) const
Definition: bloom_filter.hpp:496
void insert(const std::string &seq)
Definition: bloom_filter.hpp:454
bool contains_insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:562
Definition: bloom_filter.hpp:16