1#ifndef BTLLIB_COUNTING_BLOOM_FILTER_HPP
2#define BTLLIB_COUNTING_BLOOM_FILTER_HPP
4#include "btllib/bloom_filter.hpp"
5#include "btllib/nthash.hpp"
6#include "btllib/status.hpp"
/**
 * Signature string for CountingBloomFilter files. Within this file it is
 * handed to BloomFilter::check_file_signature() when probing a path, to the
 * BloomFilterInitializer used for loading, and it is the starting value of the
 * header table name built in save().
 */
21static const char*
const COUNTING_BLOOM_FILTER_SIGNATURE =
22 "[BTLCountingBloomFilter_v5]";
/**
 * Signature string for KmerCountingBloomFilter files. Within this file it is
 * handed to BloomFilter::check_file_signature() when probing a path, to the
 * BloomFilterInitializer used for loading, and it is the starting value of the
 * header table name built in the k-mer filter's save().
 */
23static const char*
const KMER_COUNTING_BLOOM_FILTER_SIGNATURE =
24 "[BTLKmerCountingBloomFilter_v5]";
27class KmerCountingBloomFilter;
51 std::string hash_fn =
"");
/**
 * Insert an element described by its precomputed hash values.
 *
 * @param hashes Pointer to the element's hash values. The counter scans in
 * this file index hashes[0] .. hashes[hash_num - 1], so at least hash_num
 * values must be readable.
 */
72 void insert(
const uint64_t* hashes);
/**
 * Convenience overload: forwards the vector's underlying buffer to the
 * pointer-based insert(). The vector's size is not checked here.
 *
 * @param hashes Hash values of the element to insert.
 */
79 void insert(
const std::vector<uint64_t>& hashes) {
insert(hashes.data()); }
/**
 * Query the count recorded for an element.
 *
 * The definition starts from the counter at hashes[0] % array_size and
 * compares it against the counters selected by the remaining hash values,
 * tracking a minimum. NOTE(review): the final return is not visible in this
 * listing; presumably that minimum is the value returned.
 *
 * @param hashes Pointer to hash_num hash values of the element.
 */
89 T
contains(
const uint64_t* hashes)
const;
98 T
contains(
const std::vector<uint64_t>& hashes)
const
/**
 * Save the filter to a file.
 *
 * The definition builds a cpptoml header table holding "bytes", "hash_num",
 * "hash_fn" (only when non-empty) and "counter_bits", and passes it on
 * together with the raw bytes of the counter array.
 *
 * @param path Destination file path.
 */
224 void save(
const std::string& path);
233 return btllib::BloomFilter::check_file_signature(
234 path, COUNTING_BLOOM_FILTER_SIGNATURE);
/**
 * Increment helper used by the public insert/contains combinations, which
 * call it with the count they have just read.
 *
 * For every hash value the definition issues a compare-exchange that replaces
 * the addressed counter with min_val + 1 only if it currently equals min_val.
 *
 * @param hashes Pointer to hash_num hash values of the element.
 * @param min_val Counter value expected in the counters to be incremented.
 */
240 void insert(
const uint64_t* hashes, T min_val);
// Number of counters in `array`; the constructors derive it as the byte
// budget divided by sizeof(array[0]).
245 size_t array_size = 0;
// Number of hash values consumed per element (bound of the per-element loops).
246 unsigned hash_num = 0;
// Counter storage: array_size atomic counters of type T, zero-filled by the
// sizing constructor or read from the file by the loading constructor.
248 std::unique_ptr<std::atomic<T>[]> array;
/**
 * Insert the k-mers of a sequence.
 *
 * The definition constructs an NtHash over (seq, seq_len) with get_hash_num()
 * and get_k(), and for every successful roll() inserts nthash.hashes() into
 * the wrapped counting Bloom filter.
 *
 * @param seq Sequence characters.
 * @param seq_len Length of seq.
 */
292 void insert(
const char* seq,
size_t seq_len);
/**
 * Convenience overload: forwards the string's character buffer and size to
 * insert(const char*, size_t).
 *
 * @param seq Sequence whose k-mers are inserted.
 */
299 void insert(
const std::string& seq) {
insert(seq.c_str(), seq.size()); }
/**
 * Insert an element from precomputed hash values by delegating directly to
 * the wrapped CountingBloomFilter.
 *
 * @param hashes Pointer to the element's hash values.
 */
307 void insert(
const uint64_t* hashes) { counting_bloom_filter.insert(hashes); }
314 void insert(
const std::vector<uint64_t>& hashes)
316 counting_bloom_filter.insert(hashes);
/**
 * Query the counts of the k-mers of a sequence.
 *
 * The definition rolls an NtHash over (seq, seq_len) and adds the wrapped
 * filter's contains() result for each k-mer to a running sum.
 * NOTE(review): the return statement is not visible in this listing;
 * presumably that sum is returned, which would explain the uint64_t return
 * type instead of T.
 *
 * @param seq Sequence characters.
 * @param seq_len Length of seq.
 */
327 uint64_t
contains(
const char* seq,
size_t seq_len)
const;
338 return contains(seq.c_str(), seq.size());
351 return counting_bloom_filter.contains(hashes);
361 T
contains(
const std::vector<uint64_t>& hashes)
const
363 return counting_bloom_filter.contains(hashes);
398 return counting_bloom_filter.contains_insert(hashes);
410 return counting_bloom_filter.contains_insert(hashes);
446 return counting_bloom_filter.insert_contains(hashes);
458 return counting_bloom_filter.insert_contains(hashes);
500 return counting_bloom_filter.insert_thresh_contains(hashes, threshold);
517 return counting_bloom_filter.insert_thresh_contains(hashes, threshold);
559 return counting_bloom_filter.contains_insert_thresh(hashes, threshold);
574 return counting_bloom_filter.contains_insert_thresh(hashes, threshold);
/** Returns get_bytes() of the wrapped CountingBloomFilter. */
578 size_t get_bytes()
const {
return counting_bloom_filter.get_bytes(); }
/**
 * Returns get_pop_cnt() of the wrapped CountingBloomFilter, which computes it
 * with a loop over all array_size counters.
 */
580 uint64_t
get_pop_cnt()
const {
return counting_bloom_filter.get_pop_cnt(); }
/**
 * Returns get_occupancy() of the wrapped CountingBloomFilter, i.e. its
 * get_pop_cnt() divided by its number of counters.
 */
582 double get_occupancy()
const {
return counting_bloom_filter.get_occupancy(); }
/** Returns get_hash_num() of the wrapped CountingBloomFilter. */
584 unsigned get_hash_num()
const {
return counting_bloom_filter.get_hash_num(); }
/**
 * Returns get_fpr() of the wrapped CountingBloomFilter, which evaluates
 * get_occupancy() raised to the power hash_num.
 */
586 double get_fpr()
const {
return counting_bloom_filter.get_fpr(); }
/** Returns the stored k, the value passed to NtHash alongside the sequence. */
588 unsigned get_k()
const {
return k; }
592 return counting_bloom_filter.get_hash_fn();
597 return counting_bloom_filter;
605 void save(
const std::string& path);
615 return btllib::BloomFilter::check_file_signature(
616 path, KMER_COUNTING_BLOOM_FILTER_SIGNATURE);
// Aliases fixing the counter type T of CountingBloomFilter to an 8-, 16- or
// 32-bit unsigned integer.
626using CountingBloomFilter8 = CountingBloomFilter<uint8_t>;
627using CountingBloomFilter16 = CountingBloomFilter<uint16_t>;
628using CountingBloomFilter32 = CountingBloomFilter<uint32_t>;
// The same three counter widths for KmerCountingBloomFilter.
630using KmerCountingBloomFilter8 = KmerCountingBloomFilter<uint8_t>;
631using KmerCountingBloomFilter16 = KmerCountingBloomFilter<uint16_t>;
632using KmerCountingBloomFilter32 = KmerCountingBloomFilter<uint32_t>;
639 size_t(std::ceil(double(bytes) / sizeof(uint64_t)) * sizeof(uint64_t)))
640 , array_size(get_bytes() / sizeof(array[0]))
642 , hash_fn(std::move(hash_fn))
643 , array(new std::atomic<T>[array_size])
645 check_error(bytes == 0,
"CountingBloomFilter: memory budget must be >0!");
647 "CountingBloomFilter: number of hash values must be >0!");
649 hash_num > MAX_HASH_VALUES,
650 "CountingBloomFilter: number of hash values cannot be over 1024!");
651 check_warning(
sizeof(uint8_t) !=
sizeof(std::atomic<uint8_t>),
652 "Atomic primitives take extra memory. CountingBloomFilter will "
654 std::to_string(bytes) +
" for bit array.");
655 std::memset((
void*)array.get(), 0, array_size *
sizeof(array[0]));
666 bool update_done =
false;
667 T new_val, tmp_min_val;
669 new_val = min_val + 1;
670 for (
size_t i = 0; i < hash_num; ++i) {
671 tmp_min_val = min_val;
// OR the compare-exchange result into update_done instead of overwriting it:
// the flag starts out false and must end up true if ANY counter visited by
// this loop was incremented. With plain assignment only the outcome for the
// last hash survived, so a successful increment of an earlier counter was
// forgotten. compare_exchange_strong stores new_val only when the counter
// still equals tmp_min_val (a fresh copy of min_val), so counters holding any
// other value are left untouched and report false.
672 update_done |= array[hashes[i] % array_size].compare_exchange_strong(
673 tmp_min_val, new_val);
676 (min_val = contains(hashes)) == std::numeric_limits<T>::max()) {
686 contains_insert(hashes);
693 T min = array[hashes[0] % array_size];
694 for (
size_t i = 1; i < hash_num; ++i) {
695 const size_t idx = hashes[i] % array_size;
696 if (array[idx] < min) {
707 const auto count = contains(hashes);
708 if (count < std::numeric_limits<T>::max()) {
709 insert(hashes, count);
718 const auto count = contains(hashes);
719 if (count < std::numeric_limits<T>::max()) {
720 insert(hashes, count);
723 return std::numeric_limits<T>::max();
731 const auto count = contains(hashes);
732 if (count < threshold) {
733 insert(hashes, count);
744 const auto count = contains(hashes);
745 if (count < threshold) {
746 insert(hashes, count);
755 uint64_t pop_cnt = 0;
756#pragma omp parallel for default(none) reduction(+ : pop_cnt)
757 for (
size_t i = 0; i < array_size; ++i) {
769 return double(get_pop_cnt()) / double(array_size);
776 return std::pow(get_occupancy(),
double(hash_num));
782 std::make_shared<BloomFilterInitializer>(path,
783 COUNTING_BLOOM_FILTER_SIGNATURE))
788 const std::shared_ptr<BloomFilterInitializer>& bfi)
789 : bytes(*bfi->table->get_as<decltype(bytes)>(
"bytes"))
790 , array_size(bytes / sizeof(array[0]))
791 , hash_num(*(bfi->table->get_as<decltype(hash_num)>(
"hash_num")))
792 , hash_fn(bfi->table->contains(
"hash_fn")
793 ? *(bfi->table->get_as<decltype(hash_fn)>(
"hash_fn"))
795 , array(new std::atomic<T>[array_size])
797 check_warning(
sizeof(uint8_t) !=
sizeof(std::atomic<uint8_t>),
798 "Atomic primitives take extra memory. CountingBloomFilter will "
800 std::to_string(bytes) +
" for bit array.");
801 const auto loaded_counter_bits =
802 *(bfi->table->get_as<
size_t>(
"counter_bits"));
803 check_error(
sizeof(array[0]) * CHAR_BIT != loaded_counter_bits,
804 "CountingBloomFilter" +
805 std::to_string(
sizeof(array[0]) * CHAR_BIT) +
806 " tried to load a file of CountingBloomFilter" +
807 std::to_string(loaded_counter_bits));
808 bfi->ifs.read((
char*)array.get(),
809 std::streamsize(array_size *
sizeof(array[0])));
820 auto root = cpptoml::make_table();
824 auto header = cpptoml::make_table();
825 header->insert(
"bytes", get_bytes());
826 header->insert(
"hash_num", get_hash_num());
827 if (!hash_fn.empty()) {
828 header->insert(
"hash_fn", hash_fn);
830 header->insert(
"counter_bits",
size_t(
sizeof(array[0]) * CHAR_BIT));
831 std::string header_string = COUNTING_BLOOM_FILTER_SIGNATURE;
833 header_string.substr(1, header_string.size() - 2);
834 root->insert(header_string, header);
837 path, *root, (
char*)array.get(), array_size *
sizeof(array[0]));
845 , counting_bloom_filter(bytes, hash_num, HASH_FN)
852 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
853 while (nthash.
roll()) {
854 counting_bloom_filter.insert(nthash.hashes());
863 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
864 while (nthash.
roll()) {
865 sum += counting_bloom_filter.contains(nthash.hashes());
875 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
876 while (nthash.
roll()) {
877 sum += counting_bloom_filter.contains_insert(nthash.hashes());
887 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
888 while (nthash.
roll()) {
889 sum += counting_bloom_filter.insert_contains(nthash.hashes());
901 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
902 while (nthash.
roll()) {
904 counting_bloom_filter.insert_thresh_contains(nthash.hashes(), threshold);
916 NtHash nthash(seq, seq_len, get_hash_num(), get_k());
917 while (nthash.
roll()) {
919 counting_bloom_filter.contains_insert_thresh(nthash.hashes(), threshold);
926 const std::string& path)
928 std::make_shared<BloomFilterInitializer>(
930 KMER_COUNTING_BLOOM_FILTER_SIGNATURE))
935 const std::shared_ptr<BloomFilterInitializer>& bfi)
936 : k(*(bfi->table->get_as<decltype(k)>(
"k")))
937 , counting_bloom_filter(bfi)
939 check_error(counting_bloom_filter.hash_fn != HASH_FN,
940 "KmerCountingBloomFilter: loaded hash function (" +
941 counting_bloom_filter.hash_fn +
942 ") is different from the one used by default (" + HASH_FN +
954 auto root = cpptoml::make_table();
958 auto header = cpptoml::make_table();
959 header->insert(
"bytes", get_bytes());
960 header->insert(
"hash_num", get_hash_num());
961 header->insert(
"hash_fn", get_hash_fn());
962 header->insert(
"counter_bits",
963 size_t(
sizeof(counting_bloom_filter.array[0]) * CHAR_BIT));
964 header->insert(
"k", k);
965 std::string header_string = KMER_COUNTING_BLOOM_FILTER_SIGNATURE;
967 header_string.substr(1, header_string.size() - 2);
968 root->insert(header_string, header);
972 (
char*)counting_bloom_filter.array.get(),
973 counting_bloom_filter.array_size *
974 sizeof(counting_bloom_filter.array[0]));
void save(const std::string &path)
Definition: counting_bloom_filter.hpp:36
T insert_thresh_contains(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:172
void insert(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:79
double get_fpr() const
Definition: counting_bloom_filter.hpp:774
const std::string & get_hash_fn() const
Definition: counting_bloom_filter.hpp:217
T contains_insert(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:705
static bool is_bloom_file(const std::string &path)
Definition: counting_bloom_filter.hpp:231
T insert_contains(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:143
void save(const std::string &path)
Definition: counting_bloom_filter.hpp:814
uint64_t get_pop_cnt() const
Definition: counting_bloom_filter.hpp:753
T contains_insert_thresh(const uint64_t *hashes, T threshold)
Definition: counting_bloom_filter.hpp:741
T contains(const std::vector< uint64_t > &hashes) const
Definition: counting_bloom_filter.hpp:98
size_t get_bytes() const
Definition: counting_bloom_filter.hpp:207
unsigned get_hash_num() const
Definition: counting_bloom_filter.hpp:213
T contains(const uint64_t *hashes) const
Definition: counting_bloom_filter.hpp:691
CountingBloomFilter()
Definition: counting_bloom_filter.hpp:40
T insert_contains(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:716
void insert(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:684
T contains_insert_thresh(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:200
T insert_thresh_contains(const uint64_t *hashes, T threshold)
Definition: counting_bloom_filter.hpp:728
T contains_insert(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:120
double get_occupancy() const
Definition: counting_bloom_filter.hpp:767
Definition: counting_bloom_filter.hpp:258
uint64_t contains(const char *seq, size_t seq_len) const
Definition: counting_bloom_filter.hpp:860
T insert_contains(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:456
T insert_thresh_contains(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:514
void save(const std::string &path)
Definition: counting_bloom_filter.hpp:948
CountingBloomFilter< T > & get_counting_bloom_filter()
Definition: counting_bloom_filter.hpp:595
T insert_contains(const char *seq, size_t seq_len)
Definition: counting_bloom_filter.hpp:884
T insert_thresh_contains(const std::string &seq, const T threshold)
Definition: counting_bloom_filter.hpp:482
T contains_insert_thresh(const uint64_t *hashes, const T threshold)
Definition: counting_bloom_filter.hpp:557
T insert_thresh_contains(const char *seq, size_t seq_len, T threshold)
Definition: counting_bloom_filter.hpp:896
T contains(const uint64_t *hashes) const
Definition: counting_bloom_filter.hpp:349
T contains_insert(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:396
T contains_insert(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:408
T contains_insert(const std::string &seq)
Definition: counting_bloom_filter.hpp:383
size_t get_bytes() const
Definition: counting_bloom_filter.hpp:578
T contains_insert_thresh(const char *seq, size_t seq_len, T threshold)
Definition: counting_bloom_filter.hpp:911
T insert_thresh_contains(const uint64_t *hashes, const T threshold)
Definition: counting_bloom_filter.hpp:498
T contains_insert_thresh(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:571
T insert_contains(const std::string &seq)
Definition: counting_bloom_filter.hpp:430
const std::string & get_hash_fn() const
Definition: counting_bloom_filter.hpp:590
double get_occupancy() const
Definition: counting_bloom_filter.hpp:582
void insert(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:314
void insert(const char *seq, size_t seq_len)
Definition: counting_bloom_filter.hpp:850
T contains_insert(const char *seq, size_t seq_len)
Definition: counting_bloom_filter.hpp:872
unsigned get_hash_num() const
Definition: counting_bloom_filter.hpp:584
T insert_contains(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:444
uint64_t get_pop_cnt() const
Definition: counting_bloom_filter.hpp:580
unsigned get_k() const
Definition: counting_bloom_filter.hpp:588
void insert(const std::string &seq)
Definition: counting_bloom_filter.hpp:299
uint64_t contains(const std::string &seq) const
Definition: counting_bloom_filter.hpp:336
void insert(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:307
T contains_insert_thresh(const std::string &seq, const T threshold)
Definition: counting_bloom_filter.hpp:541
static bool is_bloom_file(const std::string &path)
Definition: counting_bloom_filter.hpp:613
KmerCountingBloomFilter()
Definition: counting_bloom_filter.hpp:262
double get_fpr() const
Definition: counting_bloom_filter.hpp:586
T contains(const std::vector< uint64_t > &hashes) const
Definition: counting_bloom_filter.hpp:361
Definition: nthash.hpp:54
Definition: bloom_filter.hpp:16
void check_error(bool condition, const std::string &msg)
void check_warning(bool condition, const std::string &msg)