From 1682fededda092947dd59a9c18cc0b4f349d5ed9 Mon Sep 17 00:00:00 2001 From: rchikhi Date: Fri, 3 Mar 2017 19:00:45 +0100 Subject: [PATCH] getting rid of emphf; enabling BooPHF by default for Graph, in prevision of dropping support for non-c++11 compilers --- gatb-core/CMakeLists.txt | 41 +- gatb-core/src/gatb/bcalm2/bglue_algo.cpp | 2 +- gatb-core/src/gatb/bcalm2/bglue_algo.hpp | 2 +- gatb-core/src/gatb/debruijn/impl/Graph.cpp | 52 +- gatb-core/src/gatb/debruijn/impl/Graph.hpp | 1 - .../src/gatb/debruijn/impl/GraphUnitigs.cpp | 33 +- .../gatb/debruijn/impl/Simplifications.cpp | 19 - .../src/gatb/kmer/impl/MPHFAlgorithm.cpp | 18 +- .../src/gatb/kmer/impl/MPHFAlgorithm.hpp | 4 - gatb-core/src/gatb/kmer/impl/Model.hpp | 128 --- .../gatb/tools/collections/impl/BooPHF.hpp | 193 +++- .../src/gatb/tools/collections/impl/EMPHF.hpp | 232 ---- .../src/gatb/tools/collections/impl/MPHF.hpp | 152 --- .../tools/collections/impl/MPHFWrapper.hpp | 151 --- .../gatb/tools/collections/impl/MapMPHF.hpp | 12 +- gatb-core/src/gatb/tools/misc/api/Enums.hpp | 38 - .../gatb/tools/misc/api/StringsRepository.hpp | 2 - gatb-core/src/gatb/tools/misc/impl/Tool.cpp | 2 +- .../gatb/tools/storage/impl/StorageTools.cpp | 3 - gatb-core/test/benchmark/bench_graph.cpp | 6 +- gatb-core/test/benchmark/bench_mphf.cpp | 4 +- .../test/unit/src/debruijn/TestDebruijn.cpp | 26 +- .../unit/src/debruijn/TestDebruijnUnitigs.cpp | 9 +- .../unit/src/debruijn/TestSimplifications.cpp | 3 - .../debruijn/TestSimplificationsUnitigs.cpp | 3 - gatb-core/test/unit/src/kmer/TestMPHF.cpp | 43 +- .../unit/src/tools/collections/TestMap.cpp | 2 - gatb-core/thirdparty/CMakeLists.txt | 34 - gatb-core/thirdparty/emphf/.gitignore | 14 - gatb-core/thirdparty/emphf/CMakeLists.txt | 53 - gatb-core/thirdparty/emphf/LICENSE | 13 - gatb-core/thirdparty/emphf/README.md | 48 - gatb-core/thirdparty/emphf/base_hash.hpp | 228 ---- gatb-core/thirdparty/emphf/bitpair_vector.hpp | 99 -- gatb-core/thirdparty/emphf/bitstream.hpp | 179 --- gatb-core/thirdparty/emphf/common.hpp | 265 ----- .../thirdparty/emphf/compute_mphf_generic.hpp | 58 - .../thirdparty/emphf/compute_mphf_hem.cpp | 45 - .../thirdparty/emphf/compute_mphf_scan.cpp | 11 - .../emphf/compute_mphf_scan_mmap.cpp | 11 - .../thirdparty/emphf/compute_mphf_seq.cpp | 11 - .../thirdparty/emphf/emphf_config.hpp.in | 6 - .../thirdparty/emphf/gen_synthetic_data.cpp | 20 - gatb-core/thirdparty/emphf/hypergraph.hpp | 137 --- .../emphf/hypergraph_sorter_scan.hpp | 399 ------- .../emphf/hypergraph_sorter_seq.hpp | 130 --- .../emphf/internal_memory_model.hpp | 79 -- .../thirdparty/emphf/mmap_memory_model.hpp | 388 ------- gatb-core/thirdparty/emphf/mphf.hpp | 138 --- gatb-core/thirdparty/emphf/mphf_hem.hpp | 312 ----- .../thirdparty/emphf/packed_edge_list.hpp | 269 ----- gatb-core/thirdparty/emphf/packed_vector.hpp | 118 -- gatb-core/thirdparty/emphf/perfutils.hpp | 60 - .../emphf/ranked_bitpair_vector.hpp | 87 -- gatb-core/thirdparty/emphf/test | 1000 ----------------- gatb-core/thirdparty/emphf/test_all.py | 38 - gatb-core/thirdparty/emphf/test_mphf.cpp | 12 - .../thirdparty/emphf/test_mphf_generic.hpp | 146 --- gatb-core/thirdparty/emphf/test_mphf_hem.cpp | 12 - 59 files changed, 243 insertions(+), 5358 deletions(-) delete mode 100644 gatb-core/src/gatb/tools/collections/impl/EMPHF.hpp delete mode 100644 gatb-core/src/gatb/tools/collections/impl/MPHF.hpp delete mode 100644 gatb-core/src/gatb/tools/collections/impl/MPHFWrapper.hpp delete mode 100644 gatb-core/thirdparty/emphf/.gitignore delete mode 100644 gatb-core/thirdparty/emphf/CMakeLists.txt delete mode 100644 gatb-core/thirdparty/emphf/LICENSE delete mode 100644 gatb-core/thirdparty/emphf/README.md delete mode 100644 gatb-core/thirdparty/emphf/base_hash.hpp delete mode 100644 gatb-core/thirdparty/emphf/bitpair_vector.hpp delete mode 100644 gatb-core/thirdparty/emphf/bitstream.hpp delete mode 100644 gatb-core/thirdparty/emphf/common.hpp delete mode 100644 gatb-core/thirdparty/emphf/compute_mphf_generic.hpp delete mode 100644 gatb-core/thirdparty/emphf/compute_mphf_hem.cpp delete mode 100644 gatb-core/thirdparty/emphf/compute_mphf_scan.cpp delete mode 100644 gatb-core/thirdparty/emphf/compute_mphf_scan_mmap.cpp delete mode 100644 gatb-core/thirdparty/emphf/compute_mphf_seq.cpp delete mode 100644 gatb-core/thirdparty/emphf/emphf_config.hpp.in delete mode 100644 gatb-core/thirdparty/emphf/gen_synthetic_data.cpp delete mode 100644 gatb-core/thirdparty/emphf/hypergraph.hpp delete mode 100644 gatb-core/thirdparty/emphf/hypergraph_sorter_scan.hpp delete mode 100644 gatb-core/thirdparty/emphf/hypergraph_sorter_seq.hpp delete mode 100644 gatb-core/thirdparty/emphf/internal_memory_model.hpp delete mode 100644 gatb-core/thirdparty/emphf/mmap_memory_model.hpp delete mode 100644 gatb-core/thirdparty/emphf/mphf.hpp delete mode 100644 gatb-core/thirdparty/emphf/mphf_hem.hpp delete mode 100644 gatb-core/thirdparty/emphf/packed_edge_list.hpp delete mode 100644 gatb-core/thirdparty/emphf/packed_vector.hpp delete mode 100644 gatb-core/thirdparty/emphf/perfutils.hpp delete mode 100644 gatb-core/thirdparty/emphf/ranked_bitpair_vector.hpp delete mode 100644 gatb-core/thirdparty/emphf/test delete mode 100755 gatb-core/thirdparty/emphf/test_all.py delete mode 100644 gatb-core/thirdparty/emphf/test_mphf.cpp delete mode 100644 gatb-core/thirdparty/emphf/test_mphf_generic.hpp delete mode 100644 gatb-core/thirdparty/emphf/test_mphf_hem.cpp diff --git a/gatb-core/CMakeLists.txt b/gatb-core/CMakeLists.txt index 954bf3d8d..4887149fc 100644 --- a/gatb-core/CMakeLists.txt +++ b/gatb-core/CMakeLists.txt @@ -70,12 +70,6 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set (use_new_cxx 1) endif() - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) - set (use_mphf 0) - else() - set (use_mphf 1) - endif() - elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") if ("${CMAKE_CXX_COMPILER_VERSION}" STREQUAL "") @@ -87,18 +81,14 @@ elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") IF(CMAKE_SYSTEM_NAME MATCHES "(Darwin)") # different clang versions number between linux and mac if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.3) - set (use_mphf 0) else() - set (use_mphf 1) set (use_new_cxx 1) endif() else() if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.2) - set (use_mphf 0) else() set (use_new_cxx 1) - set (use_mphf 1) endif() endif() @@ -156,7 +146,9 @@ if (use_new_cxx) set (LIBRARY_COMPILE_DEFINITIONS "${LIBRARY_COMPILE_DEFINITIONS} -DWITH_LAMBDA_EXPRESSION ${CXX_STD_VERSION}") endif() -# detect SSE +# detect SSE for popcount +# this was for emphf, maybe it's for something else also? otherwise this part can be removed. +# # from https://github.com/rurban/smhasher/blob/master/CMakeLists.txt # i do not see much performance gain for now, but let's keep that code here, might be useful later. # list of performance gain observed: @@ -195,29 +187,6 @@ ENDIF() if (use_new_cxx) set (LIBRARY_COMPILE_DEFINITIONS "${LIBRARY_COMPILE_DEFINITIONS} -DUSE_NEW_CXX ") endif() - -if (use_mphf) - set (LIBRARY_COMPILE_DEFINITIONS "${LIBRARY_COMPILE_DEFINITIONS} -DWITH_MPHF ") - message ("-------------------------------------------------------------------------------------") - message ("-- WILL COMPILE MPHF! (COMPILER VERSION IS HIGH ENOUGH) ") - message ("-------------------------------------------------------------------------------------") - if (SSE4_2_FOUND) - set (LIBRARY_COMPILE_DEFINITIONS "${LIBRARY_COMPILE_DEFINITIONS} -DEMPHF_USE_POPCOUNT=1") - endif() - - # for emphf - if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - message ("-- Using stdlib=libc++ for clang/osx") - - IF(CMAKE_SYSTEM_NAME MATCHES "(Darwin)") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") - endif() - # my clang 3.7 (linux) and 3.9 (linux also) doesn't like when i use stdlib=libc++, it refuses to compile because cannot found standard libs like - # so i'm setting this as mac-only - - endif () - -endif() # In case we use an "old" version of clang with boost and c++0x, we have to skip rvalue usage if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") @@ -321,10 +290,6 @@ ADD_SUBDIRECTORY(thirdparty) # we must be sure that hdf5 is built and installed before building gatb-core ADD_DEPENDENCIES (gatbcore-static hdf5 hdf5_postbuild) -IF (DEFINED WITH_MPHF) - ADD_DEPENDENCIES(gatbcore-static emphf_copyasis) -ENDIF() - ################################################################################ # DOCUMENTATION GENERATION ################################################################################ diff --git a/gatb-core/src/gatb/bcalm2/bglue_algo.cpp b/gatb-core/src/gatb/bcalm2/bglue_algo.cpp index 302cbf538..cc94c4dd6 100644 --- a/gatb-core/src/gatb/bcalm2/bglue_algo.cpp +++ b/gatb-core/src/gatb/bcalm2/bglue_algo.cpp @@ -473,7 +473,7 @@ struct Comp{ template class hasher_t { - typedef emphf::jenkins64_hasher BaseHasher; + typedef jenkins64_hasher BaseHasher; /* from BooPHF.hpp, which itself is from emphf:base_hasher */ BaseHasher emphf_hasher; AdaptatorDefault adaptor; diff --git a/gatb-core/src/gatb/bcalm2/bglue_algo.hpp b/gatb-core/src/gatb/bcalm2/bglue_algo.hpp index e9f1c861b..d94c5119d 100644 --- a/gatb-core/src/gatb/bcalm2/bglue_algo.hpp +++ b/gatb-core/src/gatb/bcalm2/bglue_algo.hpp @@ -56,7 +56,7 @@ #include // for repartitor #include #include -#include +#include //heh at this point I could have maybe just included gatb_core.hpp but well, no circular dependencies, this file is part of gatb-core now. diff --git a/gatb-core/src/gatb/debruijn/impl/Graph.cpp b/gatb-core/src/gatb/debruijn/impl/Graph.cpp index 93e14e42d..6325512a8 100644 --- a/gatb-core/src/gatb/debruijn/impl/Graph.cpp +++ b/gatb-core/src/gatb/debruijn/impl/Graph.cpp @@ -206,7 +206,7 @@ void configure_visitor::operator() (GraphData& data.setBranching (algo.getBranchingCollection()); } - if ((graph._mphfKind != MPHF_NONE) && (graph.getState() & GraphTemplate::STATE_MPHF_DONE) && (graph.getState() & GraphTemplate::STATE_SORTING_COUNT_DONE)) + if ((graph.getState() & GraphTemplate::STATE_MPHF_DONE) && (graph.getState() & GraphTemplate::STATE_SORTING_COUNT_DONE)) { typedef typename Kmer::Count Count; typedef typename Kmer::Type Type; @@ -219,7 +219,6 @@ void configure_visitor::operator() (GraphData& Iterable* solidKmers = new IterableAdaptor > (*solidCounts); MPHFAlgorithm mphf_algo ( - graph._mphfKind, dskGroup, "mphf", solidCounts, @@ -474,7 +473,7 @@ void build_visitor_postsolid::operator() (GraphData< Partition* solidCounts = & dskGroup.getPartition ("solid"); /** We create an instance of the MPHF Algorithm class (why is that a class, and not a function?) and execute it. */ - if (graph._mphfKind != MPHF_NONE && (!graph.checkState(GraphTemplate::STATE_MPHF_DONE))) + if ((!graph.checkState(GraphTemplate::STATE_MPHF_DONE))) { DEBUG ((cout << "build_visitor : MPHFAlgorithm BEGIN\n")); @@ -482,7 +481,6 @@ void build_visitor_postsolid::operator() (GraphData< Iterable* solidKmers = new IterableAdaptor > (*solidCounts); MPHFAlgorithm mphf_algo ( - graph._mphfKind, dskGroup, "mphf", solidCounts, @@ -645,20 +643,6 @@ IOptionsParser* GraphTemplate::getOptionsParser (b parser->push_back (DebloomAlgorithm<>::getOptionsParser()); parser->push_back (BranchingAlgorithm<>::getOptionsParser()); - /** We activate MPHF option only if available. */ - if (MPHF::enabled) - { - IOptionsParser* parserEmphf = new OptionsParser ("mphf"); - parserEmphf->push_back (new tools::misc::impl::OptionOneParam (STR_MPHF_TYPE, "mphf type ('none' or 'emphf' or 'BooPHF')", false, "BooPHF")); - parser->push_back (parserEmphf); - } - else //we still activate option for command line compatibility - { - IOptionsParser* parserEmphf = new OptionsParser ("mphf"); - parserEmphf->push_back (new tools::misc::impl::OptionOneParam (STR_MPHF_TYPE, "mphf type ('none')", false, "none")); - parser->push_back (parserEmphf); - } - /** We create a "general options" parser. */ IOptionsParser* parserGeneral = new OptionsParser ("general"); parserGeneral->push_front (new OptionOneParam (STR_INTEGER_PRECISION, "integers precision (0 for optimized value)", false, "0", false)); @@ -747,7 +731,7 @@ GraphTemplate::GraphTemplate (size_t kmerSize) _variant(new GraphDataVariant_t()), _kmerSize(kmerSize), _info("graph"), _state(GraphTemplate::STATE_INIT_DONE), _bloomKind(BLOOM_DEFAULT), _debloomKind(DEBLOOM_DEFAULT), _debloomImpl(DEBLOOM_IMPL_DEFAULT), - _branchingKind(BRANCHING_STORED), _mphfKind(MPHF_NONE) + _branchingKind(BRANCHING_STORED) { /** We configure the data variant according to the provided kmer size. */ setVariant (_variant, _kmerSize); @@ -768,7 +752,7 @@ template GraphTemplate::GraphTemplate (const std::string& uri) : _storageMode(PRODUCT_MODE_DEFAULT), _storage(0), _variant(new GraphDataVariant_t()), _kmerSize(0), _info("graph"), - _name(System::file().getBaseName(uri)), _mphfKind(MPHF_BOOPHF) + _name(System::file().getBaseName(uri)) { /** We create a storage instance. */ @@ -816,10 +800,6 @@ GraphTemplate::GraphTemplate (bank::IBank* bank, t parse (params->getStr(STR_DEBLOOM_IMPL), _debloomImpl); parse (params->getStr(STR_BRANCHING_TYPE), _branchingKind); - /** This one is conditional. */ - if (params->get(STR_MPHF_TYPE) && MPHF::enabled) { parse (params->getStr(STR_MPHF_TYPE), _mphfKind); } - else { _mphfKind = MPHF_NONE; } - /** We configure the data variant according to the provided kmer size. */ setVariant (_variant, _kmerSize, integerPrecision); @@ -854,10 +834,6 @@ GraphTemplate::GraphTemplate (tools::misc::IProper parse (params->getStr(STR_DEBLOOM_IMPL), _debloomImpl); parse (params->getStr(STR_BRANCHING_TYPE), _branchingKind); - /** This one is conditional. */ - if (params->get(STR_MPHF_TYPE) && MPHF::enabled) { parse (params->getStr(STR_MPHF_TYPE), _mphfKind); } - else { _mphfKind = MPHF_NONE; } - /** We configure the data variant according to the provided kmer size. */ setVariant (_variant, _kmerSize, integerPrecision); @@ -925,7 +901,7 @@ GraphTemplate::GraphTemplate () _variant(new GraphDataVariant()), _kmerSize(0), _info("graph"), _state(GraphTemplate::STATE_INIT_DONE), _bloomKind(BLOOM_DEFAULT), - _debloomKind(DEBLOOM_DEFAULT), _debloomImpl(DEBLOOM_IMPL_DEFAULT), _branchingKind(BRANCHING_STORED), _mphfKind(MPHF_NONE) + _debloomKind(DEBLOOM_DEFAULT), _debloomImpl(DEBLOOM_IMPL_DEFAULT), _branchingKind(BRANCHING_STORED) { //std::cout << "empty graphtemplate constructor" << std::endl; } @@ -941,8 +917,7 @@ GraphTemplate::GraphTemplate () template GraphTemplate::GraphTemplate (const GraphTemplate& graph) : _storageMode(graph._storageMode), _storage(0), - _variant(new GraphDataVariant()), _kmerSize(graph._kmerSize), _info("graph"), _name(graph._name), _state(graph._state), - _mphfKind(graph._mphfKind) + _variant(new GraphDataVariant()), _kmerSize(graph._kmerSize), _info("graph"), _name(graph._name), _state(graph._state) { setStorage (graph._storage); @@ -969,7 +944,6 @@ GraphTemplate& GraphTemplate { template void GraphTemplate::precomputeAdjacency(unsigned int nbCores, bool verbose) { -#ifndef WITH_MPHF - std::cout << "Adjacency precomputation isn't supported when GATB-core is compiled with a non-C++11 compiler" << std::endl; -#else - ProgressGraphIteratorTemplate itNode (iterator(), "precomputing adjacency", verbose); bool hasMPHF = getState() & GraphTemplate::STATE_MPHF_DONE; @@ -3565,7 +3535,6 @@ void GraphTemplate::precomputeAdjacency(unsigned i // TODO delete _container here -#endif } // now deleteNode depends on getNodeAdjacency @@ -3720,10 +3689,6 @@ bool GraphTemplate::debugCompareNeighborhoods(Node template void GraphTemplate::deleteNodesByIndex(vector &bitmap, int nbCores, gatb::core::system::ISynchronizer* synchro) const { -#ifndef WITH_MPHF - std::cout << "Node deletion isn't supported when GATB-core is compiled with a non-C++11 compiler" << std::endl; -#else - GraphIterator itNode = this->iterator(); Dispatcher dispatcher (nbCores); @@ -3743,7 +3708,6 @@ void GraphTemplate::deleteNodesByIndex(vectorunlock(); } }); // end of parallel node iteration -#endif } template @@ -3825,9 +3789,6 @@ struct allocateNonSimpleNodeCache_visitor : public boost::static_visitor template void GraphTemplate::cacheNonSimpleNodes(unsigned int nbCores, bool verbose) { -#ifndef WITH_MPHF - std::cout << "cacheNonSimpleNode isn't supported when GATB-core is compiled with a non-C++11 compiler" << std::endl; -#else boost::apply_visitor (allocateNonSimpleNodeCache_visitor(), *(GraphDataVariant*)_variant); setState(GraphTemplate::STATE_NONSIMPLE_CACHE); GraphIterator itNode = this->iterator(); @@ -3845,7 +3806,6 @@ void GraphTemplate::cacheNonSimpleNodes(unsigned i } }); // end of parallel node iteration std::cout << "Cached " << nbCachedNodes << " non-simple nodes" << std::endl; -#endif } template diff --git a/gatb-core/src/gatb/debruijn/impl/Graph.hpp b/gatb-core/src/gatb/debruijn/impl/Graph.hpp index 59379b438..4eb009b28 100644 --- a/gatb-core/src/gatb/debruijn/impl/Graph.hpp +++ b/gatb-core/src/gatb/debruijn/impl/Graph.hpp @@ -1074,7 +1074,6 @@ class GraphTemplate tools::misc::DebloomKind _debloomKind; tools::misc::DebloomImpl _debloomImpl; tools::misc::BranchingKind _branchingKind; - tools::misc::MPHFKind _mphfKind; /** */ GraphIterator getNodes () const; diff --git a/gatb-core/src/gatb/debruijn/impl/GraphUnitigs.cpp b/gatb-core/src/gatb/debruijn/impl/GraphUnitigs.cpp index 5712ed433..f817e3597 100644 --- a/gatb-core/src/gatb/debruijn/impl/GraphUnitigs.cpp +++ b/gatb-core/src/gatb/debruijn/impl/GraphUnitigs.cpp @@ -395,7 +395,9 @@ get_from_navigational_vector(const std::vector &v, uint64_t utig, cons template void GraphUnitigsTemplate::load_unitigs(string unitigs_filename) { - std::cout << "loading unitigs from disk to memory" << std::endl; + bool verbose = (nb_unitigs > 1000000); // big dataset, let's show some memory usage verbosity here + if (verbose) + std::cout << "loading unitigs from disk to memory" << std::endl; BankFasta inputBank (unitigs_filename); //bank::IBank* inputBank = Bank::open (unitigs_filename); @@ -448,18 +450,21 @@ void GraphUnitigsTemplate::load_unitigs(string unitigs_filename) // an estimation of memory usage uint64_t nb_kmers = unitigs.size(); uint64_t mem_vec = (unitigs.capacity() * sizeof(string) + nb_utigs_nucl_mem); - std::cout << "Memory usage:" << std::endl; - std::cout << " " << (sizeof(uint64_t) * incoming.size()) / 1024 / 1024 << " MB keys in incoming dict" << std::endl; - std::cout << " " << (sizeof(uint64_t) * outcoming.size()) / 1024 / 1024 << " MB keys in outcoming dict" << std::endl; - std::cout << " " << (sizeof(uint64_t) * incoming_map.size()) / 1024 / 1024 << " MB keys in incoming_map dict" << std::endl; - std::cout << " " << (sizeof(uint64_t) * outcoming_map.size()) / 1024 / 1024 << " MB keys in outcoming_map dict" << std::endl; - std::cout << " " << mem_vec /1024 /1024 << " MB unitigs nucleotides" << std::endl; - std::cout << " " << (nb_kmers*sizeof(float)) / 1024 / 1024 << " MB unitigs abundances" << std::endl; - std::cout << " " << (2*nb_kmers/8) / 1024 / 1024 << " MB deleted/visited bitvectors" << std::endl; - std::cout << "Estimated total: " << (nb_kmers*(sizeof(float) + 2.0/8.0) + sizeof(uint64_t) * ( incoming.size() + outcoming.size() + incoming.size() + outcoming_map.size()) + mem_vec) / 1024 / 1024 << " MB" << std::endl; - - if (nb_utigs_nucl != nb_utigs_nucl_mem) - std::cout << "unitigs strings size " << nb_utigs_nucl << " vs capacity " << nb_utigs_nucl_mem << std::endl; + if (verbose) + { + std::cout << "Memory usage:" << std::endl; + std::cout << " " << (sizeof(uint64_t) * incoming.size()) / 1024 / 1024 << " MB keys in incoming dict" << std::endl; + std::cout << " " << (sizeof(uint64_t) * outcoming.size()) / 1024 / 1024 << " MB keys in outcoming dict" << std::endl; + std::cout << " " << (sizeof(uint64_t) * incoming_map.size()) / 1024 / 1024 << " MB keys in incoming_map dict" << std::endl; + std::cout << " " << (sizeof(uint64_t) * outcoming_map.size()) / 1024 / 1024 << " MB keys in outcoming_map dict" << std::endl; + std::cout << " " << mem_vec /1024 /1024 << " MB unitigs nucleotides" << std::endl; + std::cout << " " << (nb_kmers*sizeof(float)) / 1024 / 1024 << " MB unitigs abundances" << std::endl; + std::cout << " " << (2*nb_kmers/8) / 1024 / 1024 << " MB deleted/visited bitvectors" << std::endl; + std::cout << "Estimated total: " << (nb_kmers*(sizeof(float) + 2.0/8.0) + sizeof(uint64_t) * ( incoming.size() + outcoming.size() + incoming.size() + outcoming_map.size()) + mem_vec) / 1024 / 1024 << " MB" << std::endl; + + if (nb_utigs_nucl != nb_utigs_nucl_mem) + std::cout << "unitigs strings size " << nb_utigs_nucl << " vs capacity " << nb_utigs_nucl_mem << std::endl; + } } /********************************************************************* @@ -616,7 +621,6 @@ GraphUnitigsTemplate& GraphUnitigsTemplate::operator= (GraphUnitigsT BaseGraph::_storageMode = graph._storageMode; BaseGraph::_name = graph._name; BaseGraph::_info = graph._info; - BaseGraph::_mphfKind = graph._mphfKind; BaseGraph::_state = graph._state; BaseGraph::setStorage (graph._storage); @@ -660,7 +664,6 @@ GraphUnitigsTemplate& GraphUnitigsTemplate::operator= (GraphUnitigsT BaseGraph::_storageMode = graph._storageMode; BaseGraph::_name = graph._name; BaseGraph::_info = graph._info; - BaseGraph::_mphfKind = graph._mphfKind; BaseGraph::_state = graph._state; BaseGraph::setStorage (graph._storage); diff --git a/gatb-core/src/gatb/debruijn/impl/Simplifications.cpp b/gatb-core/src/gatb/debruijn/impl/Simplifications.cpp index 2ec6ab4ce..10d6de335 100644 --- a/gatb-core/src/gatb/debruijn/impl/Simplifications.cpp +++ b/gatb-core/src/gatb/debruijn/impl/Simplifications.cpp @@ -41,11 +41,9 @@ #include #include // for ProgressTimerAndSystem -#ifdef WITH_MPHF #include #define get_wtime() chrono::system_clock::now() #define diff_wtime(x,y) (unsigned long)chrono::duration_cast(y - x).count() -#endif #define DIR2STR(dir) ((dir==DIR_OUTCOMING) ? "outcoming" : "incoming") @@ -362,10 +360,6 @@ bool Simplifications::satisfyRCTC(double pathAbundance, Nod template unsigned long Simplifications::removeTips() { -#ifndef WITH_MPHF - std::cout << "Graph simplifications aren't supported when GATB-core is compiled with a non-C++11 compiler" << std::endl; - return 0; -#else unsigned int k = _graph.getKmerSize(); unsigned int maxTipLengthTopological = (unsigned int)((float)k * (3.5 - 1.0)); // aggressive with SPAdes length threshold, but no coverage criterion @@ -591,7 +585,6 @@ unsigned long Simplifications::removeTips() _firstNodeIteration = false; return nbTipsRemoved; -#endif // WITH_MPHF } enum HMCP_Success { HMCP_DIDNT_FIND_END = 0, HMCP_FOUND_END = 1 , HMCP_MAX_DEPTH = -1, HMCP_LOOP = -2}; @@ -1187,11 +1180,6 @@ void Simplifications::heuristic_most_covered_path_unitigs( template unsigned long Simplifications::removeBulges() { -#ifndef WITH_MPHF - std::cout << "Graph simplifications aren't supported when GATB-core is compiled with a non-C++11 compiler" << std::endl; - return 0; -#else - unsigned int k = _graph.getKmerSize(); unsigned int coeff = 3; unsigned int additive_coeff = 100; @@ -1478,7 +1466,6 @@ unsigned long Simplifications::removeBulges() } return nbBulgesRemoved; -#endif } @@ -1509,11 +1496,6 @@ unsigned long Simplifications::removeBulges() template unsigned long Simplifications::removeErroneousConnections() { -#ifndef WITH_MPHF - std::cout << "Graph simplifications aren't supported when GATB-core is compiled with a non-C++11 compiler" << std::endl; - return 0; -#else - unsigned int k = _graph.getKmerSize(); unsigned int maxECLength = (unsigned int)((float)k * (10 - 1.0)) ; // SPAdes mode double RCTCcutoff = 4.0; @@ -1724,7 +1706,6 @@ unsigned long Simplifications::removeErroneousConnections() } return nbECRemoved; -#endif } // instantiation diff --git a/gatb-core/src/gatb/kmer/impl/MPHFAlgorithm.cpp b/gatb-core/src/gatb/kmer/impl/MPHFAlgorithm.cpp index d0a191b0a..cfe06decc 100644 --- a/gatb-core/src/gatb/kmer/impl/MPHFAlgorithm.cpp +++ b/gatb-core/src/gatb/kmer/impl/MPHFAlgorithm.cpp @@ -80,7 +80,6 @@ const Abundance_t MPHFAlgorithm::MAX_ABUNDANCE = s *********************************************************************/ template MPHFAlgorithm::MPHFAlgorithm ( - tools::misc::MPHFKind mphfKind, Group& group, const std::string& name, Iterable* solidCounts, @@ -90,7 +89,7 @@ MPHFAlgorithm::MPHFAlgorithm ( IProperties* options ) : Algorithm("mphf", nbCores, options), _group(group), _name(name), _buildOrLoad(buildOrLoad), - _dataSize(0), _nb_abundances_above_precision(0), _solidCounts(0), _solidKmers(0), _abundanceMap(0), _nodeStateMap(0), _adjacencyMap(0), _progress(0),_mphfKind(mphfKind) + _dataSize(0), _nb_abundances_above_precision(0), _solidCounts(0), _solidKmers(0), _abundanceMap(0), _nodeStateMap(0), _adjacencyMap(0), _progress(0) { /** We keep a reference on the solid kmers. */ setSolidCounts (solidCounts); @@ -99,15 +98,12 @@ MPHFAlgorithm::MPHFAlgorithm ( setSolidKmers (solidKmers); /** We build the hash object. */ - setAbundanceMap (new AbundanceMap(mphfKind)); - setNodeStateMap (new NodeStateMap(mphfKind)); - setAdjacencyMap (new AdjacencyMap(mphfKind)); - - /** We gather some statistics. */ - getInfo()->add (1, "enabled", "%d", AbundanceMap::enabled); + setAbundanceMap (new AbundanceMap()); + setNodeStateMap (new NodeStateMap()); + setAdjacencyMap (new AdjacencyMap()); /** In case of load, we load the mphf and populate right now. */ - if (AbundanceMap::enabled == true && buildOrLoad == false) + if (buildOrLoad == false) { /** We load the hash object from the dedicated storage group. */ { TIME_INFO (getTimeInfo(), "load"); @@ -154,7 +150,7 @@ template void MPHFAlgorithm::execute () { /** We check whether we can use such a type. */ - if (AbundanceMap::enabled == true && _buildOrLoad == true) + if (_buildOrLoad == true) { /** We need a progress object. */ tools::dp::IteratorListener* delegate = createIteratorListener(0,""); LOCAL (delegate); @@ -163,7 +159,7 @@ void MPHFAlgorithm::execute () //if MPHF_BOOPHF and verbose 0, give a null progress to the builder, make it understand the internal progress bar of boophf needs to be removed - if((_mphfKind == tools::misc::MPHF_BOOPHF) && (typeid(*delegate) == typeid(tools::dp::IteratorListener))) + if((typeid(*delegate) == typeid(tools::dp::IteratorListener))) setProgress (0); diff --git a/gatb-core/src/gatb/kmer/impl/MPHFAlgorithm.hpp b/gatb-core/src/gatb/kmer/impl/MPHFAlgorithm.hpp index c9ecfcab4..8d7aae904 100644 --- a/gatb-core/src/gatb/kmer/impl/MPHFAlgorithm.hpp +++ b/gatb-core/src/gatb/kmer/impl/MPHFAlgorithm.hpp @@ -25,7 +25,6 @@ #include #include #include -#include // for MPHFKind #include #include #include @@ -106,7 +105,6 @@ class MPHFAlgorithm : public gatb::core::tools::misc::impl::Algorithm * \param[in] buildOrLoad : true for build/save the MPHF, false for load only * \param[in] options : extra options for configuration (may be empty) */ MPHFAlgorithm ( - tools::misc::MPHFKind mphfKind, tools::storage::impl::Group& group, const std::string& name, tools::collections::Iterable* solidCounts, @@ -182,8 +180,6 @@ class MPHFAlgorithm : public gatb::core::tools::misc::impl::Algorithm tools::dp::IteratorListener* _progress; void setProgress (tools::dp::IteratorListener* progress) { SP_SETATTR(progress); } - /** rememer mphf kind here also*/ - tools::misc::MPHFKind _mphfKind; }; /********************************************************************************/ diff --git a/gatb-core/src/gatb/kmer/impl/Model.hpp b/gatb-core/src/gatb/kmer/impl/Model.hpp index 3bb28a724..4235a43b7 100644 --- a/gatb-core/src/gatb/kmer/impl/Model.hpp +++ b/gatb-core/src/gatb/kmer/impl/Model.hpp @@ -46,13 +46,6 @@ #include #include -//#define TESTING_EMPHF_HASH -//#ifdef TESTING_EMPHF_HASH -#if 0 // let's not have Model depend on modifications to MPHF for now. -#include -#include -#endif - extern const char bin2NT[] ; extern const char binrev[] ; extern const unsigned char revcomp_4NT[]; @@ -901,127 +894,6 @@ struct Kmer hash2(k, 1LL); } -//#ifdef TESTING_EMPHF_HASH -#if 0 - gatb::core::tools::collections::impl::AdaptatorDefault adaptor; - //emphf::jenkins64_hasher emphf_hasher; // for some reason, if I re-use this hasher, now MPHF takes 3x more times. (really! try it with bench_graph) - - // so I decided to just strip and copy the relevant EMPHF code here - struct copied_jenkins64_hasher { - - typedef uint64_t seed_t; - typedef uint64_t hash_t; - typedef std::tuple hash_triple_t; - typedef std::pair byte_range_t; - - copied_jenkins64_hasher(uint64_t seed) - : m_seed(seed) - {} - copied_jenkins64_hasher() - {} - - - - inline uint64_t copied_unaligned_load64(uint8_t const* from) const - { - uint64_t tmp; - memcpy(reinterpret_cast(&tmp), from, 8); - return tmp; - } - - - // Adapted from http://www.burtleburtle.net/bob/c/lookup8.c - hash_triple_t operator()(byte_range_t s) const - { - using std::get; - hash_triple_t h(m_seed, m_seed, 0x9e3779b97f4a7c13ULL); - - size_t len = (size_t)(s.second - s.first); - uint8_t const* cur = s.first; - uint8_t const* end = s.second; - - while (end - cur >= 24) { - get<0>(h) += copied_unaligned_load64(cur); - cur += 8; - get<1>(h) += copied_unaligned_load64(cur); - cur += 8; - get<2>(h) += copied_unaligned_load64(cur); - cur += 8; - - mix(h); - } - - get<2>(h) += len; - - switch (end - cur) { - case 23: get<2>(h) += (uint64_t(cur[22]) << 56); - case 22: get<2>(h) += (uint64_t(cur[21]) << 48); - case 21: get<2>(h) += (uint64_t(cur[20]) << 40); - case 20: get<2>(h) += (uint64_t(cur[19]) << 32); - case 19: get<2>(h) += (uint64_t(cur[18]) << 24); - case 18: get<2>(h) += (uint64_t(cur[17]) << 16); - case 17: get<2>(h) += (uint64_t(cur[16]) << 8); - // the first byte of c is reserved for the length - case 16: get<1>(h) += (uint64_t(cur[15]) << 56); - case 15: get<1>(h) += (uint64_t(cur[14]) << 48); - case 14: get<1>(h) += (uint64_t(cur[13]) << 40); - case 13: get<1>(h) += (uint64_t(cur[12]) << 32); - case 12: get<1>(h) += (uint64_t(cur[11]) << 24); - case 11: get<1>(h) += (uint64_t(cur[10]) << 16); - case 10: get<1>(h) += (uint64_t(cur[ 9]) << 8); - case 9: get<1>(h) += (uint64_t(cur[ 8])); - case 8: get<0>(h) += (uint64_t(cur[ 7]) << 56); - case 7: get<0>(h) += (uint64_t(cur[ 6]) << 48); - case 6: get<0>(h) += (uint64_t(cur[ 5]) << 40); - case 5: get<0>(h) += (uint64_t(cur[ 4]) << 32); - case 4: get<0>(h) += (uint64_t(cur[ 3]) << 24); - case 3: get<0>(h) += (uint64_t(cur[ 2]) << 16); - case 2: get<0>(h) += (uint64_t(cur[ 1]) << 8); - case 1: get<0>(h) += (uint64_t(cur[ 0])); - case 0: break; // nothing to add - default: assert(false); - } - - mix(h); - - return h; - } - - protected: - - static void mix(hash_triple_t& h) - { - uint64_t& a = std::get<0>(h); - uint64_t& b = std::get<1>(h); - uint64_t& c = std::get<2>(h); - - a -= b; a -= c; a ^= (c >> 43); - b -= c; b -= a; b ^= (a << 9); - c -= a; c -= b; c ^= (b >> 8); - a -= b; a -= c; a ^= (c >> 38); - b -= c; b -= a; b ^= (a << 23); - c -= a; c -= b; c ^= (b >> 5); - a -= b; a -= c; a ^= (c >> 35); - b -= c; b -= a; b ^= (a << 49); - c -= a; c -= b; c ^= (b >> 11); - a -= b; a -= c; a ^= (c >> 12); - b -= c; b -= a; b ^= (a << 18); - c -= a; c -= b; c ^= (b >> 22); - } - - seed_t m_seed; - }; - - copied_jenkins64_hasher hasher; - - - inline u_int64_t EMPHFhash(const Type &k) - { - return std::get<2>(hasher(adaptor(k))); - //return std::get<2>(emphf_hasher(adaptor(k))); - } -#endif - }; /********************************************************************************/ diff --git a/gatb-core/src/gatb/tools/collections/impl/BooPHF.hpp b/gatb-core/src/gatb/tools/collections/impl/BooPHF.hpp index 849e29cc2..5ce498ea6 100644 --- a/gatb-core/src/gatb/tools/collections/impl/BooPHF.hpp +++ b/gatb-core/src/gatb/tools/collections/impl/BooPHF.hpp @@ -28,6 +28,7 @@ #include #include +#include #include @@ -42,13 +43,192 @@ namespace collections { namespace impl { /********************************************************************************/ + +typedef std::pair byte_range_t; + +/** For some specialization (see below), we need to adapt the key type to some + * range of raw data in memory. We provide here a default adaptor that can + * be used as default template type for the MPHF class. + */ +template +struct AdaptatorDefault +{ + byte_range_t operator() (const T& t) const + { + const u_int8_t* buf = reinterpret_cast (&t); + const u_int8_t* end = buf + sizeof(T); + return byte_range_t(buf, end); + } +}; + +// from emphf, https://github.com/ot/emphf/blob/master/base_hash.hpp +// Apache License 2 +// itself was adapted from http://www.burtleburtle.net/bob/c/lookup8.c +inline uint64_t unaligned_load64(uint8_t const* from) +{ + uint64_t tmp; + memcpy(reinterpret_cast(&tmp), from, 8); + //(ot): reverse bytes in big-endian architectures + return tmp; + } +struct jenkins64_hasher { + + typedef uint64_t seed_t; + typedef uint64_t hash_t; + typedef std::tuple hash_triple_t; + + jenkins64_hasher() + {} + + jenkins64_hasher(uint64_t seed) + : m_seed(seed) + {} + + template + static jenkins64_hasher generate(Rng& rng) + { + return jenkins64_hasher(rng()); + } + + // Adapted from http://www.burtleburtle.net/bob/c/lookup8.c + hash_triple_t operator()(byte_range_t s) const + { + using std::get; + hash_triple_t h(m_seed, m_seed, 0x9e3779b97f4a7c13ULL); + + size_t len = (size_t)(s.second - s.first); + uint8_t const* cur = s.first; + uint8_t const* end = s.second; + + while (end - cur >= 24) { + get<0>(h) += unaligned_load64(cur); + cur += 8; + get<1>(h) += unaligned_load64(cur); + cur += 8; + get<2>(h) += unaligned_load64(cur); + cur += 8; + + mix(h); + } + + get<2>(h) += len; + + switch (end - cur) { + case 23: get<2>(h) += (uint64_t(cur[22]) << 56); + case 22: get<2>(h) += (uint64_t(cur[21]) << 48); + case 21: get<2>(h) += (uint64_t(cur[20]) << 40); + case 20: get<2>(h) += (uint64_t(cur[19]) << 32); + case 19: get<2>(h) += (uint64_t(cur[18]) << 24); + case 18: get<2>(h) += (uint64_t(cur[17]) << 16); + case 17: get<2>(h) += (uint64_t(cur[16]) << 8); + // the first byte of c is reserved for the length + case 16: get<1>(h) += (uint64_t(cur[15]) << 56); + case 15: get<1>(h) += (uint64_t(cur[14]) << 48); + case 14: get<1>(h) += (uint64_t(cur[13]) << 40); + case 13: get<1>(h) += (uint64_t(cur[12]) << 32); + case 12: get<1>(h) += (uint64_t(cur[11]) << 24); + case 11: get<1>(h) += (uint64_t(cur[10]) << 16); + case 10: get<1>(h) += (uint64_t(cur[ 9]) << 8); + case 9: get<1>(h) += (uint64_t(cur[ 8])); + case 8: get<0>(h) += (uint64_t(cur[ 7]) << 56); + case 7: get<0>(h) += (uint64_t(cur[ 6]) << 48); + case 6: get<0>(h) += (uint64_t(cur[ 5]) << 40); + case 5: get<0>(h) += (uint64_t(cur[ 4]) << 32); + case 4: get<0>(h) += (uint64_t(cur[ 3]) << 24); + case 3: get<0>(h) += (uint64_t(cur[ 2]) << 16); + case 2: get<0>(h) += (uint64_t(cur[ 1]) << 8); + case 1: get<0>(h) += (uint64_t(cur[ 0])); + case 0: break; // nothing to add + default: assert(false); + } + + mix(h); + + return h; + } + + // rehash a hash triple + hash_triple_t operator()(hash_triple_t h) const + { + std::get<0>(h) += m_seed; + std::get<1>(h) += m_seed; + std::get<2>(h) += 0x9e3779b97f4a7c13ULL; + + mix(h); + + return h; + } + + void swap(jenkins64_hasher& other) + { + std::swap(m_seed, other.m_seed); + } + + void save(std::ostream& os) const + { + os.write(reinterpret_cast(&m_seed), sizeof(m_seed)); + } + + void load(std::istream& is) + { + is.read(reinterpret_cast(&m_seed), sizeof(m_seed)); + } + + seed_t seed() const + { + return m_seed; + } + + protected: + + static void mix(hash_triple_t& h) + { + uint64_t& a = std::get<0>(h); + uint64_t& b = std::get<1>(h); + uint64_t& c = std::get<2>(h); + + a -= b; a -= c; a ^= (c >> 43); + b -= c; b -= a; b ^= (a << 9); + c -= a; c -= b; c ^= (b >> 8); + a -= b; a -= c; a ^= (c >> 38); + b -= c; b -= a; b ^= (a << 23); + c -= a; c -= b; c ^= (b >> 5); + a -= b; a -= c; a ^= (c >> 35); + b -= c; b -= a; b ^= (a << 49); + c -= a; c -= b; c ^= (b >> 11); + a -= b; a -= c; a ^= (c >> 12); + b -= c; b -= a; b ^= (a << 18); + c -= a; c -= b; c ^= (b >> 22); + } + + seed_t m_seed; +}; + + + /** \brief Minimal Perfect Hash Function * * This is a specialization of the MPHF class for exist=true. * It uses BooPHF for the implementation and is most a wrapper between BooPHF and * GATB-CORE concepts. */ -template +/** \brief Perfect minimal hash function for a given kind of key + * + * This class provides an interface for getting hash codes for some key type T, which + * can be done through the operator() method + * + * This class is not a classic hash feature because it hashes only a given set of T items + * (provided as a T iterator) through its 'build' method. Once building is done, hash code + * can be accessed through the operator() + * + * We propose here a default implementation that doesn't do much. The idea behind is that + * we can specialize the class for the 'exist' template argument in order to provide a true + * implementation (through EMPHF library for instance). If such an implementation exists, + * the constant 'enabled' will be true, which allows to test it in the code (it is a little + * bit better than using compilation flag). + */ + +template, class Progress=tools::misc::impl::ProgressNone> class BooPHF : public system::SmartPointer { private: @@ -56,7 +236,7 @@ class BooPHF : public system::SmartPointer // a hash wrapper that calls emphf's hasher to produce, given an element, a single hash value for BooPHF class hasher_t { - typedef emphf::jenkins64_hasher BaseHasher; + typedef jenkins64_hasher BaseHasher; BaseHasher emphf_hasher; Adaptator adaptor; @@ -82,9 +262,6 @@ class BooPHF : public system::SmartPointer public: - /** Template specialization. */ - static const bool enabled = true; - /** Definition of a hash value. */ typedef u_int64_t Code; @@ -94,7 +271,7 @@ class BooPHF : public system::SmartPointer /** Build the hash function from a set of items. * \param[in] iterable : keys iterator * \param[in] progress : object that listens to the event of the algorithm */ - void build (tools::collections::Iterable* iterable, int nbThreads, tools::dp::IteratorListener* progress=0) + void build (tools::collections::Iterable* iterable, int nbThreads = 1, tools::dp::IteratorListener* progress=0) { if (isBuilt==true) { throw system::Exception ("MFHP: built already done"); } @@ -104,7 +281,7 @@ class BooPHF : public system::SmartPointer size_t nbElts = iterable->getNbItems(); - iterator_wrapper kmers (iter); // TODO use EMPHF's to prevent code duplication, or actually, put it in MPHFWrapper. + iterator_wrapper kmers (iter); bool withprogress = true; @@ -136,7 +313,6 @@ class BooPHF : public system::SmartPointer { /** We need an input stream for the given collection given by group/name. */ tools::storage::impl::Storage::istream is (group, name); - /** We load the emphf object from the input stream. */ bphf = boophf_t(); bphf.load (is); return size(); @@ -148,7 +324,6 @@ class BooPHF : public system::SmartPointer { /** We need an output stream for the given collection given by group/name. */ tools::storage::impl::Storage::ostream os (group, name); - /** We save the emphf object to the output stream. */ bphf.save (os); /** We set the number of keys as an attribute of the group. */ group.addProperty ("nb_keys", misc::impl::Stringify().format("%d",nbKeys)); // FIXME: maybe overflow here diff --git a/gatb-core/src/gatb/tools/collections/impl/EMPHF.hpp b/gatb-core/src/gatb/tools/collections/impl/EMPHF.hpp deleted file mode 100644 index 6e8626010..000000000 --- a/gatb-core/src/gatb/tools/collections/impl/EMPHF.hpp +++ /dev/null @@ -1,232 +0,0 @@ -/***************************************************************************** - * GATB : Genome Assembly Tool Box - * Copyright (C) 2014 INRIA - * Authors: R.Chikhi, G.Rizk, E.Drezen - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . -*****************************************************************************/ - -/** \file EMPHF.hpp - * \date 01/03/2013 - * \author edrezen - * \brief Minimal Perfect Hash Function - */ - -#ifndef _GATB_CORE_TOOLS_MISC_IMPL_EMPHF_HPP_ -#define _GATB_CORE_TOOLS_MISC_IMPL_EMPHF_HPP_ - -/********************************************************************************/ - -#include -#include - -#include -#include -#include - -//#define USE_HEM 1 -// **warning**: HEM is buggy with k=21; it just is. I have not investigated why because we switched to boophf. -// otherwise, HEM, is faster to construct. will use more memory and might have slower queries. - -#ifdef USE_HEM -#include -#else -#include -#include -#endif -/********************************************************************************/ -namespace gatb { -namespace core { -namespace tools { -namespace collections { -namespace impl { -/********************************************************************************/ - -/** \brief Minimal Perfect Hash Function - * - * This is a specialization of the MPHF class for exist=true. - * It uses EMPHF for the implementation and is most a wrapper between EMPHF and - * GATB-CORE concepts. - */ -template -class EMPHF : public system::SmartPointer -{ -private: - -#ifdef USE_HEM - typedef emphf::mphf_hem mphf_t; -#else - // adapted from compute_mphf_scan_mmap.cpp - typedef emphf::hypergraph_sorter_scan HypergraphSorter32; - typedef emphf::hypergraph_sorter_scan HypergraphSorter64; - typedef emphf::jenkins64_hasher BaseHasher; - typedef emphf::mphf mphf_t; -#endif -public: - - /** Template specialization. */ - static const bool enabled = true; - - /** Definition of a hash value. */ - typedef u_int64_t Code; - - /** Constructor. */ - EMPHF () : isBuilt(false), nbKeys(0) {} - - /** Build the hash function from a set of items. - * \param[in] iterable : keys iterator - * \param[in] progress : object that listens to the event of the algorithm */ - void build (tools::collections::Iterable* iterable, int nbThreads = 1, tools::dp::IteratorListener* progress=0) - { - if (isBuilt==true) { throw system::Exception ("MFHP: built already done"); } - - /** We create an iterator from the iterable. */ - tools::dp::Iterator* iter = iterable->iterator(); - LOCAL (iter); - - size_t nbElts = iterable->getNbItems(); - - // a small fix, emphf for 2 nodes doesn't seem to work - if (nbElts <= 2) { nbElts = 3; } - if (nbElts <= 3) { std::cout << "Warning: MPHF has a tiny amount of elements (" << nbElts << "), might not work correctly." << std::endl; } - - iterator_wrapper kmers (iter); - - // We may have no provided listener => use default one. - if (progress==0) { progress = new tools::dp::IteratorListener; } - LOCAL (progress); - -#ifdef USE_HEM - emphf::mmap_memory_model mm; - mphf_t(mm, nbElts, kmers, adaptor, progress).swap(mphf); -#else - size_t max_nodes = (size_t(std::ceil(double(nbElts) * 1.23)) + 2) / 3 * 3; - if (max_nodes >= uint64_t(1) << 32) - { - HypergraphSorter64 sorter; - mphf_t(sorter, nbElts, kmers, adaptor, progress).swap(mphf); - } - else - { - HypergraphSorter32 sorter; - mphf_t(sorter, nbElts, kmers, adaptor, progress).swap(mphf); - } -#endif - - isBuilt = true; - nbKeys = iterable->getNbItems(); - } - - /** Returns the hash code for the given key. WARNING : default implementation here will - * throw an exception. - * \param[in] key : the key to be hashed - * \return the hash value. */ - Code operator () (const Key& key) - { - return mphf.lookup (key, adaptor); - } - - /** Returns the number of keys. - * \return keys number */ - size_t size() const { return mphf.size(); } - - /** Load hash function from a collection*/ - size_t load (tools::storage::impl::Group& group, const std::string& name) - { - /** We need an input stream for the given collection given by group/name. */ - tools::storage::impl::Storage::istream is (group, name); - /** We load the emphf object from the input stream. */ - mphf.load (is); - /** We return the number of keys. */ - return mphf.size(); - } - - /** Save hash function to a collection - * \return the number of bytes of the saved data. */ - size_t save (tools::storage::impl::Group& group, const std::string& name) - { - /** We need an output stream for the given collection given by group/name. */ - tools::storage::impl::Storage::ostream os (group, name); - /** We save the emphf object to the output stream. */ - mphf.save (os); - /** We set the number of keys as an attribute of the group. */ - group.addProperty ("nb_keys", misc::impl::Stringify().format("%d",nbKeys)); // FIXME: maybe overflow here - return os.tellp(); - } - -private: - - mphf_t mphf; - Adaptator adaptor; - bool isBuilt; - size_t nbKeys; - -private: - - class iterator_adaptator : public std::iterator - { - public: - iterator_adaptator() : iterator(0), pos(0) {} - - iterator_adaptator(tools::dp::Iterator* iterator) : iterator(iterator), pos(0) { iterator->first(); } - - Key const& operator*() { return iterator->item(); } - - iterator_adaptator& operator++() - { - iterator->next(); - pos++; - if (iterator->isDone()) - { - iterator = nullptr; - pos = 0; - } - return *this; - } - - friend bool operator==(iterator_adaptator const& lhs, iterator_adaptator const& rhs) - { - if (!lhs.iterator || !rhs.iterator) { if (!lhs.iterator && !rhs.iterator) { return true; } else { return false; } } - return rhs.pos == lhs.pos; - } - - friend bool operator!=(iterator_adaptator const& lhs, iterator_adaptator const& rhs) { return !(lhs == rhs); } - - private: - tools::dp::Iterator* iterator; - unsigned long pos; - }; - - class iterator_wrapper - { - public: - iterator_wrapper (tools::dp::Iterator* iterator) : iterator(iterator) {} - - iterator_adaptator begin() const { return iterator_adaptator (iterator); } - iterator_adaptator end () const { return iterator_adaptator (); } - size_t size () const { return 0; } - - private: - // noncopyble - iterator_wrapper(iterator_wrapper const&); - iterator_wrapper& operator=(iterator_wrapper const&); - tools::dp::Iterator* iterator; - }; -}; - -/********************************************************************************/ -} } } } } /* end of namespaces. */ -/********************************************************************************/ - -#endif /* _GATB_CORE_TOOLS_MISC_IMPL_EMPHF_HPP_ */ diff --git a/gatb-core/src/gatb/tools/collections/impl/MPHF.hpp b/gatb-core/src/gatb/tools/collections/impl/MPHF.hpp deleted file mode 100644 index aa02f5f51..000000000 --- a/gatb-core/src/gatb/tools/collections/impl/MPHF.hpp +++ /dev/null @@ -1,152 +0,0 @@ -/***************************************************************************** - * GATB : Genome Assembly Tool Box - * Copyright (C) 2014 INRIA - * Authors: R.Chikhi, G.Rizk, E.Drezen - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . -*****************************************************************************/ - -/** \file MPHF.hpp - * \date 01/03/2013 - * \author edrezen - * \brief Minimal Perfect Hash Function - */ - -#ifndef _GATB_CORE_TOOLS_MISC_MPHF_HPP_ -#define _GATB_CORE_TOOLS_MISC_MPHF_HPP_ - -/********************************************************************************/ - -#include -#include -#include -#include -#include - -/********************************************************************************/ -namespace gatb { -namespace core { -namespace tools { -namespace collections { -namespace impl { -/********************************************************************************/ - -typedef std::pair byte_range_t; - -/** For some specialization (see below), we need to adapt the key type to some - * range of raw data in memory. We provide here a default adaptor that can - * be used as default template type for the MPHF class. - */ -template -struct AdaptatorDefault -{ - byte_range_t operator() (const T& t) const - { - const u_int8_t* buf = reinterpret_cast (&t); - const u_int8_t* end = buf + sizeof(T); - return byte_range_t(buf, end); - } -}; - -/********************************************************************************/ - -/** \brief Perfect minimal hash function for a given kind of key - * - * This class provides an interface for getting hash codes for some key type T, which - * can be done through the operator() method - * - * This class is not a classic hash feature because it hashes only a given set of T items - * (provided as a T iterator) through its 'build' method. Once building is done, hash code - * can be accessed through the operator() - * - * We propose here a default implementation that doesn't do much. The idea behind is that - * we can specialize the class for the 'exist' template argument in order to provide a true - * implementation (through EMPHF library for instance). If such an implementation exists, - * the constant 'enabled' will be true, which allows to test it in the code (it is a little - * bit better than using compilation flag). - */ -template, class Progress=tools::misc::impl::ProgressNone, bool exist=true> -class MPHF : public system::SmartPointer -{ -public: - /** Constant telling whether the feature is enabled or not. - * - * - if not enabled, calls to methods may return an exception. - * - * - if enabled, the implementation should be done through a specialization - * of the 'exist' template parameter; such an implementation can be conditionally - * compiled through a compilation flag, so if the implementation is not available - * on some os/architecture, we switch back to the 'not enabled case' (see EMPHF - * for instance) - */ - static const bool enabled = false; - - /** Definition of a hash value. */ - typedef u_int64_t Code; - - tools::misc::MPHFKind mphfKind; - - /** Constructor. */ - MPHF (tools::misc::MPHFKind mphfKind) : mphfKind(mphfKind) {} - - /** Constructor. */ - MPHF () : mphfKind(tools::misc::MPHF_BOOPHF /* boophf is best mphf; so it deserves default*/) {} - - /** Constructor. */ - MPHF (tools::dp::Iterator* iterator, size_t nbElts) { } - - /** Constructor. */ - MPHF (tools::collections::Iterable* iterable) { } - - /** Build the hash function from a set of items. - * \param[in] iterable : keys iterator - * \param[in] nb threads - * \param[in] progress : object that listens to the event of the algorithm */ - void build (tools::collections::Iterable* iterable, int nbThreads = 1, tools::dp::IteratorListener* progress=0) { error(); } - - /** Returns the hash code for the given key. WARNING : default implementation here will - * throw an exception. - * \param[in] key : the key to be hashed - * \return the hash value. */ - Code operator () (const Key& key) { error(); return Code(); } - - /** Returns the number of keys. - * \return keys number */ - size_t size() const { error(); return 0; } - - /** Load hash function from a collection - * \return the number of keys. */ - size_t load (tools::storage::impl::Group& group, const std::string& name) { error(); return 0; } - - /** Save hash function to a collection - * \return the number of bytes of the saved data. */ - size_t save (tools::storage::impl::Group& group, const std::string& name) { error(); return 0; } - -private: - - /** Default error management. */ - void error () const { printf("MPHF error\n"); throw gatb::core::system::ExceptionNotImplemented(); } -}; - -/********************************************************************************/ -} } } } } /* end of namespaces. */ -/********************************************************************************/ - -/** NOW HERE THE TRICK... We include the wrapper (for EMPHF and BooPHF implementations) if allowed by compilation flag. */ -#ifdef WITH_MPHF - #include -#endif -/* sooo many indirections for the MPHF code.. MPHFAlgorithm > MapMPHF > MPHF > MPHFWrapper > EMPHF/BooMHF. could perhaps simplify someday? */ - -#endif /* _GATB_CORE_TOOLS_MISC_IMPL_MPHF_HPP_ */ diff --git a/gatb-core/src/gatb/tools/collections/impl/MPHFWrapper.hpp b/gatb-core/src/gatb/tools/collections/impl/MPHFWrapper.hpp deleted file mode 100644 index 9cadbc84a..000000000 --- a/gatb-core/src/gatb/tools/collections/impl/MPHFWrapper.hpp +++ /dev/null @@ -1,151 +0,0 @@ -/***************************************************************************** - * GATB : Genome Assembly Tool Box - * Copyright (C) 2014 INRIA - * Authors: R.Chikhi, G.Rizk, E.Drezen - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . -*****************************************************************************/ - -/** \file EMPHF.hpp - * \date 01/03/2013 - * \author edrezen - * \brief Minimal Perfect Hash Function - */ - -#ifndef _GATB_CORE_TOOLS_MISC_IMPL_MPHFWRAPPER_HPP_ -#define _GATB_CORE_TOOLS_MISC_IMPL_MPHFWRAPPER_HPP_ - -/********************************************************************************/ - -#include -#include -#include // for MPHFKind - -#include -#include - -/********************************************************************************/ -namespace gatb { -namespace core { -namespace tools { -namespace collections { -namespace impl { -/********************************************************************************/ - -/** \brief Minimal Perfect Hash Function - * - * that's a wrapper for emphf or boophf - */ -template -class MPHF : public system::SmartPointer -{ -private: - -public: - - /** Template specialization. */ - static const bool enabled = true; - - /** Definition of a hash value. */ - typedef u_int64_t Code; - - tools::misc::MPHFKind mphfKind; - - /** Constructor. */ - MPHF (tools::misc::MPHFKind mphfKind) : mphfKind(mphfKind) {} - - /** Constructor. */ - MPHF () : mphfKind(tools::misc::MPHF_BOOPHF /* boophf is best mphf; so it deserves default*/) {} - - /** Build the hash function from a set of items. - * \param[in] iterable : keys iterator - * \param[in] progress : object that listens to the event of the algorithm */ - void build (tools::collections::Iterable* iterable, int nbThreads = 1, tools::dp::IteratorListener* progress=0) - { - if (mphfKind == tools::misc::MPHF_EMPHF) - emphf.build(iterable, nbThreads, progress); - else - { - if (mphfKind == tools::misc::MPHF_BOOPHF) - boophf.build(iterable, nbThreads, progress); - else - std::cout << "Error: building MPHF of wrong kind (debug: " << (unsigned int)mphfKind << ")" << std::endl; - } - } - - /** Returns the hash code for the given key. WARNING : default implementation here will - * throw an exception. - * \param[in] key : the key to be hashed - * \return the hash value. */ - Code operator () (const Key& key) - { - if (mphfKind == tools::misc::MPHF_EMPHF) - return emphf(key); - if (mphfKind == tools::misc::MPHF_BOOPHF) - return boophf(key); - return 0; - } - - /** Returns the number of keys. - * \return keys number */ - size_t size() const { - if (mphfKind == tools::misc::MPHF_EMPHF) - return emphf.size(); - if (mphfKind == tools::misc::MPHF_BOOPHF) - return boophf.size(); - - std::cout << "Error: size of MPHF of wrong kind (debug: " << (unsigned int)mphfKind << ")" << std::endl; - return 0; - } - - /** Load hash function from a collection*/ - size_t load (tools::storage::impl::Group& group, const std::string& name) - { - if (mphfKind == tools::misc::MPHF_EMPHF) - return emphf.load(group,name); - if (mphfKind == tools::misc::MPHF_BOOPHF) - return boophf.load(group,name); - - std::cout << "Error: loading MPHF of wrong kind (debug: " << (unsigned int)mphfKind << ")" << std::endl; - return 0; - } - - /** Save hash function to a collection - * \return the number of bytes of the saved data. */ - size_t save (tools::storage::impl::Group& group, const std::string& name) - { - if (mphfKind == tools::misc::MPHF_EMPHF) - return emphf.save(group,name); - if (mphfKind == tools::misc::MPHF_BOOPHF) - return boophf.save(group,name); - - std::cout << "Error: loading MPHF of wrong kind (debug: " << (unsigned int)mphfKind << ")" << std::endl; - return 0; - } - -private: - - /** we will use alternatively one or the other; - * just having them as variable doesn't use any memory as long as build() isn't called*/ - - EMPHF emphf; - BooPHF boophf; - -}; - -/********************************************************************************/ -} } } } } /* end of namespaces. */ -/********************************************************************************/ - -#endif /* _GATB_CORE_TOOLS_MISC_IMPL_EMPHF_HPP_ */ diff --git a/gatb-core/src/gatb/tools/collections/impl/MapMPHF.hpp b/gatb-core/src/gatb/tools/collections/impl/MapMPHF.hpp index 1a15f1444..5aaefadff 100644 --- a/gatb-core/src/gatb/tools/collections/impl/MapMPHF.hpp +++ b/gatb-core/src/gatb/tools/collections/impl/MapMPHF.hpp @@ -29,7 +29,7 @@ /********************************************************************************/ #include -#include +#include #include #include @@ -46,7 +46,7 @@ namespace impl { * This hash table implementation uses a minimal perfect hash function (MPHF) for * identifying the keys with a unique number in [0..N-1] where N is the number of items. * - * If the EMPHF library is used, the memory usage is about 2.61 bits per key. + * Using BooPHF, the memory usage is about 3-4 bits per key. * * The values can be stored in a simple vector. The keys are not stored in memory, only * the mphf is needed. @@ -59,13 +59,7 @@ class MapMPHF : public system::SmartPointer public: /** Hash type. */ - typedef MPHF Hash; - - /** Constant telling whether the feature is enabled or not. */ - static const bool enabled = Hash::enabled; - - /** Default constructor. */ - MapMPHF (tools::misc::MPHFKind mphfKind) : hash(mphfKind) {} + typedef BooPHF Hash; /** Default constructor. */ MapMPHF () : hash() {} diff --git a/gatb-core/src/gatb/tools/misc/api/Enums.hpp b/gatb-core/src/gatb/tools/misc/api/Enums.hpp index 5b2c61ce0..9649a8b15 100644 --- a/gatb-core/src/gatb/tools/misc/api/Enums.hpp +++ b/gatb-core/src/gatb/tools/misc/api/Enums.hpp @@ -223,44 +223,6 @@ static std::string toString (BranchingKind kind) /********************************************************************************/ -/** Enumeration for the different kinds of Minimal Perfect Hash Function algorithm supported in GATB. */ -enum MPHFKind -{ - /** No MPHF */ - MPHF_NONE, - /** Usage of EMPHF library */ - MPHF_EMPHF, - /** Usage of BooPHF library */ - MPHF_BOOPHF -}; - -/** Get the enum from a string. - * \param[in] s : string to be parsed - * \param[out] kind : enum to be set from the string parsing. */ -static void parse (const std::string& s, MPHFKind& kind) -{ - if (s == "none") { kind = MPHF_NONE; } - else if (s == "emphf") { kind = MPHF_EMPHF; } - else if (s == "boophf" || s == "BooPHF" ) { kind = MPHF_BOOPHF; } - else { throw system::Exception ("bad mphf kind '%s'", s.c_str()); } -} - -/** Get the string associated to an enum - * \param[in] kind : the enum value - * \return the associated string */ -static std::string toString (MPHFKind kind) -{ - switch (kind) - { - case MPHF_NONE: return "none"; - case MPHF_EMPHF: return "emphf"; - case MPHF_BOOPHF: return "BooPHF"; - default: throw system::Exception ("bad mphf kind %d", kind); - } -} - -/********************************************************************************/ - /** Enumeration for the different kinds of kmer solidity criteria supported in GATB. */ enum KmerSolidityKind { diff --git a/gatb-core/src/gatb/tools/misc/api/StringsRepository.hpp b/gatb-core/src/gatb/tools/misc/api/StringsRepository.hpp index 18918dc88..802e35d91 100644 --- a/gatb-core/src/gatb/tools/misc/api/StringsRepository.hpp +++ b/gatb-core/src/gatb/tools/misc/api/StringsRepository.hpp @@ -109,7 +109,6 @@ class StringRepository const char* debloom_impl () { return "-debloom-impl"; } const char* branching_type () { return "-branching-nodes";} const char* topology_stats () { return "-topology-stats";} - const char* mphf_type () { return "-mphf";} const char* uri_solid_kmers() { return "-solid-kmers-out"; } const char* bank_convert_type () { return "-bank-convert"; } const char* integer_precision () { return "-integer-precision";} @@ -163,7 +162,6 @@ class StringRepository #define STR_DEBLOOM_IMPL gatb::core::tools::misc::StringRepository::singleton().debloom_impl() #define STR_BRANCHING_TYPE gatb::core::tools::misc::StringRepository::singleton().branching_type() #define STR_TOPOLOGY_STATS gatb::core::tools::misc::StringRepository::singleton().topology_stats() -#define STR_MPHF_TYPE gatb::core::tools::misc::StringRepository::singleton().mphf_type() #define STR_URI_SOLID_KMERS gatb::core::tools::misc::StringRepository::singleton().uri_solid_kmers() #define STR_BANK_CONVERT_TYPE gatb::core::tools::misc::StringRepository::singleton().bank_convert_type() #define STR_SOLIDITY_KIND gatb::core::tools::misc::StringRepository::singleton().solidity_kind() diff --git a/gatb-core/src/gatb/tools/misc/impl/Tool.cpp b/gatb-core/src/gatb/tools/misc/impl/Tool.cpp index 62b9f6e9a..bea289fed 100644 --- a/gatb-core/src/gatb/tools/misc/impl/Tool.cpp +++ b/gatb-core/src/gatb/tools/misc/impl/Tool.cpp @@ -45,7 +45,7 @@ namespace gatb { namespace core { namespace tools { namespace misc { namespac ** RETURN : ** REMARKS : *********************************************************************/ -Tool::Tool (const std::string& name) : _name(name), _input(0), _output(0), _info(0), _parser(0), _dispatcher(0),userDisplayHelp(0),userDisplayVersion(0),_helpTarget(0),_versionTarget(0) +Tool::Tool (const std::string& name) : userDisplayHelp(0), _helpTarget(0),userDisplayVersion(0), _versionTarget(0), _name(name), _input(0), _output(0), _info(0), _parser(0), _dispatcher(0) { setOutput (new Properties()); diff --git a/gatb-core/src/gatb/tools/storage/impl/StorageTools.cpp b/gatb-core/src/gatb/tools/storage/impl/StorageTools.cpp index 9a7430dcc..9cb9dc45e 100644 --- a/gatb-core/src/gatb/tools/storage/impl/StorageTools.cpp +++ b/gatb-core/src/gatb/tools/storage/impl/StorageTools.cpp @@ -1,4 +1,3 @@ -#ifdef WITH_MPHF /***************************************************************************** * GATB : Genome Assembly Tool Box * Copyright (C) 2014 INRIA @@ -31,5 +30,3 @@ namespace impl { /********************************************************************************/ } } } } } /* end of namespaces. */ /********************************************************************************/ -#endif - diff --git a/gatb-core/test/benchmark/bench_graph.cpp b/gatb-core/test/benchmark/bench_graph.cpp index 770f39ed9..f482b5ffa 100644 --- a/gatb-core/test/benchmark/bench_graph.cpp +++ b/gatb-core/test/benchmark/bench_graph.cpp @@ -1,8 +1,6 @@ /* this is the most advanced benchmark i've coded (more recent than bench_mphf, and some benchmarks overlap) * */ -#define WITH_MPHF 1 // funny story: even though gatb-core defines WITH_MPHF in its cmakefile, this isn't taken into account in compiling benchmarks. haha. i'm sad it took me so many minutes to find. - #include #define get_wtime() chrono::system_clock::now() #define diff_wtime(x,y) chrono::duration_cast(y - x).count() @@ -390,7 +388,7 @@ void debruijn_mphf () { for (size_t j=0; j (kmerSizes[j], Parameter( kmerSizes[j], args, sequences[i]) ); } @@ -412,7 +410,7 @@ int main (int argc, char* argv[]) if (argc > 2) k = stoi(argv[2]); - string args = "-in " + string(argv[1]) + " -kmer-size " + std::to_string(k) + " -abundance-min 1 -verbose 0 -max-memory 500 -mphf emphf"; + string args = "-in " + string(argv[1]) + " -kmer-size " + std::to_string(k) + " -abundance-min 1 -verbose 0 -max-memory 500"; Integer::apply (k, Parameter( k, args) ); } diff --git a/gatb-core/test/benchmark/bench_mphf.cpp b/gatb-core/test/benchmark/bench_mphf.cpp index 6729e6778..b2ec627d3 100644 --- a/gatb-core/test/benchmark/bench_mphf.cpp +++ b/gatb-core/test/benchmark/bench_mphf.cpp @@ -208,7 +208,7 @@ void debruijn_mphf () /** We create the graph. */ Graph graph = Graph::create ( new BankStrings (sequences[i], 0), - "-kmer-size %d -abundance-min 1 -verbose 0 -max-memory %d -mphf emphf", kmerSizes[j], 500 + "-kmer-size %d -abundance-min 1 -verbose 0 -max-memory %d", kmerSizes[j], 500 ); Integer::apply (kmerSizes[j], Parameter( kmerSizes[j], graph) ); @@ -234,7 +234,7 @@ int main (int argc, char* argv[]) if (argc > 2) k = stoi(argv[2]); - string args = "-in " + string(argv[1]) + " -kmer-size " + std::to_string(k) + " -abundance-min 1 -verbose 0 -max-memory 500 -mphf emphf"; + string args = "-in " + string(argv[1]) + " -kmer-size " + std::to_string(k) + " -abundance-min 1 -verbose 0 -max-memory 500"; Graph graph = Graph::create (args.c_str()); cout << "graph built, benchmarking.." << endl; Integer::apply (k, Parameter( k, graph) ); diff --git a/gatb-core/test/unit/src/debruijn/TestDebruijn.cpp b/gatb-core/test/unit/src/debruijn/TestDebruijn.cpp index 26e65b656..281517afc 100644 --- a/gatb-core/test/unit/src/debruijn/TestDebruijn.cpp +++ b/gatb-core/test/unit/src/debruijn/TestDebruijn.cpp @@ -104,10 +104,8 @@ class TestDebruijn : public Test // CPPUNIT_TEST_GATB (debruijn_mutation); // has been removed due to it crashing clang, and since mutate() isn't really used in apps, i didn't bother. CPPUNIT_TEST_GATB (debruijn_build); CPPUNIT_TEST_GATB (debruijn_checkbranching); -#ifdef WITH_MPHF CPPUNIT_TEST_GATB (debruijn_mphf); CPPUNIT_TEST_GATB (debruijn_mphf_nodeindex); -#endif CPPUNIT_TEST_GATB (debruijn_traversal1); CPPUNIT_TEST_SUITE_GATB_END(); @@ -484,13 +482,10 @@ class TestDebruijn : public Test void debruijn_test7 () { /** We create the graph. */ -#ifdef WITH_MPHF + // useless historical note: // emphf had a known bug where, when there are only like a tiny amount of elements (I tested with three), it will just return mphf(elt)=0 always. - // so this is why I'm adding the dummy "ACTGACTGACTGACTG" sequence, to artificially increase the amount of elements in the mphf - Graph graph = Graph::create (new BankStrings ("AGGCGC", "ACTGACTGACTGACTG",0), "-kmer-size 5 -abundance-min 1 -verbose 0 -max-memory %d -mphf emphf", MAX_MEMORY); -#else + // so this is why I added the dummy "ACTGACTGACTGACTG" sequence, to artificially increase the amount of elements in the mphf Graph graph = Graph::create (new BankStrings ("AGGCGC", "ACTGACTGACTGACTG",0), "-kmer-size 5 -abundance-min 1 -verbose 0 -max-memory %d", MAX_MEMORY); -#endif /** We should get two kmers: * - AGGCG / CGCCT @@ -507,13 +502,10 @@ class TestDebruijn : public Test debruijn_test7_fct fct (graph, n1, n2); graph.iterator().iterate (fct); -#ifdef WITH_MPHF /* rerun this test with adjacency information instead of bloom */ graph.precomputeAdjacency(1, false); graph.iterator().iterate (fct); -#endif - } /********************************************************************************/ @@ -796,7 +788,7 @@ class TestDebruijn : public Test size_t kmerSize = strlen (sequences[0]); // We create the graph. - Graph graph = Graph::create (new BankStrings (sequences, len), "-kmer-size %d -abundance-min 1 -verbose 0 -mphf emphf -max-memory %d", kmerSize, MAX_MEMORY); + Graph graph = Graph::create (new BankStrings (sequences, len), "-kmer-size %d -abundance-min 1 -verbose 0 -max-memory %d", kmerSize, MAX_MEMORY); GraphIterator it = graph.iterator(); @@ -854,7 +846,7 @@ class TestDebruijn : public Test size_t kmerSize = strlen (sequences[0])-1; // We create the graph. - Graph graph = Graph::create (new BankStrings (sequences, 3), "-kmer-size %d -abundance-min 1 -verbose 0 -mphf boophf -max-memory %d", kmerSize, MAX_MEMORY); + Graph graph = Graph::create (new BankStrings (sequences, 3), "-kmer-size %d -abundance-min 1 -verbose 0 -max-memory %d", kmerSize, MAX_MEMORY); GraphIterator it = graph.iterator(); @@ -1197,15 +1189,13 @@ class TestDebruijn : public Test void debruijn_deletenode () { - // MPHF has a known bug where, when there are only like a tiny amount of elements (I tested with three), it will just return mphf(elt)=0 always. - // so this is why I'm adding the dummy "ACTGACTGACTGACTG" sequence, to artificially increase the amount of elements in the mphf - Graph graph = Graph::create (new BankStrings ("AGGCGCC", "ACTGACTGACTGACTG",0), "-kmer-size 5 -abundance-min 1 -verbose 0 -max-memory %d -mphf emphf", MAX_MEMORY); + Graph graph = Graph::create (new BankStrings ("AGGCGCC", "ACTGACTGACTGACTG",0), "-kmer-size 5 -abundance-min 1 -verbose 0 -max-memory %d", MAX_MEMORY); debruijn_deletenode_fct (graph); /* rerun this test with adjacency information instead of bloom */ - Graph graph2 = Graph::create (new BankStrings ("AGGCGCC", "ACTGACTGACTGACTG",0), "-kmer-size 5 -abundance-min 1 -verbose 0 -max-memory %d -mphf emphf", MAX_MEMORY); + Graph graph2 = Graph::create (new BankStrings ("AGGCGCC", "ACTGACTGACTGACTG",0), "-kmer-size 5 -abundance-min 1 -verbose 0 -max-memory %d", MAX_MEMORY); graph2.precomputeAdjacency(1, false); debruijn_deletenode_fct (graph2); @@ -1246,13 +1236,13 @@ class TestDebruijn : public Test void debruijn_deletenode2 () { - Graph graph = Graph::create (new BankStrings ("AGGCGAAGGCGT", "ACTGACTGACTGACTG",0), "-kmer-size 5 -abundance-min 1 -verbose 0 -max-memory %d -mphf emphf", MAX_MEMORY); + Graph graph = Graph::create (new BankStrings ("AGGCGAAGGCGT", "ACTGACTGACTGACTG",0), "-kmer-size 5 -abundance-min 1 -verbose 0 -max-memory %d", MAX_MEMORY); debruijn_deletenode_fct (graph); /* rerun this test with adjacency information instead of bloom */ - Graph graph2 = Graph::create (new BankStrings ("AGGCGAAGGCGT", "ACTGACTGACTGACTG",0), "-kmer-size 5 -abundance-min 1 -verbose 0 -max-memory %d -mphf emphf", MAX_MEMORY); + Graph graph2 = Graph::create (new BankStrings ("AGGCGAAGGCGT", "ACTGACTGACTGACTG",0), "-kmer-size 5 -abundance-min 1 -verbose 0 -max-memory %d", MAX_MEMORY); graph2.precomputeAdjacency(1, false); debruijn_deletenode2_fct (graph2); diff --git a/gatb-core/test/unit/src/debruijn/TestDebruijnUnitigs.cpp b/gatb-core/test/unit/src/debruijn/TestDebruijnUnitigs.cpp index eb2f631c6..2fd4762ea 100644 --- a/gatb-core/test/unit/src/debruijn/TestDebruijnUnitigs.cpp +++ b/gatb-core/test/unit/src/debruijn/TestDebruijnUnitigs.cpp @@ -421,13 +421,10 @@ class TestDebruijnUnitigs : public Test debruijn_unitigs_test7_fct fct (graph, n1, n2); graph.iterator().iterate (fct); -#ifdef WITH_MPHF /* rerun this test with adjacency information instead of bloom */ graph.precomputeAdjacency(1, false); graph.iterator().iterate (fct); -#endif - } void debruijn_unitigs_test7_nocircular () @@ -921,8 +918,6 @@ class TestDebruijnUnitigs : public Test void debruijn_unitigs_deletenode () { - // MPHF has a known bug where, when there are only like a tiny amount of elements (I tested with three), it will just return mphf(elt)=0 always. - // so this is why I'm adding the dummy "ACTGACTGACTGACTG" sequence, to artificially increase the amount of elements in the mphf GraphUnitigs graph = GraphUnitigs::create (new BankStrings ("AGGCGCC", "ACTGACTGACTGACTG",(char*)0), "-kmer-size 5 -abundance-min 1 -verbose 0 -max-memory %d", MAX_MEMORY); debruijn_unitigs_deletenode_fct (graph); @@ -962,13 +957,13 @@ class TestDebruijnUnitigs : public Test void debruijn_unitigs_deletenode2 () { - GraphUnitigs graph = GraphUnitigs::create (new BankStrings ("AGGCGAAGGCGT", "ACTGACTGACTGACTG",(char*)0), "-kmer-size 5 -abundance-min 1 -verbose 0 -max-memory %d -mphf emphf", MAX_MEMORY); + GraphUnitigs graph = GraphUnitigs::create (new BankStrings ("AGGCGAAGGCGT", "ACTGACTGACTGACTG",(char*)0), "-kmer-size 5 -abundance-min 1 -verbose 0 -max-memory %d", MAX_MEMORY); debruijn_unitigs_deletenode_fct (graph); /* rerun this test with adjacency information instead of bloom */ - GraphUnitigs graph2 = GraphUnitigs::create (new BankStrings ("AGGCGAAGGCGT", "ACTGACTGACTGACTG",(char*)0), "-kmer-size 5 -abundance-min 1 -verbose 0 -max-memory %d -mphf emphf", MAX_MEMORY); + GraphUnitigs graph2 = GraphUnitigs::create (new BankStrings ("AGGCGAAGGCGT", "ACTGACTGACTGACTG",(char*)0), "-kmer-size 5 -abundance-min 1 -verbose 0 -max-memory %d", MAX_MEMORY); graph2.precomputeAdjacency(1, false); debruijn_unitigs_deletenode2_fct (graph2); diff --git a/gatb-core/test/unit/src/debruijn/TestSimplifications.cpp b/gatb-core/test/unit/src/debruijn/TestSimplifications.cpp index 1c1f4896d..3831bc780 100644 --- a/gatb-core/test/unit/src/debruijn/TestSimplifications.cpp +++ b/gatb-core/test/unit/src/debruijn/TestSimplifications.cpp @@ -89,13 +89,10 @@ class TestSimplifications : public Test { /********************************************************************************/ CPPUNIT_TEST_SUITE_GATB (TestSimplifications); -// they all need the mphf -#ifdef WITH_MPHF CPPUNIT_TEST_GATB (debruijn_simpl_X); CPPUNIT_TEST_GATB (debruijn_simpl_tip); CPPUNIT_TEST_GATB (debruijn_simpl_bubble); CPPUNIT_TEST_GATB (debruijn_simpl_ec); -#endif CPPUNIT_TEST_SUITE_GATB_END(); public: diff --git a/gatb-core/test/unit/src/debruijn/TestSimplificationsUnitigs.cpp b/gatb-core/test/unit/src/debruijn/TestSimplificationsUnitigs.cpp index 815986289..bc5fafa4f 100644 --- a/gatb-core/test/unit/src/debruijn/TestSimplificationsUnitigs.cpp +++ b/gatb-core/test/unit/src/debruijn/TestSimplificationsUnitigs.cpp @@ -91,13 +91,10 @@ class TestSimplificationsUnitigs : public Test /********************************************************************************/ CPPUNIT_TEST_SUITE_GATB (TestSimplificationsUnitigs); -// they all need the mphf -#ifdef WITH_MPHF CPPUNIT_TEST_GATB (debruijn_simplunitigs_ec); CPPUNIT_TEST_GATB (debruijn_simplunitigs_X); CPPUNIT_TEST_GATB (debruijn_simplunitigs_tip); CPPUNIT_TEST_GATB (debruijn_simplunitigs_bubble); -#endif CPPUNIT_TEST_SUITE_GATB_END(); public: diff --git a/gatb-core/test/unit/src/kmer/TestMPHF.cpp b/gatb-core/test/unit/src/kmer/TestMPHF.cpp index c436288b8..50bbefb49 100644 --- a/gatb-core/test/unit/src/kmer/TestMPHF.cpp +++ b/gatb-core/test/unit/src/kmer/TestMPHF.cpp @@ -33,7 +33,7 @@ #include -#include +#include using namespace std; @@ -72,7 +72,7 @@ class TestMPHF : public Test CPPUNIT_TEST_GATB (MPHF_check1); CPPUNIT_TEST_GATB (MPHF_check2); - CPPUNIT_TEST_GATB (test_mphf1); + // no mphf1 anymore CPPUNIT_TEST_GATB (test_mphf2); CPPUNIT_TEST_SUITE_GATB_END(); @@ -94,10 +94,6 @@ class TestMPHF : public Test /********************************************************************************/ void MPHF_check1 () { - MPHFKind mphfKind = MPHF_BOOPHF; // TODO: test with emphf also - - if (MPHFAlgorithm<>::AbundanceMap::enabled == false) { std::cout << "can't test mphf, it is disabled" << std::endl; return; } - size_t kmerSize = 11; size_t nks = 1; @@ -130,7 +126,7 @@ class TestMPHF : public Test /** We create a mphf instance. */ - MPHFAlgorithm<> mphf (mphfKind, storage->getGroup("dsk"), "mphf", sortingCount.getSolidCounts(), sortingCount.getSolidKmers(), 1, true); + MPHFAlgorithm<> mphf (storage->getGroup("dsk"), "mphf", sortingCount.getSolidCounts(), sortingCount.getSolidKmers(), 1, true); /** We actually execute the mphf construction. */ mphf.execute(); @@ -167,10 +163,7 @@ class TestMPHF : public Test void MPHF_check2 () { /** We define our MPHF type for kmers. */ - typedef MPHF MPHF; - - /** We check that we can use such a type. */ - if (MPHF::enabled == false) { return; } + typedef BooPHF MPHF; size_t kmerSize = 11; size_t nks = 1; @@ -277,40 +270,15 @@ class TestMPHF : public Test } } - /********************************************************************************/ - void test_mphf1 (void) - { - /** Shortcuts. */ - typedef int Key; - typedef MPHF Hash; - - /** We check whether the feature is activated or not. */ - if (Hash::enabled == false) - { - /** We create the hash function. */ - Hash hash; - - try { - /** We try to get a hash value. */ - hash (3); - } - catch (gatb::core::system::ExceptionNotImplemented& e) - { - CPPUNIT_ASSERT (true); - } - } - } /********************************************************************************/ void test_mphf2 (void) { /** Shortcuts. */ typedef int Key; - typedef MPHF Hash; + typedef BooPHF Hash; typedef Hash::Code HashValue; - if (Hash::enabled == true) - { // We create a list of keys. Key values[] = {1,2,3,5,8,13,21,34,55,89}; std::list l (values, values + sizeof(values)/sizeof(values[0]) ); @@ -353,7 +321,6 @@ class TestMPHF : public Test // We check that all codes have been seen for (size_t i=0; i::enabled == false) { return; } - float val = 0; u_int8_t keysValue[] = {14, 35, 1, 9, 65, 37, 12, 24, 98, 124, 32}; diff --git a/gatb-core/thirdparty/CMakeLists.txt b/gatb-core/thirdparty/CMakeLists.txt index d7e7a983b..6e0b5c4f5 100644 --- a/gatb-core/thirdparty/CMakeLists.txt +++ b/gatb-core/thirdparty/CMakeLists.txt @@ -49,37 +49,6 @@ foreach (header ${headerfiles}) add_custom_command (TARGET hdf5_postbuild POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header} ${HDF5_INSTALL_INCLUDE_DIR} ) endforeach() -################################################################################ -# EMPHF HANDLING -################################################################################ - -IF (DEFINED use_mphf) - - ADD_SUBDIRECTORY (emphf) - - # We add a custom target for copying emphf as it is. - add_custom_target (emphf_copyasis ALL) - - SET (EMPHF_INSTALL_INCLUDE_DIR - ${PROJECT_BINARY_DIR}/include/${CMAKE_BUILD_TYPE}/emphf) - - # We define all the header files to be copied - file (GLOB headerfiles ${PROJECT_SOURCE_DIR}/thirdparty/emphf/*.hpp - ${PROJECT_SOURCE_DIR}/thirdparty/emphf/*.cpp) - - # create emphf directory - add_custom_command (TARGET emphf_copyasis COMMAND ${CMAKE_COMMAND} -E - make_directory ${EMPHF_INSTALL_INCLUDE_DIR}) - - # We copy each header file - foreach (header ${headerfiles}) - add_custom_command (TARGET emphf_copyasis - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header} ${EMPHF_INSTALL_INCLUDE_DIR} - ) - endforeach() -ENDIF() # WITH_MPHF - - # include other smaller libraries (json, Boophf) add_custom_target (thirdparty_copy ALL) @@ -97,7 +66,4 @@ IF (NOT DEFINED GATB_CORE_INSTALL_EXCLUDE) INSTALL (DIRECTORY ${PROJECT_BINARY_DIR}/include/${CMAKE_BUILD_TYPE}/hdf5 DESTINATION include) INSTALL (DIRECTORY ${PROJECT_BINARY_DIR}/include/${CMAKE_BUILD_TYPE}/json DESTINATION include) INSTALL (DIRECTORY ${PROJECT_BINARY_DIR}/include/${CMAKE_BUILD_TYPE}/BooPHF DESTINATION include) - IF (DEFINED WITH_MPHF) - INSTALL (DIRECTORY ${PROJECT_BINARY_DIR}/include/${CMAKE_BUILD_TYPE}/emphf DESTINATION include) - ENDIF() ENDIF () diff --git a/gatb-core/thirdparty/emphf/.gitignore b/gatb-core/thirdparty/emphf/.gitignore deleted file mode 100644 index dff2a8e1f..000000000 --- a/gatb-core/thirdparty/emphf/.gitignore +++ /dev/null @@ -1,14 +0,0 @@ -CMakeCache.txt -CMakeFiles/ -Makefile -*.cmake - -emphf_config.hpp -compute_mphf_hem -compute_mphf_scan -compute_mphf_scan_mmap -compute_mphf_seq -gen_synthetic_data -test_mphf -test_mphf_hem - diff --git a/gatb-core/thirdparty/emphf/CMakeLists.txt b/gatb-core/thirdparty/emphf/CMakeLists.txt deleted file mode 100644 index 400235c9a..000000000 --- a/gatb-core/thirdparty/emphf/CMakeLists.txt +++ /dev/null @@ -1,53 +0,0 @@ -cmake_minimum_required(VERSION 2.8) -project(EMPHF) - -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release") -endif() - -configure_file( - ${EMPHF_SOURCE_DIR}/emphf_config.hpp.in - ${EMPHF_SOURCE_DIR}/emphf_config.hpp) - -option(EMPHF_USE_POPCOUNT - "Use hardware popcount in hash computation (requires SSE4.2)" - OFF) - -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") -endif () - -if (UNIX) - # C++11 - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - - # Extensive warnings - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-missing-braces") - - if (EMPHF_USE_POPCOUNT) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") - endif () -else () - message (FATAL_ERROR "Unsupported platform") -endif () - -OPTION (BUILD_EXECS "Build EMPHF Executabless" OFF) - -IF (BUILD_EXECS) - -# Sequential version -add_executable(compute_mphf_seq compute_mphf_seq.cpp) - -# Sort-and-scan version (internal memory and mmap) -add_executable(compute_mphf_scan compute_mphf_scan.cpp) -add_executable(compute_mphf_scan_mmap compute_mphf_scan_mmap.cpp) - -# HEM -add_executable(compute_mphf_hem compute_mphf_hem.cpp) - -# Utilities -add_executable(test_mphf test_mphf.cpp) -add_executable(test_mphf_hem test_mphf_hem.cpp) -add_executable(gen_synthetic_data gen_synthetic_data.cpp) - -ENDIF (BUILD_EXECS) diff --git a/gatb-core/thirdparty/emphf/LICENSE b/gatb-core/thirdparty/emphf/LICENSE deleted file mode 100644 index 26f4aacf7..000000000 --- a/gatb-core/thirdparty/emphf/LICENSE +++ /dev/null @@ -1,13 +0,0 @@ -Copyright 2013 Giuseppe Ottaviano - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. diff --git a/gatb-core/thirdparty/emphf/README.md b/gatb-core/thirdparty/emphf/README.md deleted file mode 100644 index d0e461099..000000000 --- a/gatb-core/thirdparty/emphf/README.md +++ /dev/null @@ -1,48 +0,0 @@ -emphf -===== - -`emphf` is a minimal perfect hashing library for large-scale key sets focused on -speed and low memory usage. A minimal perfect hash function (MPHF) is a data -structure that maps injectively a static set of strings of size `n` to the -integer set `[0, n-1]`. The overall space usage of the MPHFs generated with -`emphf` is `2.61 n` bits plus a small constant factor. - -The algorithms used in this library are described in the paper -[*Cache-Oblivious Peeling of Random Hypergraphs*](http://arxiv.org/abs/1312.0526) -by Djamal Belazzougui, Paolo Boldi, Giuseppe Ottaviano, Rossano Venturini, and -Sebastiano Vigna. - -All the algorithms implemented here construct a MPHF for a given key set using a -standard scheme based on the Majewski–Wormald–Havas–Czech (MWHC) construction, -that is based on finding a peeling order of a random 3-hypergraph. - -* `compute_mphf_seq` computes the peeling order using the standard in-memory - linear-time peeling algorithm, implemented with the xor-trick described in the - paper. This is the fastest MPHF construction implementation that we know of. - -* `compute_mphf_scan_mmap` computes the peeling order using the new algorithm - proposed in the paper, using an mmapped temporary file for its data structure, - thus in an external-memory setting. - -* `compute_mphf_hem` uses a technique described in F. C. Botelho, R. Pagh, and - N. Ziviani, “Practical perfect hashing in nearly optimal space”, that splits - the key sets in bucket that are small enough that the perfect hash function is - easy to compute in memory for each bucket. While the construction is faster - than by using `compute_mphf_scan_mmap`, the resulting data structure takes - slightly more space and it is slightly slower. - -`compute_mphf_seq` and `compute_mphf_scan_mmap` construct the same data -structure, but the second should be used when the space needed to construct the -MPHF does not fit in main memory. The obtained MPHF data structures can perform -lookups significantly faster than other implementations using the same algorithm -(see the paper for details). - -The script `test_all.py` executes all the algorithms on a given file (or the -standard UNIX dictionary if none is provided) and checks that the generated hash -function is indeed minimal and perfect. - -The project uses CMake. To build it on Unix systems it should be sufficient to -do the following: - - $ cmake . - $ make diff --git a/gatb-core/thirdparty/emphf/base_hash.hpp b/gatb-core/thirdparty/emphf/base_hash.hpp deleted file mode 100644 index d362e5f16..000000000 --- a/gatb-core/thirdparty/emphf/base_hash.hpp +++ /dev/null @@ -1,228 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include "common.hpp" - - -namespace emphf { - - inline uint64_t unaligned_load64(uint8_t const* from) - { - uint64_t tmp; - memcpy(reinterpret_cast(&tmp), from, 8); - // XXX(ot): reverse bytes in big-endian architectures - return tmp; - } - - - struct jenkins64_hasher { - - typedef uint64_t seed_t; - typedef uint64_t hash_t; - typedef std::tuple hash_triple_t; - - jenkins64_hasher() - {} - - jenkins64_hasher(uint64_t seed) - : m_seed(seed) - {} - - template - static jenkins64_hasher generate(Rng& rng) - { - return jenkins64_hasher(rng()); - } - - // Adapted from http://www.burtleburtle.net/bob/c/lookup8.c - hash_triple_t operator()(byte_range_t s) const - { - using std::get; - hash_triple_t h(m_seed, m_seed, 0x9e3779b97f4a7c13ULL); - - size_t len = (size_t)(s.second - s.first); - uint8_t const* cur = s.first; - uint8_t const* end = s.second; - - while (end - cur >= 24) { - get<0>(h) += unaligned_load64(cur); - cur += 8; - get<1>(h) += unaligned_load64(cur); - cur += 8; - get<2>(h) += unaligned_load64(cur); - cur += 8; - - mix(h); - } - - get<2>(h) += len; - - switch (end - cur) { - case 23: get<2>(h) += (uint64_t(cur[22]) << 56); - case 22: get<2>(h) += (uint64_t(cur[21]) << 48); - case 21: get<2>(h) += (uint64_t(cur[20]) << 40); - case 20: get<2>(h) += (uint64_t(cur[19]) << 32); - case 19: get<2>(h) += (uint64_t(cur[18]) << 24); - case 18: get<2>(h) += (uint64_t(cur[17]) << 16); - case 17: get<2>(h) += (uint64_t(cur[16]) << 8); - // the first byte of c is reserved for the length - case 16: get<1>(h) += (uint64_t(cur[15]) << 56); - case 15: get<1>(h) += (uint64_t(cur[14]) << 48); - case 14: get<1>(h) += (uint64_t(cur[13]) << 40); - case 13: get<1>(h) += (uint64_t(cur[12]) << 32); - case 12: get<1>(h) += (uint64_t(cur[11]) << 24); - case 11: get<1>(h) += (uint64_t(cur[10]) << 16); - case 10: get<1>(h) += (uint64_t(cur[ 9]) << 8); - case 9: get<1>(h) += (uint64_t(cur[ 8])); - case 8: get<0>(h) += (uint64_t(cur[ 7]) << 56); - case 7: get<0>(h) += (uint64_t(cur[ 6]) << 48); - case 6: get<0>(h) += (uint64_t(cur[ 5]) << 40); - case 5: get<0>(h) += (uint64_t(cur[ 4]) << 32); - case 4: get<0>(h) += (uint64_t(cur[ 3]) << 24); - case 3: get<0>(h) += (uint64_t(cur[ 2]) << 16); - case 2: get<0>(h) += (uint64_t(cur[ 1]) << 8); - case 1: get<0>(h) += (uint64_t(cur[ 0])); - case 0: break; // nothing to add - default: assert(false); - } - - mix(h); - - return h; - } - - // rehash a hash triple - hash_triple_t operator()(hash_triple_t h) const - { - std::get<0>(h) += m_seed; - std::get<1>(h) += m_seed; - std::get<2>(h) += 0x9e3779b97f4a7c13ULL; - - mix(h); - - return h; - } - - void swap(jenkins64_hasher& other) - { - std::swap(m_seed, other.m_seed); - } - - void save(std::ostream& os) const - { - os.write(reinterpret_cast(&m_seed), sizeof(m_seed)); - } - - void load(std::istream& is) - { - is.read(reinterpret_cast(&m_seed), sizeof(m_seed)); - } - - seed_t seed() const - { - return m_seed; - } - - protected: - - static void mix(hash_triple_t& h) - { - uint64_t& a = std::get<0>(h); - uint64_t& b = std::get<1>(h); - uint64_t& c = std::get<2>(h); - - a -= b; a -= c; a ^= (c >> 43); - b -= c; b -= a; b ^= (a << 9); - c -= a; c -= b; c ^= (b >> 8); - a -= b; a -= c; a ^= (c >> 38); - b -= c; b -= a; b ^= (a << 23); - c -= a; c -= b; c ^= (b >> 5); - a -= b; a -= c; a ^= (c >> 35); - b -= c; b -= a; b ^= (a << 49); - c -= a; c -= b; c ^= (b >> 11); - a -= b; a -= c; a ^= (c >> 12); - b -= c; b -= a; b ^= (a << 18); - c -= a; c -= b; c ^= (b >> 22); - } - - seed_t m_seed; - }; - - - // This is basically a wrapper to jenkins64_hasher that uses a - // 32-bit seed and returns 32-bit hashes by truncation - struct jenkins32_hasher { - - typedef uint32_t seed_t; - typedef uint32_t hash_t; - typedef std::tuple hash_triple_t; - - jenkins32_hasher() - {} - - jenkins32_hasher(uint32_t seed) - : m_seed(seed) - {} - - template - static jenkins32_hasher generate(Rng& rng) - { - return jenkins32_hasher((uint32_t)rng()); - } - - hash_triple_t operator()(byte_range_t s) const - { - using std::get; - auto h64 = jenkins64_hasher(seed64())(s); - return hash_triple_t((uint32_t)get<0>(h64), - (uint32_t)get<1>(h64), - (uint32_t)get<2>(h64)); - } - - hash_triple_t operator()(hash_triple_t h) const - { - using std::get; - auto h64 = jenkins64_hasher::hash_triple_t(get<0>(h), - get<1>(h), - get<2>(h)); - h64 = jenkins64_hasher(seed64())(h64); - return hash_triple_t((uint32_t)get<0>(h64), - (uint32_t)get<1>(h64), - (uint32_t)get<2>(h64)); - } - - void swap(jenkins32_hasher& other) - { - std::swap(m_seed, other.m_seed); - } - - void save(std::ostream& os) const - { - os.write(reinterpret_cast(&m_seed), sizeof(m_seed)); - } - - void load(std::istream& is) - { - is.read(reinterpret_cast(&m_seed), sizeof(m_seed)); - } - - seed_t seed() const - { - return m_seed; - } - - protected: - - uint64_t seed64() const - { - return (uint64_t(m_seed) << 32) | m_seed; - } - - seed_t m_seed; - - }; - -} diff --git a/gatb-core/thirdparty/emphf/bitpair_vector.hpp b/gatb-core/thirdparty/emphf/bitpair_vector.hpp deleted file mode 100644 index e4c853b79..000000000 --- a/gatb-core/thirdparty/emphf/bitpair_vector.hpp +++ /dev/null @@ -1,99 +0,0 @@ -#pragma once - -#include - -namespace emphf { - - class bitpair_vector { - public: - - bitpair_vector() - : m_size(0) - {} - - bitpair_vector(uint64_t n) - : m_size(0) - { - resize(n); - } - - void resize(uint64_t n) - { - // can only grow, for now - assert(n >= size()); - m_size = n; - m_bits.resize((m_size + 31) / 32); - } - - size_t size() const - { - return m_size; - } - - uint64_t operator[](uint64_t pos) const - { - return (m_bits[pos / 32] >> ((pos % 32) * 2)) % 4; - } - - void set(uint64_t pos, uint64_t val) - { - assert(val < 4); - uint64_t word_pos = pos / 32; - uint64_t word_offset = (pos % 32) * 2; - m_bits[word_pos] &= ~(3ULL << word_offset); - m_bits[word_pos] |= val << word_offset; - } - - uint64_t range_nonzeros(uint64_t begin, uint64_t end) const - { - assert(begin <= end); - assert(end <= size()); - - uint64_t word_begin = begin / 32; - uint64_t offset_begin = (begin % 32) * 2; - uint64_t word_end = end / 32; - uint64_t offset_end = (end % 32) * 2; - uint64_t r = 0; - - uint64_t word = (m_bits[word_begin] >> offset_begin) << offset_begin; - for (uint64_t w = word_begin; w < word_end; ++w) { - r += nonzero_pairs(word); - word = m_bits[w + 1]; - } - - uint64_t mask = (uint64_t(1) << offset_end) - 1; - r += nonzero_pairs(word & mask); - - return r; - } - - void swap(bitpair_vector& other) - { - std::swap(m_size, other.m_size); - m_bits.swap(other.m_bits); - } - - void save(std::ostream& os) const - { - os.write(reinterpret_cast(&m_size), sizeof(m_size)); - os.write(reinterpret_cast(m_bits.data()), (std::streamsize)(sizeof(m_bits[0]) * m_bits.size())); - } - - void load(std::istream& is) - { - is.read(reinterpret_cast(&m_size), sizeof(m_size)); - m_bits.resize((m_size + 31) / 32); - is.read(reinterpret_cast(m_bits.data()), (std::streamsize)(sizeof(m_bits[0]) * m_bits.size())); - } - - std::vector const& data() const - { - return m_bits; - } - - protected: - std::vector m_bits; - uint64_t m_size; - }; - -} diff --git a/gatb-core/thirdparty/emphf/bitstream.hpp b/gatb-core/thirdparty/emphf/bitstream.hpp deleted file mode 100644 index c9596e1fa..000000000 --- a/gatb-core/thirdparty/emphf/bitstream.hpp +++ /dev/null @@ -1,179 +0,0 @@ -#pragma once - -#include -#include "common.hpp" - -namespace emphf { - - template - class bitstream { - - public: - - template - using vector = typename MemoryModel::template vector; - - bitstream() - : m_bits() - {} - - bitstream(MemoryModel& mm) - : m_bits(mm.make_vector(uninitialized_uint64())) - {} - - void resize(size_t capacity_bits) - { - m_bits.resize((capacity_bits + 63) / 64); - } - - void swap(bitstream& other) - { - m_bits.swap(other.m_bits); - } - - size_t capacity() const - { - return m_bits.size() * 64; - } - - struct reader { - reader() - {} - - reader(bitstream const& bs) - : m_bs(&bs) - , m_next(0) - , m_buf(0) - , m_avail(0) - {} - - uint64_t read(uint64_t l) - { - uint64_t word = m_buf; - uint64_t left = l; - - if (l > m_avail) { - assert(m_next < m_bs->m_bits.size()); - left = l - m_avail; - m_buf = m_bs->m_bits[m_next++]; - word |= m_buf << m_avail; - m_avail = 64; - } - - m_avail -= left; - m_buf = (left == 64) ? 0 : (m_buf >> left); - word &= (l == 64) ? ~0ULL : ~(~0ULL << l); - - return word; - } - - uint64_t read_unary() - { - uint64_t zs = 0; - while (!m_buf) { - zs += m_avail; - m_avail = 64; - m_buf = m_bs->m_bits[m_next++]; - } - - uint64_t l = __builtin_ctzll(m_buf); - m_buf >>= l; - m_buf >>= 1; - m_avail -= l + 1; - return zs + l; - } - - uint64_t read_gamma() - { - uint64_t l = read_unary(); - return (((uint64_t(1) << l) | read(l)) - 1); - } - - private: - - bitstream const* m_bs; - uint64_t m_next; - uint64_t m_buf; - uint64_t m_avail; - }; - - struct writer { - writer() - {} - - writer(bitstream& bs) - : m_bs(&bs) - , m_next(0) - , m_buf(0) - , m_pos_in_buf(0) - {} - - void write(uint64_t word, uint64_t l) - { - assert(m_pos_in_buf < 64); - m_buf |= word << m_pos_in_buf; - uint64_t left = l; - - if (m_pos_in_buf + l >= 64) { - assert(m_next < m_bs->m_bits.size()); - m_bs->m_bits[m_next++] = m_buf; - left = l - (64 - m_pos_in_buf); - if (m_pos_in_buf) { - m_buf = word >> (64 - m_pos_in_buf); - } else { - m_buf = 0; - } - m_pos_in_buf = 0; - } - - m_pos_in_buf += left; - } - - void write_zeros(uint64_t n) - { - assert(m_pos_in_buf < 64); - while (n >= (64 - m_pos_in_buf)) { - assert(m_next < m_bs->m_bits.size()); - m_bs->m_bits[m_next++] = m_buf; - m_buf = 0; - n -= 64 - m_pos_in_buf; - m_pos_in_buf = 0; - } - - m_pos_in_buf += n; - } - - void write_unary(uint64_t n) - { - write_zeros(n); - write(1, 1); - } - - void write_gamma(uint64_t n) - { - n += 1; - uint64_t l = msb(n); - write_zeros(l); - uint64_t v = n ^ (1ULL << l); - write((v << 1) | 1, l + 1); - } - - void flush() - { - assert(m_next < m_bs->m_bits.size()); - m_bs->m_bits[m_next] = m_buf; - } - - private: - - bitstream* m_bs; - uint64_t m_next; - uint64_t m_buf; - uint64_t m_pos_in_buf; - }; - - private: - vector m_bits; - }; - -} diff --git a/gatb-core/thirdparty/emphf/common.hpp b/gatb-core/thirdparty/emphf/common.hpp deleted file mode 100644 index ff1527b0b..000000000 --- a/gatb-core/thirdparty/emphf/common.hpp +++ /dev/null @@ -1,265 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "emphf_config.hpp" - -namespace emphf { - - inline std::ostream& logger() - { - struct nullstream : std::ostream { - nullstream() : std::ios(0), std::ostream(0) {} - }; - - static nullstream logstream; - return logstream ; // FIXME (rayan) removed logging - - time_t t = std::time(nullptr); - // XXX(ot): put_time unsupported in g++ 4.7 - // return std::cerr - // << std::put_time(std::localtime(&t), "%F %T") - // << ": "; - std::locale loc; - const std::time_put& tp = - std::use_facet>(loc); - const char *fmt = "%F %T"; - tp.put(std::cerr, std::cerr, ' ', - std::localtime(&t), fmt, fmt + strlen(fmt)); - return std::cerr << ": "; - } - - // XXX(ot): the following I/O code is adapted from succinct - // library, avoiding the dependency for now - typedef std::pair byte_range_t; - - struct identity_adaptor - { - byte_range_t operator()(byte_range_t s) const - { - return s; - } - }; - - struct stl_string_adaptor - { - byte_range_t operator()(std::string const& s) const - { - const uint8_t* buf = reinterpret_cast(s.c_str()); - const uint8_t* end = buf + s.size() + 1; // add the null terminator - return byte_range_t(buf, end); - } - }; - - class line_iterator - : public std::iterator { - - public: - line_iterator() - : m_is(nullptr) - , m_buf(nullptr) - {} - - line_iterator(FILE* is) - : m_is(is) - , m_pos(0) - , m_buf(nullptr) - , m_buf_len(0) - { - advance(); - } - - ~line_iterator() - { - free(m_buf); - } - - value_type const& operator*() const { - return m_line; - } - - line_iterator& operator++() { - advance(); - return *this; - } - - friend bool operator==(line_iterator const& lhs, line_iterator const& rhs) - { - if (!lhs.m_is || !rhs.m_is) { - if (!lhs.m_is && !rhs.m_is) { - return true; - } else { - return false; - } - } - - assert(lhs.m_is == rhs.m_is); - - return rhs.m_pos == lhs.m_pos; - } - - friend bool operator!=(line_iterator const& lhs, line_iterator const& rhs) - { - return !(lhs == rhs); - } - - private: - void advance() - { - assert(m_is); - fseek(m_is, m_pos, SEEK_SET); - - // this is significantly faster than std::getline on C++ - // streams - auto avail = getline(&m_buf, &m_buf_len, m_is); - if (avail == -1) { - m_is = nullptr; - return; - } - m_pos = ftell(m_is); - - // trim newline character - if (avail && m_buf[avail - 1] == '\n') { - avail -= 1; - } - - m_line.assign(m_buf, m_buf + avail); - } - - FILE* m_is; - long m_pos; - std::string m_line; - char* m_buf; - size_t m_buf_len; - }; - - class file_lines - { - public: - file_lines(const char* filename) - { - m_is = fopen(filename, "rb"); - if (!m_is) { - throw std::invalid_argument("Error opening " + std::string(filename)); - } - } - - ~file_lines() - { - fclose(m_is); - } - - line_iterator begin() const - { - return line_iterator(m_is); - } - - line_iterator end() const { return line_iterator(); } - - size_t size() const - { - size_t lines = 0; - fseek(m_is, 0, SEEK_SET); - static const size_t buf_size = 4096; - char buf[buf_size]; - size_t avail; - bool last_is_newline = false; - while ((avail = fread(buf, 1, buf_size, m_is))) { - for (size_t i = 0; i < avail; ++i) { - if (buf[i] == '\n') lines += 1; - } - last_is_newline = (buf[avail - 1] == '\n'); - } - - if (!last_is_newline) lines += 1; - - return lines; - } - - private: - // noncopyble - file_lines(file_lines const&); - file_lines& operator=(file_lines const&); - - FILE* m_is; - }; - - template - struct iter_range - { - iter_range(Iterator b, Iterator e) - : m_begin(b) - , m_end(e) - {} - - Iterator begin() const - { return m_begin; } - - Iterator end() const - { return m_end; } - - Iterator m_begin, m_end; - }; - - template - iter_range range(Iterator begin, Iterator end) - { - return iter_range(begin, end); - } - - inline uint64_t nonzero_pairs(uint64_t x) - { - static const uint64_t ones_step_4 = 0x1111111111111111ULL; - x = (x | (x >> 1)) & (0x5 * ones_step_4); - -#if EMPHF_USE_POPCOUNT - return (uint64_t)__builtin_popcountll(x); -#else - static const uint64_t ones_step_8 = 0x0101010101010101ULL; - x = (x & 3 * ones_step_4) + ((x >> 2) & 3 * ones_step_4); - x = (x + (x >> 4)) & 0x0f * ones_step_8; - return (x * ones_step_8) >> 56; -#endif - } - - inline uint64_t msb(uint64_t x) - { - assert(x); - return 63 - __builtin_clzll(x); - } - - struct uninitialized_uint64 { - uninitialized_uint64() {} - - uninitialized_uint64& operator=(uint64_t v) - { - m_val = v; - return *this; - } - - operator uint64_t&() - { - return m_val; - } - - operator uint64_t const&() const - { - return m_val; - } - - private: - uint64_t m_val; - }; - -} diff --git a/gatb-core/thirdparty/emphf/compute_mphf_generic.hpp b/gatb-core/thirdparty/emphf/compute_mphf_generic.hpp deleted file mode 100644 index 98f1970bc..000000000 --- a/gatb-core/thirdparty/emphf/compute_mphf_generic.hpp +++ /dev/null @@ -1,58 +0,0 @@ -#include -#include -#include -#include - -#include "common.hpp" -#include "mphf.hpp" -#include "base_hash.hpp" -#include "perfutils.hpp" - -namespace emphf { - - template - int compute_mphf_main(int argc, char** argv) - { - if (argc < 2) { - std::cerr << "Expected: " << argv[0] << " [output_filename]" << std::endl; - std::terminate(); - } - - const char* filename = argv[1]; - std::string output_filename; - - if (argc >= 3) { - output_filename = argv[2]; - } - - logger() << "Processing " << filename << std::endl; - - file_lines lines(filename); - size_t n = lines.size(); - logger() << n << " strings to process." << std::endl; - - stl_string_adaptor adaptor; - typedef mphfmphf_t; - mphf_t mphf; - - size_t max_nodes = (size_t(std::ceil(double(n) * 1.23)) + 2) / 3 * 3; - if (max_nodes >= uint64_t(1) << 32) { - logger() << "Using 64-bit sorter" << std::endl; - HypergraphSorter64 sorter; - mphf_t(sorter, n, lines, adaptor).swap(mphf); - } else { - logger() << "Using 32-bit sorter" << std::endl; - HypergraphSorter32 sorter; - mphf_t(sorter, n, lines, adaptor).swap(mphf); - } - - if (output_filename.size()) { - std::ofstream os(output_filename, std::ios::binary); - mphf.save(os); - } - - return 0; - } -} diff --git a/gatb-core/thirdparty/emphf/compute_mphf_hem.cpp b/gatb-core/thirdparty/emphf/compute_mphf_hem.cpp deleted file mode 100644 index 654264d60..000000000 --- a/gatb-core/thirdparty/emphf/compute_mphf_hem.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include -#include -#include -#include - -#include "common.hpp" -#include "mphf_hem.hpp" -#include "mmap_memory_model.hpp" -#include "base_hash.hpp" -#include "perfutils.hpp" - -int main(int argc, char** argv) -{ - using namespace emphf; - - if (argc < 2) { - std::cerr << "Expected: " << argv[0] - << " [output_filename]" << std::endl; - std::terminate(); - } - - const char* filename = argv[1]; - std::string output_filename; - - if (argc >= 3) { - output_filename = argv[2]; - } - - logger() << "Processing " << filename << std::endl; - - file_lines lines(filename); - size_t n = lines.size(); - logger() << n << " strings to process." << std::endl; - - stl_string_adaptor adaptor; - mmap_memory_model mm; - mphf_hem mphf(mm, n, lines, adaptor); - - if (output_filename.size()) { - std::ofstream os(output_filename, std::ios::binary); - mphf.save(os); - } - - return 0; -} diff --git a/gatb-core/thirdparty/emphf/compute_mphf_scan.cpp b/gatb-core/thirdparty/emphf/compute_mphf_scan.cpp deleted file mode 100644 index b6b8078f8..000000000 --- a/gatb-core/thirdparty/emphf/compute_mphf_scan.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "compute_mphf_generic.hpp" -#include "internal_memory_model.hpp" -#include "hypergraph_sorter_scan.hpp" - -int main(int argc, char** argv) -{ - using namespace emphf; - return compute_mphf_main, - hypergraph_sorter_scan, - jenkins64_hasher>(argc, argv); -} diff --git a/gatb-core/thirdparty/emphf/compute_mphf_scan_mmap.cpp b/gatb-core/thirdparty/emphf/compute_mphf_scan_mmap.cpp deleted file mode 100644 index 1f9e0d9b7..000000000 --- a/gatb-core/thirdparty/emphf/compute_mphf_scan_mmap.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "compute_mphf_generic.hpp" -#include "mmap_memory_model.hpp" -#include "hypergraph_sorter_scan.hpp" - -int main(int argc, char** argv) -{ - using namespace emphf; - return compute_mphf_main, - hypergraph_sorter_scan, - jenkins64_hasher>(argc, argv); -} diff --git a/gatb-core/thirdparty/emphf/compute_mphf_seq.cpp b/gatb-core/thirdparty/emphf/compute_mphf_seq.cpp deleted file mode 100644 index e6551955e..000000000 --- a/gatb-core/thirdparty/emphf/compute_mphf_seq.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "compute_mphf_generic.hpp" -#include "hypergraph.hpp" -#include "hypergraph_sorter_seq.hpp" - -int main(int argc, char** argv) -{ - using namespace emphf; - return compute_mphf_main>, - hypergraph_sorter_seq>, - jenkins64_hasher>(argc, argv); -} diff --git a/gatb-core/thirdparty/emphf/emphf_config.hpp.in b/gatb-core/thirdparty/emphf/emphf_config.hpp.in deleted file mode 100644 index ff3b51430..000000000 --- a/gatb-core/thirdparty/emphf/emphf_config.hpp.in +++ /dev/null @@ -1,6 +0,0 @@ -#pragma once - -#cmakedefine EMPHF_USE_POPCOUNT 1 -#ifndef EMPHF_USE_POPCOUNT -# define EMPHF_USE_POPCOUNT 0 -#endif diff --git a/gatb-core/thirdparty/emphf/gen_synthetic_data.cpp b/gatb-core/thirdparty/emphf/gen_synthetic_data.cpp deleted file mode 100644 index 89faf6bef..000000000 --- a/gatb-core/thirdparty/emphf/gen_synthetic_data.cpp +++ /dev/null @@ -1,20 +0,0 @@ -#include - -int main(int argc, char** argv) -{ - if (argc != 3) { - printf("Usage: %s \n", argv[0]); - } - - const char* filename = argv[1]; - unsigned long n = 0; - sscanf(argv[2], "%lu", &n); - - printf("Generating %lu strings\n", n); - - auto f = fopen(filename, "w"); - for (unsigned long i = 0; i < n; ++i) { - fprintf(f, "%lu\n", i); - } - fclose(f); -} diff --git a/gatb-core/thirdparty/emphf/hypergraph.hpp b/gatb-core/thirdparty/emphf/hypergraph.hpp deleted file mode 100644 index 9436a9858..000000000 --- a/gatb-core/thirdparty/emphf/hypergraph.hpp +++ /dev/null @@ -1,137 +0,0 @@ -#pragma once - -#include - -namespace emphf { - - template - struct hypergraph { - - typedef NodeType node_t; // last value is used as sentinel - - struct hyperedge { - // deliberately do not initialize, to avoid polluting the - // page cache when initializing large mmapped arrays - hyperedge() - {} - - hyperedge(NodeType v0_, NodeType v1_, NodeType v2_) - : v0(v0_) - , v1(v1_) - , v2(v2_) - {} - - friend inline - std::ostream& operator<<(std::ostream& os, hyperedge const& t) - { - os << "(" - << t.v0 << ", " - << t.v1 << ", " - << t.v2 << ")"; - return os; - } - - friend inline - bool operator<(hyperedge const& lhs, hyperedge const& rhs) - { - return - std::make_tuple(lhs.v0, lhs.v1, lhs.v2) < - std::make_tuple(rhs.v0, rhs.v1, rhs.v2); - } - - friend inline - bool operator==(hyperedge const& lhs, hyperedge const& rhs) - { - return - lhs.v0 == rhs.v0 && - lhs.v1 == rhs.v1 && - lhs.v2 == rhs.v2; - } - - friend inline - bool operator!=(hyperedge const& lhs, hyperedge const& rhs) - { - return !(lhs == rhs); - } - - NodeType v0, v1, v2; - }; - - static hyperedge sentinel() - { - return hyperedge(-node_t(1), -node_t(1), -node_t(1)); - } - - struct xored_adj_list { - xored_adj_list(node_t degree_= 0, node_t v1s_ = 0, node_t v2s_ = 0) - : degree(degree_) - , v1s(v1s_) - , v2s(v2s_) - {} - - void add_edge(hyperedge const& edge) - { - degree += 1; - xor_edge(edge); - } - - void delete_edge(hyperedge const& edge) - { - assert(degree >= 1); - degree -= 1; - xor_edge(edge); - } - - hyperedge edge_from(node_t v0) const - { - assert(degree == 1); - return hyperedge(v0, v1s, v2s); - } - - node_t degree; - node_t v1s; - node_t v2s; - - private: - - void xor_edge(hyperedge const& edge) - { - assert(edge.v1 < edge.v2); - v1s ^= edge.v1; - v2s ^= edge.v2; - } - - }; - }; - - // a brief note about hyperedge orientations: throughout the - // code we keep the invariant that for every hyperedge (v0, - // v1, v2) it holds v1 < v2. This leaves only three - // orientations, which we index with 0, 1, and 2 depending on - // whether v0 is the first, second, or third smallest node. We - // call the 0-orientation "canonical". - template - static unsigned orientation(HyperEdge const& t) - { - // although it should be v0 < v1 < v2, sometimes we - // compare sentinel edges - assert(t.v1 <= t.v2); - return (t.v0 > t.v1) + (t.v0 > t.v2); - } - - template - static HyperEdge canonicalize_edge(HyperEdge t) - { - assert(t.v1 <= t.v2); - if (t.v0 > t.v2) { - std::swap(t.v0, t.v2); - } - - if (t.v0 > t.v1) { - std::swap(t.v0, t.v1); - } - - assert(orientation(t) == 0); - return t; - } -} diff --git a/gatb-core/thirdparty/emphf/hypergraph_sorter_scan.hpp b/gatb-core/thirdparty/emphf/hypergraph_sorter_scan.hpp deleted file mode 100644 index ae9301dc8..000000000 --- a/gatb-core/thirdparty/emphf/hypergraph_sorter_scan.hpp +++ /dev/null @@ -1,399 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" -#include "hypergraph.hpp" -#include "perfutils.hpp" -#include "packed_edge_list.hpp" -#include "bitstream.hpp" - -namespace emphf { - - template - class hypergraph_sorter_scan { - public: - typedef hypergraph hg; - typedef typename hg::node_t node_t; - typedef typename hg::hyperedge hyperedge; - typedef typename hg::xored_adj_list xored_adj_list; - - hypergraph_sorter_scan() - : m_memory_model() - , m_edges(m_memory_model, 0, 0) - {} - - template - bool try_generate_and_sort(Range const& input_range, - EdgeGenerator const& edge_gen, - size_t n, - size_t hash_domain, - Progress *progress) - { - auto m = hash_domain * 3; - uint64_t node_bits = msb(m - 1) + 1; - logger() << "Using " << node_bits << " bits per node" << std::endl; - - const hyperedge sentinel_edge = hg::sentinel(); - - bitstream adj_lists(m_memory_model); - typedef typename bitstream::writer bs_writer; - typedef typename bitstream::reader bs_reader; - - auto write_adj_list = [&](bs_writer& wr, - node_t& last_v0, - std::pair const& adj) { - assert(adj.first + 1 > last_v0 + 1); - wr.write_gamma(adj.first - last_v0 - 1); - last_v0 = adj.first; - assert(adj.second.degree >= 1); - wr.write_unary(adj.second.degree - 1); - wr.write(adj.second.v1s, node_bits); - wr.write(adj.second.v2s, node_bits); - }; - - auto read_adj_list = [&](bs_reader& rr, node_t& last_v0) { - node_t delta_v0 = (node_t)rr.read_gamma(); - node_t v0 = (last_v0 += delta_v0 + 1); - node_t degree = (node_t)rr.read_unary() + 1; - node_t v1s = (node_t)rr.read(node_bits); - node_t v2s = (node_t)rr.read(node_bits); - return std::make_pair(v0, xored_adj_list(degree, v1s, v2s)); - }; - - // do all the allocations upfront - adj_lists.resize(2 * m + // delta-v0 - 3 * n + // unary degrees - 2 * node_bits * m + // v1 and v2 - 64); // padding - - // to save space, the m_edges vector is used to - // store all the initial edges, then the round hinges and - // the deletion lists, and finally the peeling order - packed_edge_list - (m_memory_model, node_bits, - 3 * n) // one for each of three orientations - .swap(m_edges); - - m_round_boundaries.clear(); - - auto edges_begin = m_edges.begin(); - auto edges_end = edges_begin; - - // generate canonical edges - logger() << "Generating hyperedges" << std::endl; - for (auto const& val: input_range) { - auto edge = edge_gen(val); - // canonical by construction - *edges_end++ = edge; - } - - // we temporarily the first round hinges after the edge list - auto round_hinges_begin = edges_end; - auto round_hinges_end = round_hinges_begin; - size_t initial_nodes = 0; - size_t remaining_nodes = 0; - bs_writer wr(adj_lists); - node_t last_v0 = -1; - - // we exploit the tripartition by populating the adjacency - // lists of each orientation of the tripartition - // separately - - if (progress) - { - progress->reset (3); - progress->init (); - } - - for (unsigned o = 0; o < 3; ++o) { - logger() << "Sorting " << o << "-orientation edges" << std::endl; - - // progress notification - if (progress) - progress->inc (1); - - auto sorted_edges = - m_memory_model.make_sorter(edges_begin, edges_end, - [](hyperedge const& e) { return e.v0; }, - [&](hyperedge const& e, size_t k) { - uint64_t b = e.v0 - o * hash_domain; - return size_t(b * k / hash_domain); - }); - - logger() << "Populating " << o << "-orientation lists" << std::endl; - - xored_adj_list adj; - auto it = sorted_edges.begin(); - hyperedge next_edge = *it; - auto next_orientation = edges_begin; - - while (it != sorted_edges.end()) { - hyperedge edge = next_edge; - next_edge = (++it != sorted_edges.end()) ? *it : sentinel_edge; - - node_t v0 = edge.v0; - assert(orientation(edge) == o); - - adj.add_edge(edge); - - // if last edge of run of edges starting with same v0, - // commit adjacency list - if (v0 != next_edge.v0) { - initial_nodes += 1; - if (adj.degree > 1) { - write_adj_list(wr, last_v0, std::make_pair(v0, adj)); - remaining_nodes += 1; - } else { - *round_hinges_end++ = edge; - } - - adj = xored_adj_list(); - } - - // prepare next orientation - if (o == 0) { - std::swap(edge.v0, edge.v1); - *next_orientation++ = edge; - } else if (o == 1) { - std::swap(edge.v0, edge.v2); - *next_orientation++ = edge; - } - } - } - if (progress) - progress->finish (); - - wr.flush(); - - auto peeling_order_begin = m_edges.begin(); - auto peeling_order_end = peeling_order_begin; - - - // we now move the initial hinges to the beginning of the - // peeling_order vector, to leave room for the deletion - // lists (can be as many as 2*n edges) - round_hinges_end = std::copy(round_hinges_begin, round_hinges_end, - peeling_order_begin); - round_hinges_begin = peeling_order_begin; - - // progress initialization - if (progress) - { - progress->reset (initial_nodes); - progress->init (); - } - - // iterate rounds until no more hinges are found - for (size_t round = 0; round_hinges_begin != round_hinges_end; ++round) { - - logger() << "Round " << round << ", " - << 100 * double(remaining_nodes) / double(initial_nodes) - << "% nodes remaining" << std::endl; - - // progress notification - if (progress) - progress->set (initial_nodes - remaining_nodes); - - // sort hinges by their canonical orientation - auto sorted_hinges = - m_memory_model.make_sorter(round_hinges_begin, round_hinges_end, - [](hyperedge const& e) { - return canonicalize_edge(e); - }, - [&](hyperedge const& e, size_t k) { - uint64_t c_v0 = std::min(e.v0, std::min(e.v1, e.v2)); - return size_t(c_v0 * k / m); - }); - - m_round_boundaries.push_back((size_t)(std::distance(peeling_order_begin, peeling_order_end))); - // we store the deletion list after the round hinges - auto round_deletion_begin = round_hinges_end; - auto round_deletion_end = round_deletion_begin; - // prepare the edge deletion list; in round_hinges, - // the same edge may appear with different - // orientations, since we already removed the hinges - // from their adjacency lists we only need to delete - // the missing orientations - bool orientations[] = {false, false, false}; - auto it = sorted_hinges.begin(); - hyperedge next_edge = *it; - - while (it != sorted_hinges.end()) { - hyperedge edge = next_edge; - next_edge = (++it != sorted_hinges.end()) ? *it : sentinel_edge; - - orientations[orientation(edge)] = true; - auto edge_canonical = canonicalize_edge(edge); - - if (edge_canonical != canonicalize_edge(next_edge)) { - // append the current hinge to the global - // peeling order - *peeling_order_end++ = edge; - - // add to the deletion list the missing orientations - auto cur_edge = edge_canonical; - if (!orientations[0]) *round_deletion_end++ = cur_edge; - orientations[0] = false; - - std::swap(cur_edge.v0, cur_edge.v1); - assert(orientation(cur_edge) == 1); - if (!orientations[1]) *round_deletion_end++ = cur_edge; - orientations[1] = false; - - std::swap(cur_edge.v0, cur_edge.v2); - assert(orientation(cur_edge) == 2); - if (!orientations[2]) *round_deletion_end++ = cur_edge; - orientations[2] = false; - } - } - - auto sorted_deletions = - m_memory_model.make_sorter(round_deletion_begin, round_deletion_end, - [](hyperedge const& e) { return e.v0; }, - [&](hyperedge const& e, size_t k) { - return size_t(uint64_t(e.v0) * k / m); - }); - - // finally delete the current hinge edges, and find - // the new hinges - round_hinges_begin = peeling_order_end; - round_hinges_end = round_hinges_begin; - - size_t new_remaining_nodes = 0; - - auto cur_del = sorted_deletions.begin(); - bs_reader rr(adj_lists); - node_t last_v0_r = -1; - bs_writer wr(adj_lists); - node_t last_v0_w = -1; - - for (size_t i = 0; i < remaining_nodes; ++i) { - auto adj = read_adj_list(rr, last_v0_r); - - assert(cur_del == sorted_deletions.end() || - hyperedge(*cur_del).v0 >= adj.first); - - while (cur_del != sorted_deletions.end() && - hyperedge(*cur_del).v0 == adj.first) { - adj.second.delete_edge(*cur_del); - ++cur_del; - } - - if (adj.second.degree > 1) { - write_adj_list(wr, last_v0_w, adj); - new_remaining_nodes += 1; - } else if (adj.second.degree == 1) { - *round_hinges_end++ = adj.second.edge_from(adj.first); - } - } - remaining_nodes = new_remaining_nodes; - wr.flush(); - } - - if (progress) - progress->finish (); - - if (remaining_nodes > 0) { - logger() << "Hypergraph is not peelable" << std::endl; - return false; - } - - assert((size_t)std::distance(peeling_order_begin, peeling_order_end) == n); - m_round_boundaries.push_back(n); - - return true; - } - - struct peeling_iterator - { - typedef hyperedge value_type; - - peeling_iterator() - : m_sorter(nullptr) - , m_cur_round(0) - , m_cur_pos(0) - {} - - peeling_iterator(hypergraph_sorter_scan const& sorter) - : m_sorter(&sorter) - , m_cur_round(m_sorter->m_round_boundaries.size() - 2) - , m_cur_pos(m_sorter->m_round_boundaries[m_cur_round]) - { - refresh(); - } - - bool operator!=(peeling_iterator const& other) const - { - if (!m_sorter || !other.m_sorter) { - return m_sorter != other.m_sorter; - } else { - assert(m_sorter == other.m_sorter); - return - m_cur_round != other.m_cur_round || - m_cur_pos != other.m_cur_pos; - } - } - - peeling_iterator& operator++() - { - ++m_cur_pos; - if (m_cur_pos == m_sorter->m_round_boundaries[m_cur_round + 1]) { - if (m_cur_round == 0) { - m_sorter = nullptr; - } else { - m_cur_round -= 1; - m_cur_pos = m_sorter->m_round_boundaries[m_cur_round]; - } - } - - refresh(); - return *this; - } - - value_type const& operator*() const - { - return m_cur; - } - - value_type const* operator->() const - { - return &m_cur; - } - - - private: - - void refresh() - { - if (m_sorter) { - m_cur = m_sorter->m_edges[m_cur_pos]; - } - } - - hypergraph_sorter_scan const* m_sorter; - size_t m_cur_round; - size_t m_cur_pos; - hyperedge m_cur; - }; - - std::pair - get_peeling_order() const - { - return std::make_pair(peeling_iterator(*this), peeling_iterator()); - } - - private: - - MemoryModel m_memory_model; - packed_edge_list m_edges; - std::vector m_round_boundaries; - }; -} diff --git a/gatb-core/thirdparty/emphf/hypergraph_sorter_seq.hpp b/gatb-core/thirdparty/emphf/hypergraph_sorter_seq.hpp deleted file mode 100644 index a1f2c16bb..000000000 --- a/gatb-core/thirdparty/emphf/hypergraph_sorter_seq.hpp +++ /dev/null @@ -1,130 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" -#include "hypergraph.hpp" -#include "perfutils.hpp" - -namespace emphf { - - template - class hypergraph_sorter_seq { - public: - typedef HypergraphType hg; - typedef typename hg::node_t node_t; - typedef typename hg::hyperedge hyperedge; - typedef typename hg::xored_adj_list xored_adj_list; - - hypergraph_sorter_seq() - {} - - template - bool try_generate_and_sort(Range const& input_range, - EdgeGenerator const& edge_gen, - size_t n, - size_t hash_domain, - Progress *progress, - bool verbose = true) - { - using std::get; - std::vector adj_lists; - - size_t m = hash_domain * 3; - - // do all the allocations upfront - m_peeling_order.clear(); - m_peeling_order.reserve(n); - adj_lists.resize(m); - - // generate edges - if (verbose) { - logger() << "Generating hyperedges and populating adjacency lists" - << std::endl; - } - - for (auto const& val: input_range) { - auto edge = edge_gen(val); - // canonical by construction - assert(orientation(edge) == 0); - - adj_lists[edge.v0].add_edge(edge); - - std::swap(edge.v0, edge.v1); - adj_lists[edge.v0].add_edge(edge); - - std::swap(edge.v0, edge.v2); - adj_lists[edge.v0].add_edge(edge); - } - - // peel - if (verbose) { - logger() << "Peeling" << std::endl; - } - - auto visit = [&](node_t v0) { - if (adj_lists[v0].degree == 1) { - auto edge = adj_lists[v0].edge_from(v0); - m_peeling_order.push_back(edge); - - edge = canonicalize_edge(edge); - adj_lists[edge.v0].delete_edge(edge); - - std::swap(edge.v0, edge.v1); - adj_lists[edge.v0].delete_edge(edge); - - std::swap(edge.v0, edge.v2); - adj_lists[edge.v0].delete_edge(edge); - } - }; - - size_t queue_position = 0; - for (node_t v0 = 0; v0 < m; ++v0) { - visit(v0); - - while (queue_position < m_peeling_order.size()) { - auto const& cur_edge = m_peeling_order[queue_position]; - - visit(cur_edge.v1); - visit(cur_edge.v2); - queue_position += 1; - } - } - - if (m_peeling_order.size() < n) { - if (verbose) { - logger() << "Hypergraph is not peelable: " - << (n - m_peeling_order.size()) << " edges remaining" - << std::endl; - } - return false; - } - - assert(m_peeling_order.size() == n); - - return true; - } - - typedef typename std::vector::const_reverse_iterator - peeling_iterator; - - std::pair - get_peeling_order() const - { - return std::make_pair(m_peeling_order.crbegin(), - m_peeling_order.crend()); - } - - private: - - size_t m_hash_domain; - std::vector m_peeling_order; - }; -} diff --git a/gatb-core/thirdparty/emphf/internal_memory_model.hpp b/gatb-core/thirdparty/emphf/internal_memory_model.hpp deleted file mode 100644 index c14f690bd..000000000 --- a/gatb-core/thirdparty/emphf/internal_memory_model.hpp +++ /dev/null @@ -1,79 +0,0 @@ -#pragma once - -#include -#include - -namespace emphf { - - struct internal_memory_model { - - template - using vector = std::vector; - - template - vector make_vector(T const&) const - { - return vector(); - } - - // for internal memory model, this is just a wrapper for an - // already sorted range - template - struct sorter { - - typedef typename std::iterator_traits::value_type value_type; - typedef Iterator iterator; - - sorter(Iterator begin, Iterator end, - KeyFunctor const&, - PartitionFunctor const&) - : m_begin(begin) - , m_end(end) - {} - - iterator begin() const - { - return m_begin; - } - - iterator end() const - { - return m_end; - } - - - private: - iterator m_begin; - iterator m_end; - }; - - template - sorter - make_sorter(Iterator begin, Iterator end, - KeyFunctor const& kf, - PartitionFunctor const& pf) const - { - typedef typename std::iterator_traits::value_type value_type; - std::sort(begin, end, - [&](value_type const& lhs, value_type const& rhs) { - return kf(lhs) < kf(rhs); - }); - return sorter(begin, end, kf, pf); - } - - template - void sort(Iterator begin, Iterator end, - KeyFunctor const& kf, - PartitionFunctor const& pf) const - { - auto s = make_sorter(begin, end, kf, pf); - } - }; - -} diff --git a/gatb-core/thirdparty/emphf/mmap_memory_model.hpp b/gatb-core/thirdparty/emphf/mmap_memory_model.hpp deleted file mode 100644 index 62ac008d7..000000000 --- a/gatb-core/thirdparty/emphf/mmap_memory_model.hpp +++ /dev/null @@ -1,388 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "internal_memory_model.hpp" - -namespace emphf { - - typedef std::map> mappings_map; - - template - struct mmap_allocator : std::allocator - { - typedef T * pointer; - typedef const T * const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef T value_type; - typedef size_t size_type; - typedef ptrdiff_t difference_type; - - template - struct rebind { - typedef mmap_allocator other; - }; - - mmap_allocator() - : m_mappings(nullptr) - {} - - mmap_allocator(mappings_map& mappings) - : std::allocator() - , m_mappings(&mappings) - {} - - pointer allocate(size_type n, const void* /* hint */) - { - return allocate(n); - } - - pointer allocate(size_type n) - { - assert(m_mappings); - if (!n) return nullptr; - - size_type size = n * sizeof(T); - - // create temporary file - char tmpl[] = "mphf.temp.XXXXXX"; - int fd = mkstemp(tmpl); - if (fd == -1) throw std::runtime_error("Impossible to create temp file"); - - int ret; - ret = unlink(tmpl); - if (ret) { - std::cerr << "WARNING: Error unlinking temporary file " << tmpl << std::endl; - } - - ret = ftruncate(fd, (off_t)size); - if (ret) throw std::runtime_error("Impossible to resize temp file"); - - void* addr = mmap(nullptr, size, - PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); - if (!addr) throw std::runtime_error("Impossible to create mapping"); - - ret = posix_madvise(addr, size, POSIX_MADV_SEQUENTIAL); - if (ret) logger() << "Error calling madvice: " << errno << std::endl; - - (*m_mappings)[addr] = std::make_pair(fd, size); - - return static_cast(addr); - } - - void deallocate(pointer p, size_type /* n */) - { - assert(m_mappings); - if (!p) return; - auto mapping = m_mappings->find(static_cast(p)); - if (mapping == m_mappings->end()) { - throw std::runtime_error("Trying to deallocate non-existent mapping"); - } - int ret = munmap(mapping->first, mapping->second.second); - if (ret) throw std::runtime_error("Error unmapping file"); - - ret = close(mapping->second.first); - if (ret) throw std::runtime_error("Error closing file"); - - m_mappings->erase(mapping); - } - - private: - - mappings_map* m_mappings; - }; - - struct mmap_memory_model { - - mmap_memory_model() - {} - - ~mmap_memory_model() - { - if (!m_mappings.empty()) { - std::cerr << "ERROR: leaked mappings in mmap_allocator"; - } - } - - template - using vector = std::vector>; - - template - vector make_vector(T const&) - { - return vector(mmap_allocator(m_mappings)); - } - - - template - struct sorter { - - typedef typename std::iterator_traits::value_type value_type; - - sorter(Iterator begin, Iterator end, - KeyFunctor const& kf, - PartitionFunctor const& pf) - : m_cur_part(0) - , m_buf_pos(0) - , m_partition_sorted(false) - , m_begin(begin) - , m_kf(kf) - { - using std::swap; - - const size_t memory = 512 * 1024 * 1024; // maybe TODO make that a param as it contorls memory usage - auto len = (size_t)std::distance(begin, end); - - if (len * sizeof(value_type) <= memory) { - // create a single partition - m_partitions.push_back(0); - m_partitions.push_back(len); - m_sort_buf.reserve(len); - return; - } - - // use ceil(size / memory) * 2 buckets so that maximum - // bucket is smaller than memory with high probability - size_t k = (len * sizeof(value_type) + memory - 1) / memory * 2; - m_partitions.resize(k + 1); - - // distribution count - std::for_each(begin, end, [&](value_type const& v) { - m_partitions[pf(v, k) + 1] += 1; - }); - - size_t largest_part = *std::max_element(m_partitions.begin(), - m_partitions.end() - 1); - m_sort_buf.reserve(largest_part); - - // cumulative sum - std::partial_sum(m_partitions.begin(), m_partitions.end(), - m_partitions.begin()); - assert((size_t)m_partitions[k] == len); - - // partitioning - size_t buffer_bytes = 1024 * 1024; - std::vector> cursors; - cursors.reserve(k); - - for (size_t p = 0; p < k; ++p) { - cursors.emplace_back(begin + m_partitions[p], - begin + m_partitions[p + 1], - buffer_bytes); - } - - // I think this is an in-place partitioning algorithms in linear time. - // starting from the last cursor, - // for each element in a cursor, - // if this element is not supposed to be in that cursor, - // then find the cursor where it should be, and select the current element - // and swap those two elements (invariant: one of the two swapped elements is never swapped again) - for (size_t p = k - 1; p + 1 > 0; --p) { - auto& cur = cursors[p]; - while (!cur.empty()) { - size_t other_p; - while ((other_p = pf(cur.value(), k)) != p) { - swap(cur.value(), cursors[other_p].value()); - cursors[other_p].advance(); - } - cur.advance(); - } - } - } - - struct iterator : std::iterator { - - iterator(sorter* s = nullptr) - : m_s(s) - { - if (m_s) { - advance_part(); - } - } - - value_type const& operator*() const - { - assert(m_s); - assert(m_s->m_cur_part < m_s->m_partitions.size() - 1); - - if (!m_s->m_partition_sorted) { - ptrdiff_t part_begin = m_s->m_partitions[m_s->m_cur_part]; - ptrdiff_t part_end = m_s->m_partitions[m_s->m_cur_part + 1]; - m_s->m_sort_buf.clear(); - std::copy(m_s->m_begin + part_begin, m_s->m_begin + part_end, - std::back_inserter(m_s->m_sort_buf)); - std::sort(m_s->m_sort_buf.begin(), m_s->m_sort_buf.end(), - [&](value_type const& lhs, value_type const& rhs) { - return m_s->m_kf(lhs) < m_s->m_kf(rhs); - }); - m_s->m_partition_sorted = true; - } - - assert(m_s->m_buf_pos < m_s->m_sort_buf.size()); - return m_s->m_sort_buf[m_s->m_buf_pos]; - } - - iterator& operator++() - { - assert(m_s); - ++m_s->m_buf_pos; - advance_part(); - return *this; - } - - bool operator==(iterator const& rhs) const - { - return m_s == rhs.m_s; - } - - bool operator!=(iterator const& rhs) const - { - return !(*this == rhs); - } - - private: - void advance_part() - { - while (m_s->m_partitions[m_s->m_cur_part] + m_s->m_buf_pos == - m_s->m_partitions[m_s->m_cur_part + 1]) { - ++m_s->m_cur_part; - m_s->m_buf_pos = 0; - m_s->m_partition_sorted = false; - - if (m_s->m_cur_part == m_s->m_partitions.size() - 1) { - m_s = nullptr; - return; - } - } - } - - sorter* m_s; - }; - - iterator begin() - { - return iterator(this); - } - - iterator end() - { - return iterator(); - } - - - private: - std::vector m_partitions; - std::vector m_sort_buf; - size_t m_cur_part; - size_t m_buf_pos; - bool m_partition_sorted; - - Iterator m_begin; - KeyFunctor m_kf; - }; - - template - sorter - make_sorter(Iterator begin, Iterator end, - KeyFunctor const& kf, - PartitionFunctor const& pf) const - { - return sorter(begin, end, kf, pf); - } - - template - void sort(Iterator begin, Iterator end, - KeyFunctor const& kf, - PartitionFunctor const& pf) const - { - auto s = make_sorter(begin, end, kf, pf); - std::copy(s.begin(), s.end(), begin); - } - - private: - - template - struct buffered_cursor { - typedef typename Iterator::value_type value_type; - - buffered_cursor(Iterator begin, Iterator end, size_t buf_bytes) - : m_base_cur(begin) - , m_base_end(end) - , m_max_bufsize((buf_bytes + sizeof(value_type) - 1) / sizeof(value_type)) - , m_buf_pos(0) - { - m_buffer.reserve(m_max_bufsize); - flush(); - } - - ~buffered_cursor() - { - assert(empty()); - assert(m_buffer.empty()); - } - - bool empty() const - { - return m_base_cur == m_base_end; - } - - void advance() - { - m_buf_pos += 1; - if (m_buf_pos == m_buffer.size()) { - flush(); - } - } - - value_type& value() - { - return m_buffer[m_buf_pos]; - } - - - private: - typedef typename std::iterator_traits::difference_type diff_t; - - void flush() - { - // flush current buffer - assert(m_buf_pos == m_buffer.size()); - std::copy(m_buffer.begin(), m_buffer.end(), m_base_cur); - m_base_cur += m_buffer.size(); - m_buffer.clear(); - m_buf_pos = 0; - - // fill buffer - auto avail = std::min(m_max_bufsize, - std::distance(m_base_cur, m_base_end)); - assert(avail >= 0); - m_buffer.assign(m_base_cur, m_base_cur + avail); - - } - - Iterator m_base_cur, m_base_end; - std::vector m_buffer; - diff_t m_max_bufsize; - size_t m_buf_pos; - }; - - mappings_map m_mappings; - }; - -} diff --git a/gatb-core/thirdparty/emphf/mphf.hpp b/gatb-core/thirdparty/emphf/mphf.hpp deleted file mode 100644 index c1c01a8a1..000000000 --- a/gatb-core/thirdparty/emphf/mphf.hpp +++ /dev/null @@ -1,138 +0,0 @@ -#pragma once - -#include - -#include "bitpair_vector.hpp" -#include "ranked_bitpair_vector.hpp" -#include "perfutils.hpp" - -namespace emphf { - - template - class mphf { - public: - mphf() - {} - - template - mphf(HypergraphSorter& sorter, size_t n, - Range const& input_range, Adaptor adaptor, Progress *progress, - double gamma = 1.23) - : m_n(n) - , m_hash_domain((size_t(std::ceil(double(m_n) * gamma)) + 2) / 3) - { - typedef typename HypergraphSorter::node_t node_t; - typedef typename HypergraphSorter::hyperedge hyperedge; - typedef decltype(*std::begin(input_range)) value_type; - - size_t nodes_domain = m_hash_domain * 3; - - if (nodes_domain >= std::numeric_limits::max()) { - throw std::invalid_argument("Too many nodes for node_t"); - } - - auto edge_gen = [&](value_type s) { - using std::get; - auto hashes = m_hasher(adaptor(s)); - return hyperedge((node_t)(get<0>(hashes) % m_hash_domain), - (node_t)(m_hash_domain + - (get<1>(hashes) % m_hash_domain)), - (node_t)(2 * m_hash_domain + - (get<2>(hashes) % m_hash_domain))); - }; - - std::mt19937_64 rng(37); // deterministic seed - - for (size_t trial = 0; ; ++trial) { - logger() << "Hypergraph generation: trial " << trial << std::endl; - m_hasher = BaseHasher::generate(rng); - if (sorter.try_generate_and_sort(input_range, edge_gen, - m_n, m_hash_domain, progress)) break; - } - - auto peeling_order = sorter.get_peeling_order(); - bitpair_vector bv(nodes_domain); - - logger() << "Assigning values" << std::endl; - size_t d=0; for (auto edge = peeling_order.first; edge != peeling_order.second; ++edge) { d++; } - if (progress) - { - progress->reset (d); - progress->init (); - } - for (auto edge = peeling_order.first; - edge != peeling_order.second; - ++edge) { - - uint64_t target = orientation(*edge); - uint64_t assigned = bv[edge->v1] + bv[edge->v2]; - - // "assigned values" must be nonzeros to be ranked, so - // if the result is 0 we assign 3 - bv.set(edge->v0, ((target - assigned + 9) % 3) ?: 3); - - if (progress) progress->inc (1); - } - if (progress) progress->finish (); - - m_bv.build(std::move(bv)); - } - - uint64_t size() const - { - return m_n; - } - - BaseHasher const& base_hasher() const - { - return m_hasher; - } - - template - uint64_t lookup(T val, Adaptor adaptor) - { - using std::get; - auto hashes = m_hasher(adaptor(val)); - uint64_t nodes[3] = {get<0>(hashes) % m_hash_domain, - m_hash_domain + (get<1>(hashes) % m_hash_domain), - 2 * m_hash_domain + (get<2>(hashes) % m_hash_domain)}; - - uint64_t hidx = (m_bv[nodes[0]] + m_bv[nodes[1]] + m_bv[nodes[2]]) % 3; - return m_bv.rank(nodes[hidx]); - } - - void swap(mphf& other) - { - std::swap(m_n, other.m_n); - std::swap(m_hash_domain, other.m_hash_domain); - m_hasher.swap(other.m_hasher); - m_bv.swap(other.m_bv); - } - - void save(std::ostream& os) const - { - os.write(reinterpret_cast(&m_n), sizeof(m_n)); - os.write(reinterpret_cast(&m_hash_domain), - sizeof(m_hash_domain)); - m_hasher.save(os); - m_bv.save(os); - } - - void load(std::istream& is) - { - is.read(reinterpret_cast(&m_n), sizeof(m_n)); - is.read(reinterpret_cast(&m_hash_domain), - sizeof(m_hash_domain)); - m_hasher.load(is); - m_bv.load(is); - } - - - private: - - uint64_t m_n; - uint64_t m_hash_domain; - BaseHasher m_hasher; - ranked_bitpair_vector m_bv; - }; -} diff --git a/gatb-core/thirdparty/emphf/mphf_hem.hpp b/gatb-core/thirdparty/emphf/mphf_hem.hpp deleted file mode 100644 index a51180e64..000000000 --- a/gatb-core/thirdparty/emphf/mphf_hem.hpp +++ /dev/null @@ -1,312 +0,0 @@ -#pragma once - -#include -#include - -#include "bitpair_vector.hpp" -#include "ranked_bitpair_vector.hpp" -#include "perfutils.hpp" -#include "base_hash.hpp" -#include "hypergraph_sorter_seq.hpp" - -namespace emphf { - - template - class mphf_hem { - public: - - mphf_hem() - {} - - template - mphf_hem(MemoryModel& mm, size_t n, Range const& input_range, - Adaptor adaptor, Progress *progress, double gamma = 1.23, - size_t log2_expected_bucket = 8) - : m_n(n) - { - using std::get; - - std::mt19937_64 rng(37); // deterministic seed - - auto hashes = mm.make_vector(hash_triple_t()); - hashes.reserve(m_n); - - auto high_bits = int(std::ceil(std::log2(n >> log2_expected_bucket))); - m_chunk_shift = size_t(sizeof(get<0>(hash_triple_t())) * 8 - high_bits); - - if (progress) - { - progress->reset (3); // should perform only one mphf round (if it successed); so if counter goes above 100%, means it had to retry - progress->init (); - } - - - while (true) { - m_hasher = BaseHasher::generate(rng); - - hashes.clear(); - - logger() << "Generating hashes" << std::endl; - hash_triple_t max_hash; - for (auto const& val: input_range) { - auto h = m_hasher(adaptor(val)); - hashes.push_back(h); - max_hash = std::max(max_hash, h); - } - - // progress notification - if (progress) - progress->inc (1); - - - logger() << "Sorting hashes" << std::endl; - auto sorted_hashes = - mm.make_sorter(hashes.begin(), - hashes.end(), - [](hash_triple_t const& h) { - return h; - }, - [&](hash_triple_t const& h, size_t k) { - return uint64_t(this->chunk_of(h) * k) >> high_bits; - }); - - // progress notification - if (progress) - progress->inc (1); - - bitpair_vector bv; - logger() << "Generating chunk functions" << std::endl; - if (try_generate_outer_hashes(sorted_hashes, max_hash, - bv, rng, gamma, progress)) { - m_bv.build(std::move(bv)); - - //progress notification - if (progress) - progress->inc (1); - - break; - } - //std::cout << "Unsuccessful mphf_hem creation, retrying.." << std::endl; // (rayan) - if (progress) - progress->inc (1); - } - - if (progress) - progress->finish (); - - - } - - uint64_t size() const - { - return m_n; - } - - BaseHasher const& base_hasher() const - { - return m_hasher; - } - - template - uint64_t lookup(T val, Adaptor adaptor) - { - using std::get; - auto hashes = m_hasher(adaptor(val)); - auto chunk = chunk_of(hashes); - - auto const& chunk_hasher = m_chunk_hashers[chunk]; - auto offset = m_offsets[chunk]; - auto nodes_domain = m_offsets[chunk + 1] - offset; - - auto ih = chunk_hasher(hashes); - auto hd = nodes_domain / 3; - - uint64_t nodes[3] = {offset + get<0>(ih) % hd, - offset + hd + (get<1>(ih) % hd), - offset + 2 * hd + (get<2>(ih) % hd)}; - - uint64_t hidx = (m_bv[nodes[0]] + m_bv[nodes[1]] + m_bv[nodes[2]]) % 3; - return m_bv.rank(nodes[hidx]); - } - - void swap(mphf_hem& other) - { - std::swap(m_n, other.m_n); - std::swap(m_chunk_shift, other.m_chunk_shift); - - m_hasher.swap(other.m_hasher); - m_bv.swap(other.m_bv); - m_offsets.swap(other.m_offsets); - m_chunk_hashers.swap(other.m_chunk_hashers); - } - - void save(std::ostream& os) const - { - os.write(reinterpret_cast(&m_n), sizeof(m_n)); - os.write(reinterpret_cast(&m_chunk_shift), - sizeof(m_chunk_shift)); - m_hasher.save(os); - m_bv.save(os); - - uint64_t chunks = m_chunk_hashers.size(); - os.write(reinterpret_cast(&chunks), - sizeof(chunks)); - - os.write(reinterpret_cast(m_offsets.data()), - (std::streamsize)(sizeof(m_offsets[0]) * (chunks + 1))); - - for (size_t i = 0; i < chunks; ++i) { - m_chunk_hashers[i].save(os); - } - - - } - - void load(std::istream& is) - { - is.read(reinterpret_cast(&m_n), sizeof(m_n)); - is.read(reinterpret_cast(&m_chunk_shift), - sizeof(m_chunk_shift)); - m_hasher.load(is); - m_bv.load(is); - - uint64_t chunks; - is.read(reinterpret_cast(&chunks), - sizeof(chunks)); - - m_offsets.resize(chunks + 1); - is.read(reinterpret_cast(m_offsets.data()), - (std::streamsize)(sizeof(m_offsets[0]) * (chunks + 1))); - - m_chunk_hashers.resize(chunks); - for (size_t i = 0; i < chunks; ++i) { - m_chunk_hashers[i].load(is); - } - } - - - private: - - typedef typename BaseHasher::hash_triple_t hash_triple_t; - typedef OffsetType offset_type; - - template - bool try_generate_outer_hashes(HashesVector& hashes, - hash_triple_t max_hash, - bitpair_vector& bv, - Rng& rng, double gamma, Progress *progress) - { - size_t n_chunks = chunk_of(max_hash) + 1; - m_offsets.resize(n_chunks + 1); - m_chunk_hashers.resize(n_chunks); - - auto hash_it = hashes.begin(); - auto last_hash = *hash_it; - std::vector chunk_hashes; - - while (hash_it != hashes.end()) { - chunk_hashes.clear(); - chunk_hashes.push_back(*hash_it); - auto chunk = chunk_of(chunk_hashes.front()); - - while (++hash_it != hashes.end() && - chunk_of(*hash_it) == chunk) { - if (*hash_it == last_hash) { - std::cout << "MPHF HEM problem: Duplicate hash value found; restarting." << std::endl; - logger() << "Duplicate hash value found; restarting." - << std::endl; - return false; - } - chunk_hashes.push_back(*hash_it); - last_hash = *hash_it; - } - - - while (true) { - m_chunk_hashers[chunk] = BaseHasher::generate(rng); - if (try_generate_inner_hashes(chunk, bv, - chunk_hashes.begin(), - chunk_hashes.end(), - gamma, progress)) { - break; - } - //std::cout << "MPHF HEM : restarting inner hash." << std::endl; - } - } - - return true; - } - - template - bool try_generate_inner_hashes(size_t chunk, bitpair_vector& bv, - Iterator begin, Iterator end, - double gamma, Progress *progress) - { - typedef uint64_t node_t; // (rayan) changed 32 to 64 here - typedef hypergraph_sorter_seq> sorter_t; - sorter_t sorter; - typedef sorter_t::hyperedge hyperedge; - - - auto chunk_size = (size_t)std::distance(begin, end); - - auto chunk_hash_domain = (size_t(std::ceil(double(chunk_size) - * gamma)) + 2) / 3; - auto chunk_nodes = chunk_hash_domain * 3; - - auto const& hasher = m_chunk_hashers[chunk]; - - auto edge_gen = [&](hash_triple_t t) { - using std::get; - auto h = hasher(t); - return hyperedge((node_t)(get<0>(h) % chunk_hash_domain), - (node_t)(chunk_hash_domain + - (get<1>(h) % chunk_hash_domain)), - (node_t)(2 * chunk_hash_domain + - (get<2>(h) % chunk_hash_domain))); - }; - - if (!sorter.try_generate_and_sort(range(begin, end), - edge_gen, - chunk_size, chunk_hash_domain, - progress, - false)) { - return false; - } - - size_t offset = bv.size(); - m_offsets[chunk] = (offset_type)offset; - m_offsets[chunk + 1] = (offset_type)(offset + chunk_nodes); - bv.resize(m_offsets[chunk + 1]); - - auto peeling_order = sorter.get_peeling_order(); - - for (auto edge = peeling_order.first; - edge != peeling_order.second; - ++edge) { - - uint64_t target = orientation(*edge); - uint64_t assigned = bv[offset + edge->v1] + bv[offset + edge->v2]; - - // "assigned values" must be nonzeros to be ranked, so - // if the result is 0 we assign 3 - bv.set(offset + edge->v0, ((target - assigned + 9) % 3) ?: 3); - } - - return true; - } - - size_t chunk_of(hash_triple_t const& h) const - { - return std::get<0>(h) >> m_chunk_shift; - } - - uint64_t m_n; - uint64_t m_chunk_shift; - BaseHasher m_hasher; - ranked_bitpair_vector m_bv; - - std::vector m_offsets; - std::vector m_chunk_hashers; - }; -} diff --git a/gatb-core/thirdparty/emphf/packed_edge_list.hpp b/gatb-core/thirdparty/emphf/packed_edge_list.hpp deleted file mode 100644 index 052f7440c..000000000 --- a/gatb-core/thirdparty/emphf/packed_edge_list.hpp +++ /dev/null @@ -1,269 +0,0 @@ -#pragma once - -#include - -#include "packed_vector.hpp" - -namespace emphf { - - template - class packed_edge_list { - - protected: - // forward declarations - struct value_reference; - - public: - typedef typename Hypergraph::hyperedge value_type; - typedef typename Hypergraph::node_t node_t; - - packed_edge_list() - {} - - packed_edge_list(MemoryModel& mm, uint64_t bits, size_t n) - : m_bits(bits) - , m_n(n) - , m_v(mm, m_bits, n * 3) - {} - - size_t size() const - { - return m_n; - } - - void set(uint64_t pos, value_type v) - { - assert(pos < size()); - m_v.set(pos * 3 + 0, v.v0); - m_v.set(pos * 3 + 1, v.v1); - m_v.set(pos * 3 + 2, v.v2); - } - - value_type get(uint64_t pos) const - { - assert(pos < size()); - node_t v0, v1, v2; - v0 = (node_t)m_v[pos * 3 + 0]; - v1 = (node_t)m_v[pos * 3 + 1]; - v2 = (node_t)m_v[pos * 3 + 2]; - - return value_type(v0, v1, v2); - } - - value_reference operator[](uint64_t pos) - { - return *iterator(this, pos); - } - - value_type operator[](uint64_t pos) const - { - return get(pos); - } - - struct iterator : public std::iterator - { - iterator() - {} - - bool operator==(iterator const& rhs) const - { - assert(m_l == rhs.m_l); - return m_pos == rhs.m_pos; - } - - ptrdiff_t operator-(iterator const& rhs) const - { - return ptrdiff_t(m_pos) - ptrdiff_t(rhs.m_pos); - } - - bool operator<(iterator const& rhs) const - { - assert(m_l == rhs.m_l); - return m_pos < rhs.m_pos; - } - - bool operator>(iterator const& rhs) const - { - assert(m_l == rhs.m_l); - return m_pos > rhs.m_pos; - } - - bool operator>=(iterator const& rhs) const - { - return !(*this < rhs); - } - - bool operator<=(iterator const& rhs) const - { - return !(*this > rhs); - } - - bool operator!=(iterator const& rhs) const - { - return !(*this == rhs); - } - - value_reference operator*() const - { - return value_reference(m_l, m_pos); - } - - iterator& operator+=(ptrdiff_t n) - { - m_pos += n; - return *this; - } - - iterator operator+(ptrdiff_t n) const - { - iterator copy(*this); - return (copy += n); - } - - iterator& operator-=(ptrdiff_t n) - { - return (*this += -n); - } - - iterator operator-(ptrdiff_t n) const - { - iterator copy(*this); - return (copy -= n); - } - - iterator& operator++() - { - return (*this += 1); - } - - iterator operator++(int) - { - iterator copy(*this); - ++(*this); - return copy; - } - - iterator& operator--() - { - return (*this -= 1); - } - - iterator operator--(int) - { - iterator copy(*this); - --(*this); - return copy; - } - - private: - friend class packed_edge_list; - - iterator(packed_edge_list* l, uint64_t pos) - : m_l(l) - , m_pos(pos) - {} - - packed_edge_list* m_l; - uint64_t m_pos; - }; - - iterator begin() - { - return iterator(this, 0); - } - - iterator end() - { - return iterator(this, this->size()); - } - - void swap(packed_edge_list& other) - { - std::swap(m_bits, other.m_bits); - std::swap(m_n, other.m_n); - m_v.swap(other.m_v); - } - - protected: - - // the idea of proxy object is similar to that of vector - struct value_reference { - - operator value_type() const - { - return m_l->get(m_pos); - } - - value_reference& operator=(value_type v) - { - assert(m_l); - m_l->set(m_pos, v); - return *this; - } - - value_reference& operator=(value_reference const& rhs) - { - return *this = value_type(rhs); - } - - bool operator==(value_reference const& rhs) const - { - return value_type(*this) == value_type(rhs); - } - - bool operator<(value_reference const& rhs) const - { - return value_type(*this) < value_type(rhs); - } - - friend inline - void swap(value_reference lhs, value_reference rhs) - { - value_type tmp = lhs; - lhs = value_type(rhs); - rhs = value_type(tmp); - } - - friend inline - void swap(value_type& lhs, value_reference rhs) - { - value_type tmp = lhs; - lhs = value_type(rhs); - rhs = value_type(tmp); - } - - friend inline - void swap(value_reference lhs, value_type& rhs) - { - value_type tmp = lhs; - lhs = value_type(rhs); - rhs = value_type(tmp); - } - - private: - value_reference() - : m_l(nullptr) - , m_pos(-1) - {} - - value_reference(packed_edge_list* l, uint64_t pos = 0) - : m_l(l) - , m_pos(pos) - {} - - value_reference& operator&() = delete; - value_reference& operator&() const = delete; - - friend struct iterator; - - packed_edge_list* m_l; - uint64_t m_pos; - }; - - uint64_t m_bits; - size_t m_n; - packed_vector m_v; - }; - -} diff --git a/gatb-core/thirdparty/emphf/packed_vector.hpp b/gatb-core/thirdparty/emphf/packed_vector.hpp deleted file mode 100644 index d3a19d306..000000000 --- a/gatb-core/thirdparty/emphf/packed_vector.hpp +++ /dev/null @@ -1,118 +0,0 @@ -#pragma once - -#include "common.hpp" - -namespace emphf { - - template - class packed_vector { - - template - using vector = typename MemoryModel::template vector; - - public: - - packed_vector() - : m_size(0) - , m_k(0) - , m_mask(~0ULL) - , m_bits() - {} - - packed_vector(MemoryModel& mm, uint64_t k, uint64_t n = 0) - : m_size(0) - , m_k(k) - , m_mask(m_k == 64 ? ~0ULL : ~(~0ULL << m_k)) - , m_bits(mm.make_vector(uninitialized_uint64())) - { - resize(n); - } - - void resize(uint64_t n) - { - // can only grow, for now - assert(n >= size()); - m_size = n; - m_bits.resize((m_size * m_k + 63) / 64); - } - - size_t size() const - { - return m_size; - } - - void set(uint64_t i, uint64_t val) - { - assert(i < size()); - assert(m_k == 64 || (val >> m_k) == 0); - - uint64_t pos = i * m_k; - uint64_t word_pos = pos / 64; - uint64_t offset_pos = pos % 64; - - m_bits[word_pos] &= ~(m_mask << offset_pos); - m_bits[word_pos] |= val << offset_pos; - - uint64_t stored = 64 - offset_pos; - if (stored < m_k) { - m_bits[word_pos + 1] &= ~(m_mask >> stored); - m_bits[word_pos + 1] |= val >> stored; - } - } - - uint64_t operator[](uint64_t i) const - { - assert(i < size()); - uint64_t pos = i * m_k; - uint64_t word_pos = pos / 64; - uint64_t offset_pos = pos % 64; - - uint64_t val = m_bits[word_pos] >> offset_pos; - uint64_t read = 64 - offset_pos; - - if (read < m_k) { - val |= m_bits[word_pos + 1] << read; - } - - val &= m_mask; - return val; - } - - void swap(packed_vector& other) - { - std::swap(m_size, other.m_size); - std::swap(m_k, other.m_k); - std::swap(m_mask, other.m_mask); - m_bits.swap(other.m_bits); - } - - void save(std::ostream& os) const - { - os.write(reinterpret_cast(&m_size), sizeof(m_size)); - os.write(reinterpret_cast(&m_k), sizeof(m_k)); - os.write(reinterpret_cast(m_bits.data()), - (std::streamsize)(sizeof(m_bits[0]) * m_bits.size())); - } - - void load(std::istream& is) - { - is.read(reinterpret_cast(&m_size), sizeof(m_size)); - is.read(reinterpret_cast(&m_k), sizeof(m_k)); - m_bits.resize((m_size * m_k + 63) / 64); - is.read(reinterpret_cast(m_bits.data()), - (std::streamsize)(sizeof(m_bits[0]) * m_bits.size())); - } - - vector const& data() const - { - return m_bits; - } - - protected: - uint64_t m_size; - uint64_t m_k; - uint64_t m_mask; - vector m_bits; - }; - -} diff --git a/gatb-core/thirdparty/emphf/perfutils.hpp b/gatb-core/thirdparty/emphf/perfutils.hpp deleted file mode 100644 index 6003121c2..000000000 --- a/gatb-core/thirdparty/emphf/perfutils.hpp +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace emphf { - - inline double get_time_usecs() { - struct timeval tv; - gettimeofday(&tv, NULL); - return double(tv.tv_sec) * 1000000 + double(tv.tv_usec); - } - - // stolen from folly - template - inline void do_not_optimize_away(T&& datum) { - asm volatile("" : "+r" (datum)); - } - - struct stats_accumulator { - stats_accumulator() - : m_n(0) - , m_mean(0) - , m_m2(0) - {} - - void add(double x) - { - m_n += 1; - auto delta = x - m_mean; - m_mean += delta / m_n; - m_m2 += delta * (x - m_mean); - } - - double mean() const - { - return m_mean; - } - - double variance() const - { - return m_m2 / (m_n - 1); - } - - double relative_stddev() const - { - return std::sqrt(variance()) / mean() * 100; - } - - private: - double m_n; - double m_mean; - double m_m2; - }; - -} diff --git a/gatb-core/thirdparty/emphf/ranked_bitpair_vector.hpp b/gatb-core/thirdparty/emphf/ranked_bitpair_vector.hpp deleted file mode 100644 index 9444f99a2..000000000 --- a/gatb-core/thirdparty/emphf/ranked_bitpair_vector.hpp +++ /dev/null @@ -1,87 +0,0 @@ -#pragma once - -#include - -#include "emphf_config.hpp" -#include "bitpair_vector.hpp" - -namespace emphf { - - class ranked_bitpair_vector { - public: - - ranked_bitpair_vector() - {} - - void build(bitpair_vector&& bv) - { - m_bv.swap(bv); - - uint64_t cur_rank = 0; - auto const& words = m_bv.data(); - for (size_t i = 0; i < words.size(); ++i) { - if (((i * 32) % pairs_per_block) == 0) { - m_block_ranks.push_back(cur_rank); - } - cur_rank += nonzero_pairs(words[i]); - } - } - - size_t size() const - { - return m_bv.size(); - } - - uint64_t operator[](uint64_t pos) const - { - return m_bv[pos]; - } - - uint64_t rank(uint64_t pos) const - { - uint64_t word_idx = pos / 32; - uint64_t word_offset = pos % 32; - uint64_t block = pos / pairs_per_block; - uint64_t r = m_block_ranks[block]; - - for (uint64_t w = block * pairs_per_block / 32; w < word_idx; ++w) { - r += nonzero_pairs(m_bv.data()[w]); - } - - uint64_t mask = (uint64_t(1) << (word_offset * 2)) - 1; - r += nonzero_pairs(m_bv.data()[word_idx] & mask); - - return r; - } - - void swap(ranked_bitpair_vector& other) - { - m_bv.swap(other.m_bv); - m_block_ranks.swap(other.m_block_ranks); - } - - void save(std::ostream& os) const - { - m_bv.save(os); - assert(m_block_ranks.size() == - (m_bv.size() + pairs_per_block - 1) / pairs_per_block); - os.write(reinterpret_cast(m_block_ranks.data()), - (std::streamsize)(sizeof(m_block_ranks[0]) * m_block_ranks.size())); - } - - void load(std::istream& is) - { - m_bv.load(is); - m_block_ranks.resize((m_bv.size() + pairs_per_block - 1) / pairs_per_block); - is.read(reinterpret_cast(m_block_ranks.data()), - (std::streamsize)(sizeof(m_block_ranks[0]) * m_block_ranks.size())); - } - - protected: - - static const uint64_t pairs_per_block = 512; - bitpair_vector m_bv; - std::vector m_block_ranks; - }; - -} diff --git a/gatb-core/thirdparty/emphf/test b/gatb-core/thirdparty/emphf/test deleted file mode 100644 index ac96ab9d3..000000000 --- a/gatb-core/thirdparty/emphf/test +++ /dev/null @@ -1,1000 +0,0 @@ -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 -21 -22 -23 -24 -25 -26 -27 -28 -29 -30 -31 -32 -33 -34 -35 -36 -37 -38 -39 -40 -41 -42 -43 -44 -45 -46 -47 -48 -49 -50 -51 -52 -53 -54 -55 -56 -57 -58 -59 -60 -61 -62 -63 -64 -65 -66 -67 -68 -69 -70 -71 -72 -73 -74 -75 -76 -77 -78 -79 -80 -81 -82 -83 -84 -85 -86 -87 -88 -89 -90 -91 -92 -93 -94 -95 -96 -97 -98 -99 -100 -101 -102 -103 -104 -105 -106 -107 -108 -109 -110 -111 -112 -113 -114 -115 -116 -117 -118 -119 -120 -121 -122 -123 -124 -125 -126 -127 -128 -129 -130 -131 -132 -133 -134 -135 -136 -137 -138 -139 -140 -141 -142 -143 -144 -145 -146 -147 -148 -149 -150 -151 -152 -153 -154 -155 -156 -157 -158 -159 -160 -161 -162 -163 -164 -165 -166 -167 -168 -169 -170 -171 -172 -173 -174 -175 -176 -177 -178 -179 -180 -181 -182 -183 -184 -185 -186 -187 -188 -189 -190 -191 -192 -193 -194 -195 -196 -197 -198 -199 -200 -201 -202 -203 -204 -205 -206 -207 -208 -209 -210 -211 -212 -213 -214 -215 -216 -217 -218 -219 -220 -221 -222 -223 -224 -225 -226 -227 -228 -229 -230 -231 -232 -233 -234 -235 -236 -237 -238 -239 -240 -241 -242 -243 -244 -245 -246 -247 -248 -249 -250 -251 -252 -253 -254 -255 -256 -257 -258 -259 -260 -261 -262 -263 -264 -265 -266 -267 -268 -269 -270 -271 -272 -273 -274 -275 -276 -277 -278 -279 -280 -281 -282 -283 -284 -285 -286 -287 -288 -289 -290 -291 -292 -293 -294 -295 -296 -297 -298 -299 -300 -301 -302 -303 -304 -305 -306 -307 -308 -309 -310 -311 -312 -313 -314 -315 -316 -317 -318 -319 -320 -321 -322 -323 -324 -325 -326 -327 -328 -329 -330 -331 -332 -333 -334 -335 -336 -337 -338 -339 -340 -341 -342 -343 -344 -345 -346 -347 -348 -349 -350 -351 -352 -353 -354 -355 -356 -357 -358 -359 -360 -361 -362 -363 -364 -365 -366 -367 -368 -369 -370 -371 -372 -373 -374 -375 -376 -377 -378 -379 -380 -381 -382 -383 -384 -385 -386 -387 -388 -389 -390 -391 -392 -393 -394 -395 -396 -397 -398 -399 -400 -401 -402 -403 -404 -405 -406 -407 -408 -409 -410 -411 -412 -413 -414 -415 -416 -417 -418 -419 -420 -421 -422 -423 -424 -425 -426 -427 -428 -429 -430 -431 -432 -433 -434 -435 -436 -437 -438 -439 -440 -441 -442 -443 -444 -445 -446 -447 -448 -449 -450 -451 -452 -453 -454 -455 -456 -457 -458 -459 -460 -461 -462 -463 -464 -465 -466 -467 -468 -469 -470 -471 -472 -473 -474 -475 -476 -477 -478 -479 -480 -481 -482 -483 -484 -485 -486 -487 -488 -489 -490 -491 -492 -493 -494 -495 -496 -497 -498 -499 -500 -501 -502 -503 -504 -505 -506 -507 -508 -509 -510 -511 -512 -513 -514 -515 -516 -517 -518 -519 -520 -521 -522 -523 -524 -525 -526 -527 -528 -529 -530 -531 -532 -533 -534 -535 -536 -537 -538 -539 -540 -541 -542 -543 -544 -545 -546 -547 -548 -549 -550 -551 -552 -553 -554 -555 -556 -557 -558 -559 -560 -561 -562 -563 -564 -565 -566 -567 -568 -569 -570 -571 -572 -573 -574 -575 -576 -577 -578 -579 -580 -581 -582 -583 -584 -585 -586 -587 -588 -589 -590 -591 -592 -593 -594 -595 -596 -597 -598 -599 -600 -601 -602 -603 -604 -605 -606 -607 -608 -609 -610 -611 -612 -613 -614 -615 -616 -617 -618 -619 -620 -621 -622 -623 -624 -625 -626 -627 -628 -629 -630 -631 -632 -633 -634 -635 -636 -637 -638 -639 -640 -641 -642 -643 -644 -645 -646 -647 -648 -649 -650 -651 -652 -653 -654 -655 -656 -657 -658 -659 -660 -661 -662 -663 -664 -665 -666 -667 -668 -669 -670 -671 -672 -673 -674 -675 -676 -677 -678 -679 -680 -681 -682 -683 -684 -685 -686 -687 -688 -689 -690 -691 -692 -693 -694 -695 -696 -697 -698 -699 -700 -701 -702 -703 -704 -705 -706 -707 -708 -709 -710 -711 -712 -713 -714 -715 -716 -717 -718 -719 -720 -721 -722 -723 -724 -725 -726 -727 -728 -729 -730 -731 -732 -733 -734 -735 -736 -737 -738 -739 -740 -741 -742 -743 -744 -745 -746 -747 -748 -749 -750 -751 -752 -753 -754 -755 -756 -757 -758 -759 -760 -761 -762 -763 -764 -765 -766 -767 -768 -769 -770 -771 -772 -773 -774 -775 -776 -777 -778 -779 -780 -781 -782 -783 -784 -785 -786 -787 -788 -789 -790 -791 -792 -793 -794 -795 -796 -797 -798 -799 -800 -801 -802 -803 -804 -805 -806 -807 -808 -809 -810 -811 -812 -813 -814 -815 -816 -817 -818 -819 -820 -821 -822 -823 -824 -825 -826 -827 -828 -829 -830 -831 -832 -833 -834 -835 -836 -837 -838 -839 -840 -841 -842 -843 -844 -845 -846 -847 -848 -849 -850 -851 -852 -853 -854 -855 -856 -857 -858 -859 -860 -861 -862 -863 -864 -865 -866 -867 -868 -869 -870 -871 -872 -873 -874 -875 -876 -877 -878 -879 -880 -881 -882 -883 -884 -885 -886 -887 -888 -889 -890 -891 -892 -893 -894 -895 -896 -897 -898 -899 -900 -901 -902 -903 -904 -905 -906 -907 -908 -909 -910 -911 -912 -913 -914 -915 -916 -917 -918 -919 -920 -921 -922 -923 -924 -925 -926 -927 -928 -929 -930 -931 -932 -933 -934 -935 -936 -937 -938 -939 -940 -941 -942 -943 -944 -945 -946 -947 -948 -949 -950 -951 -952 -953 -954 -955 -956 -957 -958 -959 -960 -961 -962 -963 -964 -965 -966 -967 -968 -969 -970 -971 -972 -973 -974 -975 -976 -977 -978 -979 -980 -981 -982 -983 -984 -985 -986 -987 -988 -989 -990 -991 -992 -993 -994 -995 -996 -997 -998 -999 diff --git a/gatb-core/thirdparty/emphf/test_all.py b/gatb-core/thirdparty/emphf/test_all.py deleted file mode 100755 index 022e1299d..000000000 --- a/gatb-core/thirdparty/emphf/test_all.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python - -import os -import sys -from subprocess import check_call - -DEFAULT_FILE = '/usr/share/dict/words' - -EXES = [ - ('compute_mphf_seq', 'test_mphf'), - ('compute_mphf_scan', 'test_mphf'), - ('compute_mphf_scan_mmap', 'test_mphf'), - ('compute_mphf_hem', 'test_mphf_hem'), - ] - -def main(argv): - if len(argv) == 1: - filename = DEFAULT_FILE - print >> sys.stderr, "Using default file %s" % filename - print >> sys.stderr, "To use another file:" - print >> sys.stderr, "\t%s " % argv[0] - else: - filename = argv[1] - print >> sys.stderr, "Using default file %s" % filename - - - for constructor, tester in EXES: - print >> sys.stderr - print >> sys.stderr, '=' * 4, 'Testing %s' % constructor, '=' * 40 - mphf_name = 'mphf.output.bin' - - check_call(['./' + constructor, filename, mphf_name]) - check_call(['./' + tester, filename, mphf_name, '--check']) - check_call(['rm', mphf_name]) - - -if __name__ == '__main__': - main(sys.argv) diff --git a/gatb-core/thirdparty/emphf/test_mphf.cpp b/gatb-core/thirdparty/emphf/test_mphf.cpp deleted file mode 100644 index ce89a6606..000000000 --- a/gatb-core/thirdparty/emphf/test_mphf.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include -#include - -#include "test_mphf_generic.hpp" -#include "mphf.hpp" -#include "base_hash.hpp" - -int main(int argc, char** argv) -{ - using namespace emphf; - return test_mphf_main>(argc, argv); -} diff --git a/gatb-core/thirdparty/emphf/test_mphf_generic.hpp b/gatb-core/thirdparty/emphf/test_mphf_generic.hpp deleted file mode 100644 index b0e6e4921..000000000 --- a/gatb-core/thirdparty/emphf/test_mphf_generic.hpp +++ /dev/null @@ -1,146 +0,0 @@ -#include -#include - -#include "common.hpp" -#include "perfutils.hpp" - -namespace emphf { - - template - int test_mphf_main(int argc, char** argv) - { - - if (argc < 3) { - std::cerr << "Expected: " << argv[0] - << " [--check]" << std::endl; - std::terminate(); - } - - const char* values_filename = argv[1]; - const char* hash_filename = argv[2]; - - bool check = false; - if (argc > 3 && argv[3] == std::string("--check")) { - logger() << "Will perform results checking (this affects avg. time)" - << std::endl; - check = true; - } - - logger() << "Testing " << values_filename << std::endl; - - // load in memory for faster lookup - std::vector strings_pool; - std::vector string_endpoints; - string_endpoints.push_back(0); - - { - logger() << "Loading strings" << std::endl; - file_lines lines(values_filename); - for (auto& s: lines) { - const char* cs = s.c_str(); - strings_pool.insert(strings_pool.end(), - cs, - cs + s.size() + 1); // add null terminator - string_endpoints.push_back(strings_pool.size()); - } - } - - size_t test_strings = string_endpoints.size() - 1; - - identity_adaptor adaptor; - MPHF mphf; - size_t file_size; - { - logger() << "Loading mphf" << std::endl; - std::ifstream is(hash_filename, std::ios::binary); - mphf.load(is); - file_size = (size_t)is.tellg(); - } - - size_t n = mphf.size(); - - std::vector all_lookups; - if (check) { - all_lookups.reserve(n); - } - - uint8_t const* pool_base = (uint8_t const*)strings_pool.data(); - - logger() << "Performing base hashing (for reference)" << std::endl; - double tick = get_time_usecs(); - for (size_t i = 0; i < test_strings; ++i) { - byte_range_t s(pool_base + string_endpoints[i], - pool_base + string_endpoints[i + 1]); - - auto h = mphf.base_hasher()(adaptor(s)); - do_not_optimize_away(std::get<0>(h)); - } - double elapsed = get_time_usecs() - tick; - - logger() << "Avg. " << elapsed / double(test_strings) - << " usecs per base hash computation" << std::endl; - - logger() << "Performing lookups" << std::endl; - - size_t runs = check ? 1 : 10; - - stats_accumulator stats; - tick = get_time_usecs(); - size_t lookups = 0; - static const size_t lookups_per_sample = 1 << 16; - - for (size_t run = 0; run < runs; ++run) { - for (size_t i = 0; i < test_strings; ++i) { - byte_range_t s(pool_base + string_endpoints[i], - pool_base + string_endpoints[i + 1]); - - uint64_t h = mphf.lookup(s, adaptor); - do_not_optimize_away(h); - - if (check) { - if (h >= n) { - logger() << "ERROR: value out of bounds " - << h << std::endl; - return 2; - } - all_lookups.push_back(h); - } - - if (++lookups == lookups_per_sample) { - elapsed = get_time_usecs() - tick; - stats.add(elapsed / (double)lookups); - tick = get_time_usecs(); - lookups = 0; - } - } - - } - - logger() << "Avg. " << stats.mean() - << " usecs per lookup" << std::endl; - - if (check) { - logger() << "Checking hash output" << std::endl; - std::sort(all_lookups.begin(), all_lookups.end()); - auto distinct_lookups = (size_t)std::distance(all_lookups.begin(), - std::unique(all_lookups.begin(), - all_lookups.end())); - if (distinct_lookups == n) { - logger() << "OK" << std::endl; - } else { - logger() << "Expected " << n << " distinct values, got " - << distinct_lookups << std::endl; - return 1; - } - } - - double bits_per_key = 8.0 * (double)file_size / (double)mphf.size(); - std::cout << "avg_lookup_time\t" << stats.mean() << std::endl - << "stddev_lookup_time_percentage\t" - << stats.relative_stddev() << std::endl - << "bits_per_key\t" << bits_per_key << std::endl; - - return 0; - } - -} diff --git a/gatb-core/thirdparty/emphf/test_mphf_hem.cpp b/gatb-core/thirdparty/emphf/test_mphf_hem.cpp deleted file mode 100644 index e74b0f0f3..000000000 --- a/gatb-core/thirdparty/emphf/test_mphf_hem.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include -#include - -#include "test_mphf_generic.hpp" -#include "mphf_hem.hpp" -#include "base_hash.hpp" - -int main(int argc, char** argv) -{ - using namespace emphf; - return test_mphf_main>(argc, argv); -}