Skip to content

Commit

Permalink
getting rid of emphf; enabling BooPHF by default for Graph, in previs…
Browse files Browse the repository at this point in the history
…ion of dropping support for non-c++11 compilers
  • Loading branch information
rchikhi committed Mar 3, 2017
1 parent 22bb120 commit 1682fed
Show file tree
Hide file tree
Showing 59 changed files with 243 additions and 5,358 deletions.
41 changes: 3 additions & 38 deletions gatb-core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,6 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set (use_new_cxx 1)
endif()

if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7)
set (use_mphf 0)
else()
set (use_mphf 1)
endif()

elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")

if ("${CMAKE_CXX_COMPILER_VERSION}" STREQUAL "")
Expand All @@ -87,18 +81,14 @@ elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")

IF(CMAKE_SYSTEM_NAME MATCHES "(Darwin)") # different clang versions number between linux and mac
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.3)
set (use_mphf 0)
else()
set (use_mphf 1)
set (use_new_cxx 1)
endif()

else()
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.2)
set (use_mphf 0)
else()
set (use_new_cxx 1)
set (use_mphf 1)
endif()

endif()
Expand Down Expand Up @@ -156,7 +146,9 @@ if (use_new_cxx)
set (LIBRARY_COMPILE_DEFINITIONS "${LIBRARY_COMPILE_DEFINITIONS} -DWITH_LAMBDA_EXPRESSION ${CXX_STD_VERSION}")
endif()

# detect SSE
# detect SSE for popcount
# This was needed for emphf; if nothing else depends on it, this part can be removed.
#
# from https://github.com/rurban/smhasher/blob/master/CMakeLists.txt
# i do not see much performance gain for now, but let's keep that code here, might be useful later.
# list of performance gain observed:
Expand Down Expand Up @@ -195,29 +187,6 @@ ENDIF()
if (use_new_cxx)
set (LIBRARY_COMPILE_DEFINITIONS "${LIBRARY_COMPILE_DEFINITIONS} -DUSE_NEW_CXX ")
endif()

if (use_mphf)
set (LIBRARY_COMPILE_DEFINITIONS "${LIBRARY_COMPILE_DEFINITIONS} -DWITH_MPHF ")
message ("-------------------------------------------------------------------------------------")
message ("-- WILL COMPILE MPHF! (COMPILER VERSION IS HIGH ENOUGH) ")
message ("-------------------------------------------------------------------------------------")
if (SSE4_2_FOUND)
set (LIBRARY_COMPILE_DEFINITIONS "${LIBRARY_COMPILE_DEFINITIONS} -DEMPHF_USE_POPCOUNT=1")
endif()

# for emphf
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
message ("-- Using stdlib=libc++ for clang/osx")

IF(CMAKE_SYSTEM_NAME MATCHES "(Darwin)")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
endif()
# On my linux machine, clang 3.7 and clang 3.9 both reject stdlib=libc++: compilation fails because standard headers like <vector> cannot be found,
# so this flag is set for mac only

endif ()

endif()

# In case we use an "old" version of clang with boost and c++0x, we have to skip rvalue usage
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
Expand Down Expand Up @@ -321,10 +290,6 @@ ADD_SUBDIRECTORY(thirdparty)
# we must be sure that hdf5 is built and installed before building gatb-core
ADD_DEPENDENCIES (gatbcore-static hdf5 hdf5_postbuild)

IF (DEFINED WITH_MPHF)
ADD_DEPENDENCIES(gatbcore-static emphf_copyasis)
ENDIF()

################################################################################
# DOCUMENTATION GENERATION
################################################################################
Expand Down
2 changes: 1 addition & 1 deletion gatb-core/src/gatb/bcalm2/bglue_algo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ struct Comp{
template<typename Key>
class hasher_t
{
typedef emphf::jenkins64_hasher BaseHasher;
typedef jenkins64_hasher BaseHasher; /* from BooPHF.hpp, which itself is from emphf:base_hasher */
BaseHasher emphf_hasher;
AdaptatorDefault<Key> adaptor;

Expand Down
2 changes: 1 addition & 1 deletion gatb-core/src/gatb/bcalm2/bglue_algo.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
#include <gatb/kmer/impl/PartiInfo.hpp> // for repartitor
#include <gatb/tools/misc/impl/Progress.hpp>
#include <gatb/tools/designpattern/impl/IteratorHelpers.hpp>
#include <gatb/tools/collections/impl/MPHF.hpp>
#include <gatb/tools/collections/impl/BooPHF.hpp>


//heh at this point I could have maybe just included gatb_core.hpp but well, no circular dependencies, this file is part of gatb-core now.
Expand Down
52 changes: 6 additions & 46 deletions gatb-core/src/gatb/debruijn/impl/Graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ void configure_visitor<Node,Edge,GraphDataVariant>::operator() (GraphData<span>&
data.setBranching (algo.getBranchingCollection());
}

if ((graph._mphfKind != MPHF_NONE) && (graph.getState() & GraphTemplate<Node, Edge, GraphDataVariant>::STATE_MPHF_DONE) && (graph.getState() & GraphTemplate<Node, Edge, GraphDataVariant>::STATE_SORTING_COUNT_DONE))
if ((graph.getState() & GraphTemplate<Node, Edge, GraphDataVariant>::STATE_MPHF_DONE) && (graph.getState() & GraphTemplate<Node, Edge, GraphDataVariant>::STATE_SORTING_COUNT_DONE))
{
typedef typename Kmer<span>::Count Count;
typedef typename Kmer<span>::Type Type;
Expand All @@ -219,7 +219,6 @@ void configure_visitor<Node,Edge,GraphDataVariant>::operator() (GraphData<span>&
Iterable<Type>* solidKmers = new IterableAdaptor<Count,Type,Count2TypeAdaptor<span> > (*solidCounts);

MPHFAlgorithm<span> mphf_algo (
graph._mphfKind,
dskGroup,
"mphf",
solidCounts,
Expand Down Expand Up @@ -474,15 +473,14 @@ void build_visitor_postsolid<Node,Edge,GraphDataVariant>::operator() (GraphData<
Partition<Count>* solidCounts = & dskGroup.getPartition<Count> ("solid");

/** We create an instance of the MPHF Algorithm class (why is that a class, and not a function?) and execute it. */
if (graph._mphfKind != MPHF_NONE && (!graph.checkState(GraphTemplate<Node, Edge, GraphDataVariant>::STATE_MPHF_DONE)))
if ((!graph.checkState(GraphTemplate<Node, Edge, GraphDataVariant>::STATE_MPHF_DONE)))
{
DEBUG ((cout << "build_visitor : MPHFAlgorithm BEGIN\n"));

/** We get the iterable for the solid counts and solid kmers. */
Iterable<Type>* solidKmers = new IterableAdaptor<Count,Type,Count2TypeAdaptor<span> > (*solidCounts);

MPHFAlgorithm<span> mphf_algo (
graph._mphfKind,
dskGroup,
"mphf",
solidCounts,
Expand Down Expand Up @@ -645,20 +643,6 @@ IOptionsParser* GraphTemplate<Node, Edge, GraphDataVariant>::getOptionsParser (b
parser->push_back (DebloomAlgorithm<>::getOptionsParser());
parser->push_back (BranchingAlgorithm<>::getOptionsParser());

/** We activate MPHF option only if available. */
if (MPHF<char>::enabled)
{
IOptionsParser* parserEmphf = new OptionsParser ("mphf");
parserEmphf->push_back (new tools::misc::impl::OptionOneParam (STR_MPHF_TYPE, "mphf type ('none' or 'emphf' or 'BooPHF')", false, "BooPHF"));
parser->push_back (parserEmphf);
}
else //we still activate option for command line compatibility
{
IOptionsParser* parserEmphf = new OptionsParser ("mphf");
parserEmphf->push_back (new tools::misc::impl::OptionOneParam (STR_MPHF_TYPE, "mphf type ('none')", false, "none"));
parser->push_back (parserEmphf);
}

/** We create a "general options" parser. */
IOptionsParser* parserGeneral = new OptionsParser ("general");
parserGeneral->push_front (new OptionOneParam (STR_INTEGER_PRECISION, "integers precision (0 for optimized value)", false, "0", false));
Expand Down Expand Up @@ -747,7 +731,7 @@ GraphTemplate<Node, Edge, GraphDataVariant_t>::GraphTemplate (size_t kmerSize)
_variant(new GraphDataVariant_t()), _kmerSize(kmerSize), _info("graph"),
_state(GraphTemplate<Node, Edge, GraphDataVariant>::STATE_INIT_DONE),
_bloomKind(BLOOM_DEFAULT), _debloomKind(DEBLOOM_DEFAULT), _debloomImpl(DEBLOOM_IMPL_DEFAULT),
_branchingKind(BRANCHING_STORED), _mphfKind(MPHF_NONE)
_branchingKind(BRANCHING_STORED)
{
/** We configure the data variant according to the provided kmer size. */
setVariant (_variant, _kmerSize);
Expand All @@ -768,7 +752,7 @@ template<typename Node, typename Edge, typename GraphDataVariant_t>
GraphTemplate<Node, Edge, GraphDataVariant_t>::GraphTemplate (const std::string& uri)
: _storageMode(PRODUCT_MODE_DEFAULT), _storage(0),
_variant(new GraphDataVariant_t()), _kmerSize(0), _info("graph"),
_name(System::file().getBaseName(uri)), _mphfKind(MPHF_BOOPHF)
_name(System::file().getBaseName(uri))

{
/** We create a storage instance. */
Expand Down Expand Up @@ -816,10 +800,6 @@ GraphTemplate<Node, Edge, GraphDataVariant>::GraphTemplate (bank::IBank* bank, t
parse (params->getStr(STR_DEBLOOM_IMPL), _debloomImpl);
parse (params->getStr(STR_BRANCHING_TYPE), _branchingKind);

/** This one is conditional. */
if (params->get(STR_MPHF_TYPE) && MPHF<char>::enabled) { parse (params->getStr(STR_MPHF_TYPE), _mphfKind); }
else { _mphfKind = MPHF_NONE; }

/** We configure the data variant according to the provided kmer size. */
setVariant (_variant, _kmerSize, integerPrecision);

Expand Down Expand Up @@ -854,10 +834,6 @@ GraphTemplate<Node, Edge, GraphDataVariant>::GraphTemplate (tools::misc::IProper
parse (params->getStr(STR_DEBLOOM_IMPL), _debloomImpl);
parse (params->getStr(STR_BRANCHING_TYPE), _branchingKind);

/** This one is conditional. */
if (params->get(STR_MPHF_TYPE) && MPHF<char>::enabled) { parse (params->getStr(STR_MPHF_TYPE), _mphfKind); }
else { _mphfKind = MPHF_NONE; }

/** We configure the data variant according to the provided kmer size. */
setVariant (_variant, _kmerSize, integerPrecision);

Expand Down Expand Up @@ -925,7 +901,7 @@ GraphTemplate<Node, Edge, GraphDataVariant>::GraphTemplate ()
_variant(new GraphDataVariant()), _kmerSize(0), _info("graph"),
_state(GraphTemplate<Node, Edge, GraphDataVariant>::STATE_INIT_DONE),
_bloomKind(BLOOM_DEFAULT),
_debloomKind(DEBLOOM_DEFAULT), _debloomImpl(DEBLOOM_IMPL_DEFAULT), _branchingKind(BRANCHING_STORED), _mphfKind(MPHF_NONE)
_debloomKind(DEBLOOM_DEFAULT), _debloomImpl(DEBLOOM_IMPL_DEFAULT), _branchingKind(BRANCHING_STORED)
{
//std::cout << "empty graphtemplate constructor" << std::endl;
}
Expand All @@ -941,8 +917,7 @@ GraphTemplate<Node, Edge, GraphDataVariant>::GraphTemplate ()
template<typename Node, typename Edge, typename GraphDataVariant>
GraphTemplate<Node, Edge, GraphDataVariant>::GraphTemplate (const GraphTemplate<Node, Edge, GraphDataVariant>& graph)
: _storageMode(graph._storageMode), _storage(0),
_variant(new GraphDataVariant()), _kmerSize(graph._kmerSize), _info("graph"), _name(graph._name), _state(graph._state),
_mphfKind(graph._mphfKind)
_variant(new GraphDataVariant()), _kmerSize(graph._kmerSize), _info("graph"), _name(graph._name), _state(graph._state)
{
setStorage (graph._storage);

Expand All @@ -969,7 +944,6 @@ GraphTemplate<Node, Edge, GraphDataVariant>& GraphTemplate<Node, Edge, GraphData
_bloomKind = graph._bloomKind;
_debloomKind = graph._debloomKind;
_debloomImpl = graph._debloomImpl;
_mphfKind = graph._mphfKind;
_branchingKind = graph._branchingKind;
_state = graph._state;

Expand Down Expand Up @@ -3497,10 +3471,6 @@ struct allocateAdjacency_visitor : public boost::static_visitor<void> {
template<typename Node, typename Edge, typename GraphDataVariant>
void GraphTemplate<Node, Edge, GraphDataVariant>::precomputeAdjacency(unsigned int nbCores, bool verbose)
{
#ifndef WITH_MPHF
std::cout << "Adjacency precomputation isn't supported when GATB-core is compiled with a non-C++11 compiler" << std::endl;
#else

ProgressGraphIteratorTemplate<Node, ProgressTimerAndSystem> itNode (iterator(), "precomputing adjacency", verbose);

bool hasMPHF = getState() & GraphTemplate<Node, Edge, GraphDataVariant>::STATE_MPHF_DONE;
Expand Down Expand Up @@ -3565,7 +3535,6 @@ void GraphTemplate<Node, Edge, GraphDataVariant>::precomputeAdjacency(unsigned i


// TODO delete _container here
#endif
}

// now deleteNode depends on getNodeAdjacency
Expand Down Expand Up @@ -3720,10 +3689,6 @@ bool GraphTemplate<Node, Edge, GraphDataVariant>::debugCompareNeighborhoods(Node
template<typename Node, typename Edge, typename GraphDataVariant>
void GraphTemplate<Node, Edge, GraphDataVariant>::deleteNodesByIndex(vector<bool> &bitmap, int nbCores, gatb::core::system::ISynchronizer* synchro) const
{
#ifndef WITH_MPHF
std::cout << "Node deletion isn't supported when GATB-core is compiled with a non-C++11 compiler" << std::endl;
#else

GraphIterator<Node> itNode = this->iterator();
Dispatcher dispatcher (nbCores);

Expand All @@ -3743,7 +3708,6 @@ void GraphTemplate<Node, Edge, GraphDataVariant>::deleteNodesByIndex(vector<bool
synchro->unlock();
}
}); // end of parallel node iteration
#endif
}

template<typename Node, typename Edge, typename GraphDataVariant>
Expand Down Expand Up @@ -3825,9 +3789,6 @@ struct allocateNonSimpleNodeCache_visitor : public boost::static_visitor<void>
template<typename Node, typename Edge, typename GraphDataVariant>
void GraphTemplate<Node, Edge, GraphDataVariant>::cacheNonSimpleNodes(unsigned int nbCores, bool verbose)
{
#ifndef WITH_MPHF
std::cout << "cacheNonSimpleNode isn't supported when GATB-core is compiled with a non-C++11 compiler" << std::endl;
#else
boost::apply_visitor (allocateNonSimpleNodeCache_visitor<Node, Edge, GraphDataVariant>(), *(GraphDataVariant*)_variant);
setState(GraphTemplate<Node, Edge, GraphDataVariant>::STATE_NONSIMPLE_CACHE);
GraphIterator<Node> itNode = this->iterator();
Expand All @@ -3845,7 +3806,6 @@ void GraphTemplate<Node, Edge, GraphDataVariant>::cacheNonSimpleNodes(unsigned i
}
}); // end of parallel node iteration
std::cout << "Cached " << nbCachedNodes << " non-simple nodes" << std::endl;
#endif
}

template<typename Node, typename Edge, typename GraphDataVariant>
Expand Down
1 change: 0 additions & 1 deletion gatb-core/src/gatb/debruijn/impl/Graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1074,7 +1074,6 @@ class GraphTemplate
tools::misc::DebloomKind _debloomKind;
tools::misc::DebloomImpl _debloomImpl;
tools::misc::BranchingKind _branchingKind;
tools::misc::MPHFKind _mphfKind;

/** */
GraphIterator<Node> getNodes () const;
Expand Down
33 changes: 18 additions & 15 deletions gatb-core/src/gatb/debruijn/impl/GraphUnitigs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,9 @@ get_from_navigational_vector(const std::vector<uint64_t> &v, uint64_t utig, cons
template<size_t span>
void GraphUnitigsTemplate<span>::load_unitigs(string unitigs_filename)
{
std::cout << "loading unitigs from disk to memory" << std::endl;
bool verbose = (nb_unitigs > 1000000); // big dataset, let's show some memory usage verbosity here
if (verbose)
std::cout << "loading unitigs from disk to memory" << std::endl;

BankFasta inputBank (unitigs_filename);
//bank::IBank* inputBank = Bank::open (unitigs_filename);
Expand Down Expand Up @@ -448,18 +450,21 @@ void GraphUnitigsTemplate<span>::load_unitigs(string unitigs_filename)
// an estimation of memory usage
uint64_t nb_kmers = unitigs.size();
uint64_t mem_vec = (unitigs.capacity() * sizeof(string) + nb_utigs_nucl_mem);
std::cout << "Memory usage:" << std::endl;
std::cout << " " << (sizeof(uint64_t) * incoming.size()) / 1024 / 1024 << " MB keys in incoming dict" << std::endl;
std::cout << " " << (sizeof(uint64_t) * outcoming.size()) / 1024 / 1024 << " MB keys in outcoming dict" << std::endl;
std::cout << " " << (sizeof(uint64_t) * incoming_map.size()) / 1024 / 1024 << " MB keys in incoming_map dict" << std::endl;
std::cout << " " << (sizeof(uint64_t) * outcoming_map.size()) / 1024 / 1024 << " MB keys in outcoming_map dict" << std::endl;
std::cout << " " << mem_vec /1024 /1024 << " MB unitigs nucleotides" << std::endl;
std::cout << " " << (nb_kmers*sizeof(float)) / 1024 / 1024 << " MB unitigs abundances" << std::endl;
std::cout << " " << (2*nb_kmers/8) / 1024 / 1024 << " MB deleted/visited bitvectors" << std::endl;
std::cout << "Estimated total: " << (nb_kmers*(sizeof(float) + 2.0/8.0) + sizeof(uint64_t) * ( incoming.size() + outcoming.size() + incoming.size() + outcoming_map.size()) + mem_vec) / 1024 / 1024 << " MB" << std::endl;

if (nb_utigs_nucl != nb_utigs_nucl_mem)
std::cout << "unitigs strings size " << nb_utigs_nucl << " vs capacity " << nb_utigs_nucl_mem << std::endl;
if (verbose)
{
    // Print an estimate, in MB, of the memory held by each in-memory structure.
    std::cout << "Memory usage:" << std::endl;
    std::cout << " " << (sizeof(uint64_t) * incoming.size()) / 1024 / 1024 << " MB keys in incoming dict" << std::endl;
    std::cout << " " << (sizeof(uint64_t) * outcoming.size()) / 1024 / 1024 << " MB keys in outcoming dict" << std::endl;
    std::cout << " " << (sizeof(uint64_t) * incoming_map.size()) / 1024 / 1024 << " MB keys in incoming_map dict" << std::endl;
    std::cout << " " << (sizeof(uint64_t) * outcoming_map.size()) / 1024 / 1024 << " MB keys in outcoming_map dict" << std::endl;
    std::cout << " " << mem_vec /1024 /1024 << " MB unitigs nucleotides" << std::endl;
    std::cout << " " << (nb_kmers*sizeof(float)) / 1024 / 1024 << " MB unitigs abundances" << std::endl;
    std::cout << " " << (2*nb_kmers/8) / 1024 / 1024 << " MB deleted/visited bitvectors" << std::endl;
    // The total counts each of the four dictionaries (incoming, outcoming,
    // incoming_map, outcoming_map) exactly once, so it matches the lines above.
    std::cout << "Estimated total: " << (nb_kmers*(sizeof(float) + 2.0/8.0) + sizeof(uint64_t) * ( incoming.size() + outcoming.size() + incoming_map.size() + outcoming_map.size()) + mem_vec) / 1024 / 1024 << " MB" << std::endl;

    // Report slack between the nucleotides actually stored and the string capacity reserved.
    if (nb_utigs_nucl != nb_utigs_nucl_mem)
        std::cout << "unitigs strings size " << nb_utigs_nucl << " vs capacity " << nb_utigs_nucl_mem << std::endl;
}
}

/*********************************************************************
Expand Down Expand Up @@ -616,7 +621,6 @@ GraphUnitigsTemplate<span>& GraphUnitigsTemplate<span>::operator= (GraphUnitigsT
BaseGraph::_storageMode = graph._storageMode;
BaseGraph::_name = graph._name;
BaseGraph::_info = graph._info;
BaseGraph::_mphfKind = graph._mphfKind;
BaseGraph::_state = graph._state;

BaseGraph::setStorage (graph._storage);
Expand Down Expand Up @@ -660,7 +664,6 @@ GraphUnitigsTemplate<span>& GraphUnitigsTemplate<span>::operator= (GraphUnitigsT
BaseGraph::_storageMode = graph._storageMode;
BaseGraph::_name = graph._name;
BaseGraph::_info = graph._info;
BaseGraph::_mphfKind = graph._mphfKind;
BaseGraph::_state = graph._state;

BaseGraph::setStorage (graph._storage);
Expand Down
Loading

0 comments on commit 1682fed

Please sign in to comment.