Commit 9e43eb6a authored by Tizian Schulz's avatar Tizian Schulz
Browse files

Added quorum precalculation feature

parent 734dfe1e
......@@ -17,4 +17,11 @@
- An input parameter check has been added to ensure that a given minial seed
length is always positive.
- Seeds that could not be extended without gaps are no longer discarded if
they are longer than the minimal seed length.
\ No newline at end of file
they are longer than the minimal seed length.
- Quorum precalculation is now possible during index building. Information is
saved as part of the index
- Introduced a new index format to enable storage of quorum information.
- Enabled multi-threaded graph loading
- Closed some smaller memory leaks.
- Disabled verbose mode for graph reading and removed some logging info.
- Some refactoring.
\ No newline at end of file
This diff is collapsed.
......@@ -12,51 +12,51 @@
#define DEFAULT_X 3
//This function simply calculates the unity score for two bases ((mis)match = (-)1)
inline int32_t compUScore(const char &q, const char &u){ return (q == u ? USCORE_MATCH : USCORE_MISMATCH); }
inline int32_t compUScore(const char &q, const char &u, const uint16_t &mscore, const int16_t &mmscore){ return (q == u ? mscore : mmscore); }
//The good old X-drop algorithm (extension to the right) for seeds matching the query's reference strand considering quorum and search color set
void startRightX_Drop(Hit* hit, const string &q, const int16_t &X, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
void startRightX_Drop(Hit* hit, const string &q, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//The good old X-drop algorithm (extension to the right) for seeds matching the query's reverse complement considering quorum and search color set
void startRightX_Drop_OnRevComp(Hit* hit, const string &q, const int16_t &X, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
void startRightX_Drop_OnRevComp(Hit* hit, const string &q, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function continues an extension to the right on a successive unitig of a seed lying on the query's reference strand considering a quorum and a search color set. Returns the maximum score reached.
int32_t contRightX_Drop(const neighborIterator<DataAccessor<seedlist>, DataStorage<seedlist>, false> &sucUnitig, const uint32_t &iniQoff, uint32_t &hitLen, const uint32_t &extLen, const string &q, const int16_t &X, const int32_t &lastUniTmpScore, uint32_t uniSeqPos, list<uint16_t> &extPth, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
int32_t contRightX_Drop(const neighborIterator<DataAccessor<UnitigInfo>, DataStorage<UnitigInfo>, false> &sucUnitig, const uint32_t &iniQoff, uint32_t &hitLen, const uint32_t &extLen, const string &q, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &lastSeedTmpScore, uint32_t uniSeqPos, list<uint16_t> &extPth, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function continues an extension to the right on a successive unitig of a seed lying on the query's reverse complement considering a quorum and a search color set. Returns the maximum score reached.
int32_t contRightX_Drop_OnRevComp(const neighborIterator<DataAccessor<seedlist>, DataStorage<seedlist>, false> &sucUnitig, const uint32_t &iniQoff, uint32_t &hitLen, const uint32_t &extLen, const string &q, const int16_t &X, const int32_t &lastUniTmpScore, list<uint16_t> &extPth, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
int32_t contRightX_Drop_OnRevComp(const neighborIterator<DataAccessor<UnitigInfo>, DataStorage<UnitigInfo>, false> &sucUnitig, const uint32_t &iniQoff, uint32_t &hitLen, const uint32_t &extLen, const string &q, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &lastSeedTmpScore, list<uint16_t> &extPth, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function goes through an extension list and incorporates all seeds that are not already part of a seed list into their corresponding seed lists NOTE: This function won't be used at some point anymore!
void processExtPtr(struct Ext_ptr*& extPtr);
//This function initiates the extension on all successors of a unitig and returns the best one considering a quorum and a search color set
int32_t extendAtNextUnitig(const ForwardCDBG<DataAccessor<seedlist>, DataStorage<seedlist>, false> sucIter, const uint32_t &iniQoff, uint32_t &hitLen, const uint32_t extLen, const string &q, const int16_t &X, const int32_t &lastExtSeedTmpScore, uint32_t &uniPos, list<uint16_t> &extPth, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
int32_t extendAtNextUnitig(const ForwardCDBG<DataAccessor<UnitigInfo>, DataStorage<UnitigInfo>, false> sucIter, const uint32_t &iniQoff, uint32_t &hitLen, const uint32_t extLen, const string &q, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &lastExtSeedTmpScore, uint32_t &uniPos, list<uint16_t> &extPth, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function initiates the extension on all successors of a unitig and returns the best one considering a quorum and a search color set. This function is explicitly designed for seeds lying on the query's reverse complement (considering the overlap between unitigs in sequences' beginning)
int32_t extendAtNextUnitig_OnRevComp(const ForwardCDBG<DataAccessor<seedlist>, DataStorage<seedlist>, false> sucIter, const uint32_t &iniQoff, uint32_t &hitLen, const uint32_t extLen, const string &q, const int16_t &X, const int32_t &lastExtSeedTmpScore, list<uint16_t> &extPth, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
int32_t extendAtNextUnitig_OnRevComp(const ForwardCDBG<DataAccessor<UnitigInfo>, DataStorage<UnitigInfo>, false> sucIter, const uint32_t &iniQoff, uint32_t &hitLen, const uint32_t extLen, const string &q, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &lastExtSeedTmpScore, list<uint16_t> &extPth, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function performs an extension on all possible predecessors of a unitig considering a quorum and a search color set and returns the maximum scoring one
int32_t extendAtPrevUnitig(const BackwardCDBG<DataAccessor<seedlist>, DataStorage<seedlist>, false> bwIter, uint32_t qPos, uint32_t &hitLen, const uint32_t &tmpExtLen, const string &q, const int16_t &X, const int32_t &lastExtSeedTmpScore, list<uint16_t> &extPth, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
int32_t extendAtPrevUnitig(const BackwardCDBG<DataAccessor<UnitigInfo>, DataStorage<UnitigInfo>, false> bwIter, uint32_t qPos, uint32_t &hitLen, const uint32_t &tmpExtLen, const string &q, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &lastExtSeedTmpScore, list<uint16_t> &extPth, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function performs an extension on all possible predecessors of a unitig considering a quorum and a search color set and returns the maximum scoring one. This function is explicitly designed for seeds lying on the query's reverse complement (considering the overlap between unitigs in sequences' beginning)
int32_t extendAtPrevUnitigOnRevComp(const BackwardCDBG<DataAccessor<seedlist>, DataStorage<seedlist>, false> bwIter, uint32_t qPos, uint32_t &hitLen, const uint32_t &tmpExtLen, const string &q, const int16_t &X, const int32_t &lastExtSeedTmpScore, list<uint16_t> &extPth, const uint32_t &lead, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
int32_t extendAtPrevUnitigOnRevComp(const BackwardCDBG<DataAccessor<UnitigInfo>, DataStorage<UnitigInfo>, false> bwIter, uint32_t qPos, uint32_t &hitLen, const uint32_t &tmpExtLen, const string &q, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &lastExtSeedTmpScore, list<uint16_t> &extPth, const uint32_t &lead, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function starts the left extension for seeds lying on the query's reference strand considering a quorum and a search color set
void startLeftX_Drop(Hit* hit, const string &q, const int16_t &X, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
void startLeftX_Drop(Hit* hit, const string &q, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function starts the left extension for seeds lying on the query's reverse complement considering a quorum and a search color set
void startLeftX_Drop_OnRevComp(Hit* hit, const string &q, const int16_t &X, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
void startLeftX_Drop_OnRevComp(Hit* hit, const string &q, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function continues a left extension on a predecessive unitig of a seed lying on the query's reference strand considering a quorum and a search color set and returns the achieved score
int32_t contLeftX_Drop(const neighborIterator<DataAccessor<seedlist>, DataStorage<seedlist>, false> &prevUni, uint32_t qPos, uint32_t &hitLen, const string &q, const int16_t &X, const int32_t &lastSeedTmpScore, list<uint16_t> &extPth, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
int32_t contLeftX_Drop(const neighborIterator<DataAccessor<UnitigInfo>, DataStorage<UnitigInfo>, false> &prevUni, uint32_t qPos, uint32_t &hitLen, const string &q, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &lastSeedTmpScore, list<uint16_t> &extPth, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function continues a left extension on a predecessive unitig of a seed lying on the query's reverse complement considering a quorum and a search color set and returns the achieved score
int32_t contLeftX_DropOnRevComp(const neighborIterator<DataAccessor<seedlist>, DataStorage<seedlist>, false> &prevUni, uint32_t qPos, uint32_t &hitLen, const string &q, const int16_t &X, const int32_t &lastSeedTmpScore, list<uint16_t> &extPth, uint32_t uPos, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
int32_t contLeftX_DropOnRevComp(const neighborIterator<DataAccessor<UnitigInfo>, DataStorage<UnitigInfo>, false> &prevUni, uint32_t qPos, uint32_t &hitLen, const string &q, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &lastSeedTmpScore, list<uint16_t> &extPth, uint32_t uPos, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function checks if a hit's start offset for the gapped extension lies inside the overlap of a unitig sequence's end. If so it moves the start position to a unitig where it is not inside the overlap anymore.
void mvStartToValUni(Hit* h, list<uint16_t>&);
void mvStartToValUni(Hit* h, list<uint16_t>& lExtPth);
//This function moves an offset from a given unitig to its successor while keeping left and right extension paths updated. If the offset at the successive unitig lies inside the overlap at the unitig sequence's end the function calls itself recursively
void switUni(uint32_t &offset, UnitigColorMap<seedlist> &currUni, list<uint16_t> &lExtPth, list<uint16_t> &rExtPth);
void switUni(uint32_t &offset, UnitigColorMap<UnitigInfo> &currUni, list<uint16_t> &lExtPth, list<uint16_t> &rExtPth);
#endif
\ No newline at end of file
This diff is collapsed.
......@@ -2,45 +2,46 @@
#define GAPPEDALIGN_HPP
#define MAX_MATRIX_SIZE 1000
#define GAP_SCORE -2
#define GAP_OPEN_SCORE -2
#define GAP_EXT_SCORE -1
#define GAP_RATIO 20
// #define GAP_RATIO 10
#define GAP_SYMB "-"
//This function calculates a semi-global alignment of a unitig and the query sequence considering a quorum and a search color set. Returns true if calculations on unitig could be finished (i.e. the unitig sequence was not to long to be stored inside the edit matrix).
bool calcSemiGlobAlignment(const UnitigColorMap<seedlist> &uni, const string &q, uint32_t& posU, uint32_t& posQ, const uint16_t &X, const uint32_t &maxGaps, uint32_t& maxPosQ, uint32_t& maxPosU, struct Algn &maxAlgn, struct Algn &brdAlgn, int32_t& maxScore, int32_t& eMax, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool srchCritCheck);
//This function calculates a semi-global alignment of a unitig and the query sequence considering a quorum and a search color set. Returns true if calculations on unitig could be finished (i.e. the unitig sequence was not too long to be stored inside the edit matrix).
bool calcSemiGlobAlignment(const UnitigColorMap<UnitigInfo> &uni, const string &q, uint32_t& posU, uint32_t& posQ, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &gOpen, const int32_t &gExt, const uint32_t &maxGaps, uint32_t& maxPosQ, uint32_t& maxPosU, struct Algn &maxAlgn, struct Algn &brdAlgn, int32_t& maxScore, int32_t& eMax, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool srchCritCheck, const bool& advIdx);
//This function calculates a banded alignment of a unitig and the query sequence to the left considering a quorum and a search color set. Returns true if calculations on unitig could be finished (i.e. the unitig sequence was not to long to be stored inside the edit matrix).
bool calcLeftGlobAlignment(const UnitigColorMap<seedlist> &uni, const string &q, uint32_t& posU, uint32_t& posQ, const uint16_t &X, const uint32_t &maxGaps, uint32_t& maxPosQ, uint32_t& maxPosU, struct Algn &maxAlgn, struct Algn &brdAlgn, int32_t& maxScore, int32_t& eMax, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool srchCritCheck);
//This function calculates a banded alignment of a unitig and the query sequence to the left considering a quorum and a search color set. Returns true if calculations on unitig could be finished (i.e. the unitig sequence was not too long to be stored inside the edit matrix).
bool calcLeftGlobAlignment(const UnitigColorMap<UnitigInfo> &uni, const string &q, uint32_t& posU, uint32_t& posQ, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &gOpen, const int32_t &gExt, const uint32_t &maxGaps, uint32_t& maxPosQ, uint32_t& maxPosU, struct Algn &maxAlgn, struct Algn &brdAlgn, int32_t& maxScore, int32_t& eMax, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool srchCritCheck, const bool& advIdx);
//This function calculates the continuation of a gapped alignment on a successive unitig or the next peace of the query considering a quorum and a search color set
void contRightGappedAlignment(UnitigColorMap<seedlist> &uni, list<uint16_t> &extPth, const string &q, uint32_t &posQ, uint32_t &posU, const uint16_t &X, const uint32_t &maxGaps, struct Align &align, int32_t &score, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
void contRightGappedAlignment(UnitigColorMap<UnitigInfo> &uni, list<uint16_t> &extPth, const string &q, uint32_t &posQ, uint32_t &posU, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &gOpen, const int32_t &gExt, const uint32_t &maxGaps, struct Algn &algn, int32_t &score, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function calculates the continuation of a gapped alignment on a predecessive unitig or the next peace of the query considering a quorum and a search color set
void contLeftGappedAlignment(UnitigColorMap<seedlist> &uni, list<uint16_t> &extPth, const string &q, uint32_t &posQ, uint32_t &posU, const uint16_t &X, const uint32_t &maxGaps, struct Algn &algn, int32_t &score, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
void contLeftGappedAlignment(UnitigColorMap<UnitigInfo> &uni, list<uint16_t> &extPth, const string &q, uint32_t &posQ, uint32_t &posU, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &gOpen, const int32_t &gExt, const uint32_t &maxGaps, struct Algn &maxAlgn, int32_t &score, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//Continues a gapped alignment on the same unitig as before considering a quorum and a search color set. Returns true if a new maximal score has been found.
bool contGappedOnSameUni(UnitigColorMap<seedlist> &uni, list<uint16_t> &extPth, const string &q, uint32_t &qOff, uint32_t &uOff, const int16_t &X, const uint32_t &maxGaps, struct Algn &algn, int32_t &score, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
bool contGappedOnSameUni(UnitigColorMap<UnitigInfo> &uni, list<uint16_t> &extPth, const string &q, uint32_t &qOff, uint32_t &uOff, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &gOpen, const int32_t &gExt, const uint32_t &maxGaps, struct Algn &algn, int32_t &score, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function continues a gapped alignment calculation to the left on the same unitig considering a quorum and a search color set. Returns true if a new maximal score has been found.
bool contLeftOnSameUni(UnitigColorMap<seedlist> &uni, list<uint16_t> &extPth, const string &q, uint32_t &qOff, uint32_t &uOff, const int16_t &X, const uint32_t &maxGaps, struct Algn &algn, int32_t &score, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
bool contLeftOnSameUni(UnitigColorMap<UnitigInfo> &uni, list<uint16_t> &extPth, const string &q, uint32_t &qOff, uint32_t &uOff, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &gOpen, const int32_t &gExt, const uint32_t &maxGaps, struct Algn &algn, int32_t &score, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//Initiates a gapped alignment calculation on all successive unitigs considering a quorum and a search color set and returns the alignment's end position in the unitig
bool contGappedOnSuccUni(UnitigColorMap<seedlist> &uni, list<uint16_t> &extPth, const string &q, uint32_t &qOff, uint32_t &uOff, const int16_t &X, const uint32_t &maxGaps, struct Algn &algn, int32_t &score, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
bool contGappedOnSuccUni(UnitigColorMap<UnitigInfo> &uni, list<uint16_t> &extPth, const string &q, uint32_t &qOff, uint32_t &uOff, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &gOpen, const int32_t &gExt, const uint32_t &maxGaps, struct Algn &algn, int32_t &score, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function initiates a gapped alignment calculation on all predecessive unitigs considering a quorum and a search color set. Returns true if calculations have been successful
bool contGappedOnPredUni(UnitigColorMap<seedlist> &uni, list<uint16_t> &extPth, const string &q, uint32_t &qOff, uint32_t &uOff, const int16_t &X, const uint32_t &maxGaps, struct Algn &algn, int32_t &score, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
bool contGappedOnPredUni(UnitigColorMap<UnitigInfo> &uni, list<uint16_t> &extPth, const string &q, uint32_t &qOff, uint32_t &uOff, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &gOpen, const int32_t &gExt, const uint32_t &maxGaps, struct Algn &algn, int32_t &score, uint32_t &explCount, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function calculates a gapped alignment to the right side of the starting position considering a quorum and a search color set. ATTENTION: Hit's length attribute will be deprecated after function call!
void startRightGappedAlignment(Hit *h, const string &q, const uint16_t &X, const uint32_t maxGaps, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
void startRightGappedAlignment(Hit *h, const string &q, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &gOpen, const int32_t &gExt, const uint32_t maxGaps, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function calculates a gapped alignment to the left side of the starting position considering a quorum and a search color set. ATTENTION: Hit's length attribute will be deprecated after function call!
void startLeftGappedAlignment(Hit *h, const string &q, const uint16_t &X, const uint32_t maxGaps, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet);
void startLeftGappedAlignment(Hit *h, const string &q, const uint16_t &mscore, const int16_t &mmscore, const int16_t &X, const int32_t &gOpen, const int32_t &gExt, const uint32_t maxGaps, const uint32_t &quorum, const list<pair<string, size_t>> &searchSet, const bool& advIdx);
//This function traces back the optimal path of a right gapped extension from a given start position within an edit matrix to get the corresponding alignment
void traceBack(uint32_t &matPosI, uint32_t posQ, const string &query, uint32_t &matPosJ, uint32_t posU, const string &uniSeq, int32_t **mat, struct Algn &algn);
void traceBack(uint32_t &matPosI, uint32_t posQ, const string &query, uint32_t &matPosJ, uint32_t posU, const string &uniSeq, int32_t **mat, const uint16_t &mscore, const int16_t &mmscore, const int32_t &gOpen, const int32_t &gExt, struct Algn &algn);
//This function traces back the optimal path of a left gapped extension from a given start position within an edit matrix to get the corresponding alignment
void traceForth(uint32_t &matPosI, uint32_t posQ, const string &query, uint32_t &matPosJ, uint32_t posU, const string &uniSeq, int32_t **mat, struct Algn &algn);
void traceForth(uint32_t &matPosI, uint32_t posQ, const string &query, uint32_t &matPosJ, uint32_t posU, const string &uniSeq, int32_t **mat, const uint16_t &mscore, const int16_t &mmscore, const int32_t &gOpen, const int32_t &gExt, struct Algn &algn);
#endif
\ No newline at end of file
#include "Graph.h"
//This function builds a graph
void genGraph(ColoredCDBG<seedlist> &cdbg, CCDBG_Build_opt &cdbgOpt){
void genGraph(ColoredCDBG<UnitigInfo> &cdbg, CCDBG_Build_opt &cdbgOpt){
//Setting build options
//Print information messages during execution if true. Default is false.
......@@ -16,7 +16,7 @@ void genGraph(ColoredCDBG<seedlist> &cdbg, CCDBG_Build_opt &cdbgOpt){
}
//This function assigns color ids from the graph to color names in the color set
void mapColorIds(list<pair<string, size_t>>& colorSet, const ColoredCDBG<seedlist>& g){
void mapColorIds(list<pair<string, size_t>>& colorSet, const ColoredCDBG<UnitigInfo>& g){
//Get number of colors in the graph
size_t totCol = g.getNbColors();
......
......@@ -9,9 +9,9 @@
#define DEFAULT_NB_THREADS 1
//This function builds a graph
void genGraph(ColoredCDBG<seedlist> &cdbg, CCDBG_Build_opt &cdbgOpt);
void genGraph(ColoredCDBG<UnitigInfo> &cdbg, CCDBG_Build_opt &cdbgOpt);
//This function assigns color ids from the graph to color names in the color set
void mapColorIds(list<pair<string, size_t>>& colorSet, const ColoredCDBG<seedlist>& g);
void mapColorIds(list<pair<string, size_t>>& colorSet, const ColoredCDBG<UnitigInfo>& g);
#endif
\ No newline at end of file
......@@ -9,6 +9,9 @@ void replWorseRes(list<Hit*> &hList, Hit* hit){
if((*i)->score < hit->score){
//Insert hit
hList.insert(i, hit);
//Remove extension paths
frExtPth(hList.back()->rExt);
frExtPth(hList.back()->lExt);
//Remove worst hit
hList.pop_back();
return;
......@@ -40,12 +43,19 @@ struct ExtPth cmprExtPth(const list<uint16_t>& extPth){
//Get number of elements in our path
ePath.nbElem = extPth.size();
//Calculate how many bytes we need to save the path
nbBytes = (ePath.nbElem / SUCCESSORS_PER_BYTE) + ((ePath.nbElem % SUCCESSORS_PER_BYTE) == 0 ? 0 : 1);
//Allocate space for compressed path
ePath.path = (unsigned char*) malloc(nbBytes);
//Make all bits in the first byte zero
*ePath.path = 0;
//Check if we have to allocate memory for path compression
if(ePath.nbElem){
//Testing
// cout << "ePath.nbElem: " << ePath.nbElem << endl;
//Calculate how many bytes we need to save the path
nbBytes = (ePath.nbElem / SUCCESSORS_PER_BYTE) + ((ePath.nbElem % SUCCESSORS_PER_BYTE) == 0 ? 0 : 1);
//Allocate space for compressed path
ePath.path = (unsigned char*) malloc(nbBytes);
//Make all bits in the first byte zero
*ePath.path = 0;
}
//Walk through extension path
for(list<uint16_t>::const_iterator j = extPth.begin(); j != extPth.end(); ++j){
......@@ -105,7 +115,8 @@ const list<uint16_t> decmprExtPth(const struct ExtPth &cmpPth){
++cmpSucs;
}
//Free memory of compressed path
free(cmpPth.path);
//Free memory of compressed path if existing
if(nbBytes) free(cmpPth.path);
return path;
}
\ No newline at end of file
......@@ -27,7 +27,7 @@ class Hit {
public:
//Default constructor
Hit () { lExt.path = NULL; rExt.path = NULL; }
Hit () {}
//Copy constructor
Hit (const Hit &h) {
......@@ -52,7 +52,7 @@ class Hit {
//The hit's e-value
double eval;
//Initial unitig the extension was started
UnitigColorMap<seedlist> origUni;
UnitigColorMap<UnitigInfo> origUni;
//Left-side extension path of extended seed
struct ExtPth lExt;
//Right-side extension path of extended seed
......@@ -84,4 +84,7 @@ const list<uint16_t> decmprExtPth(const struct ExtPth &cmpPth);
//This function compares two hits based on their e-values. Returns true if fh is smaller than sh.
inline bool compEvals(const Hit *fh, const Hit *sh){ return (fh->eval < sh->eval); };
//This function frees the memory allocated for an extension path if it is not empty
inline void frExtPth(const struct ExtPth &ePth){ if(ePth.nbElem) free(ePth.path); };
#endif
\ No newline at end of file
......@@ -3,7 +3,7 @@
#include "IO.h"
//This function parses the program parameters. Returns false if given arguments are not valid.
const bool parseArgs(int& nb_args, char** argList, int16_t& prepros, string& filePref, int32_t& s, int32_t& k, int32_t& g, CCDBG_Build_opt &gOpt, int32_t& t, string& qFile, string& c, uint32_t& m, SrchStrd& strd, bool& r, int16_t& X, uint16_t &nRes, double &lambda, double &lambdaG, double &C, double &Cgap, double &eValLim, bool &isSim){
const bool parseArgs(int& nb_args, char** argList, int16_t& prepros, string& filePref, int32_t& s, int32_t& k, int32_t& g, CCDBG_Build_opt &gOpt, int32_t& t, string& qFile, string& c, uint32_t& m, SrchStrd& strd, bool& r, uint16_t &mscore, int16_t &mmscore, int16_t& X, int32_t &gOpen, int32_t &gExt, uint16_t &nRes, double &lambda, double &lambdaG, double &C, double &Cgap, double &eValLim, bool &isSim, bool &advIdx){
int option_index = 0, a;
//Check wheather arguments are given for anything at all
......@@ -19,15 +19,20 @@ const bool parseArgs(int& nb_args, char** argList, int16_t& prepros, string& fil
{"threads", optional_argument, 0, 't'},
{"query", optional_argument, 0, 'q'},
{"search-set", optional_argument, 0, 's'},
{"quorum", optional_argument, 0, 'm'},
{"quorum", optional_argument, 0, 'Q'},
{"X-dropoff", optional_argument, 0, 'X'},
{"max-results", optional_argument, 0, 'n'},
{"strand", optional_argument, 0, 'd'},
{"strand", optional_argument, 0, 'o'},
{"lambda", optional_argument, 0, 'l'},
{"lambda-gap", optional_argument, 0, 'L'},
{"stat-C", optional_argument, 0, 'c'},
{"stat-C-gap", optional_argument, 0, 'C'},
{"e-value", optional_argument, 0, 'e'},
{"e-val-thres", optional_argument, 0, 'T'},
{"mismatch", optional_argument, 0, 'm'},
{"match", optional_argument, 0, 'M'},
{"gap-open", optional_argument, 0, 'd'},
{"gap-extension", optional_argument, 0, 'e'},
{"advanced-index", no_argument, 0, 'a'},
{"report-colors", no_argument, 0, 'r'},
{"sim-run", no_argument, 0, 'u'},
{0, 0, 0, 0 }
......@@ -96,7 +101,7 @@ const bool parseArgs(int& nb_args, char** argList, int16_t& prepros, string& fil
case 's':
c = optarg;
break;
case 'm':
case 'Q':
//A quorum has to be positive
if(atoi(optarg) <= 0){
cerr << "ERROR: Quorum parameter has to be a positive number" << endl;
......@@ -130,7 +135,7 @@ const bool parseArgs(int& nb_args, char** argList, int16_t& prepros, string& fil
case 'r':
r = true;
break;
case 'd':
case 'o':
if(*optarg == PLUS_STRAND){
strd = Plus;
} else if(*optarg == MINUS_STRAND){
......@@ -184,7 +189,7 @@ const bool parseArgs(int& nb_args, char** argList, int16_t& prepros, string& fil
}
break;
case 'e':
case 'T':
//Try to read e-value threshold
eValLim = strtod(optarg, NULL);
......@@ -194,9 +199,56 @@ const bool parseArgs(int& nb_args, char** argList, int16_t& prepros, string& fil
return false;
}
break;
case 'a':
advIdx = true;
break;
case 'u':
isSim = true;
break;
case 'd':
gOpen = atoi(optarg);
//Check if score is negative
if(gOpen >= 0){
cerr << "ERROR: Gap open scores should be negative" << endl;
return false;
}
break;
case 'e':
gExt = atoi(optarg);
//TODO: This should be uncommented as soon as affine gap scores become possible!
// //Check if score is negative
// if(gExt >= 0){
// cerr << "ERROR Gap extension scores should be negative" << endl;
// return false
// }
//TODO: This warning should be removed as soon as affine gap costs are supported
cerr << "WARNING: Affine gap scores are not supported yet! Setting this parameter has no effect" << endl;
break;
case 'M':
mscore = atoi(optarg);
//Check if score is not positive
if(atoi(optarg) <= 0){
cerr << "ERROR: Match scores should be positive" << endl;
return false;
}
break;
case 'm':
mmscore = atoi(optarg);
//Check if score is not negative
if(mmscore >= 0){
cerr << "ERROR: Mismatch scores should be negative" << endl;
return false;
}
break;
default:
break;
......@@ -212,6 +264,13 @@ const bool parseArgs(int& nb_args, char** argList, int16_t& prepros, string& fil
//The search command additionally needs a query sequence
if(prepros < 0 && !strcmp(qFile.c_str(), "")) return false;
//TODO This has to be inserted into the code as soon as affine gap scores are possible!
// //Check if gap open scores are smaller than gap extension scores
// if(gOpen < gExt){
// cerr << "ERROR: Using gap open scores smaller than gap extension scores makes no sense" << endl;
// return false;
// }
return true;
}
......@@ -328,7 +387,7 @@ void repAlgn(const Hit *res){
}
//This function outputs the color sets of a given result
void outpColSets(ColoredCDBG<seedlist> &cdbg, const Hit *res){
void outpColSets(ColoredCDBG<UnitigInfo> &cdbg, const Hit *res){
bool identical = false;
string kmerSeq = string(res->origUni.getGraph()->getK(), 'A');
......@@ -341,7 +400,7 @@ void outpColSets(ColoredCDBG<seedlist> &cdbg, const Hit *res){
vector<string> colors;
vector<string>::iterator colIt;
Kmer currK;
UnitigColorMap<seedlist> uni;
UnitigColorMap<UnitigInfo> uni;
ColSet curColSet = ColSet(0, colors);
//Go through graph alignment sequence
......
......@@ -4,7 +4,7 @@
#include "Search.h"
#define MIN_PARAM_NB 4
#define OPTIONS "i:s:w:k:g:S:R:t:q:c:m:X:n:d:l:L:C:e:ru"
#define OPTIONS "i:s:w:k:g:S:R:t:q:c:Q:m:M:X:e:n:o:d:l:L:C:T:aru"
#define BUILD_COMMAND "Build"
#define SEARCH_COMMAND "Search"
#define RUNTIME_FLAG_DEFAULT false
......@@ -23,7 +23,7 @@
const list<pair<string, size_t>> loadSearchColors(const char* filename, uint32_t& nbCols);
//This function parses the program parameters. Returns false if given arguments are not valid
const bool parseArgs(int& nb_args, char** argList, int16_t& prepros, string& filePref, int32_t& s, int32_t& k, int32_t& g, CCDBG_Build_opt &gOpt, int32_t& t, string& qFile, string& c, uint32_t& m, SrchStrd& strd, bool& r, int16_t &X, uint16_t &nRes, double &lambda, double &lambdaG, double &C, double &Cgap, double &eValLim, bool &isSim);
const bool parseArgs(int& nb_args, char** argList, int16_t& prepros, string& filePref, int32_t& s, int32_t& k, int32_t& g, CCDBG_Build_opt &gOpt, int32_t& t, string& qFile, string& c, uint32_t& m, SrchStrd& strd, bool& r, uint16_t &mscore, int16_t &mmscore, int16_t& X, int32_t &gOpen, int32_t &gExt, uint16_t &nRes, double &lambda, double &lambdaG, double &C, double &Cgap, double &eValLim, bool &isSim, bool &advIdx);
//This function prints usage infos
inline void dispHelp(){
......@@ -38,9 +38,11 @@ inline void dispHelp(){
cerr << " -w --seed-length Minimal seed length an index is built for (default is 11)" << endl;
cerr << " -k --kmer-length Length of k-mers in a newly built graph (default is 31)" << endl;
cerr << " -g --min-length Length of minimizers in a newly built graph (default is 23)" << endl;
cerr << " -S --input-seqs Names of raw input sequence file(s) (FASTA/FASTQ) to build a new graph from (all sequences of a file will share a color in the graph)" << endl;
cerr << " -R --input-refs Names of reference input sequence file(s) (FASTA/FASTQ) to build a new graph from (all sequences of a file will share a color in the graph)" << endl;
cerr << " -S --input-seqs Names of raw input sequence file(s) (FASTA/FASTQ) to build a new graph from (all sequences from the same file will share a color in the graph)" << endl;
cerr << " -R --input-refs Names of reference input sequence file(s) (FASTA/FASTQ) to build a new graph from (all sequences from the same file will share a color in the graph)" << endl;
cerr << " -t --threads Number of threads to be used for graph construction (default is 1)" << endl << endl;
cerr << " >Optional without argument:" << endl << endl;
cerr << " -a --advanced-index Construct advanced index for faster quorum searches" << endl << endl;
cerr << "[COMMAND_PARAMETERS]: Search" << endl << endl;
cerr << " >Mandatory with required argument:" << endl << endl;
cerr << " -q --query Query sequence file (1 query per line)" << endl;
......@@ -48,15 +50,19 @@ inline void dispHelp(){
cerr << " >Optional with required argument:" << endl << endl;
cerr << " -w --seed-length Minimal seed length (default is 11)" << endl;
cerr << " -s --search-set Search color set file with one color name per line to consider during the search (default is all colors)" << endl;
cerr << " -m --quorum Quorum (absolute value)" << endl;
cerr << " -Q --quorum Absolute quorum (default is 1 color)" << endl;
cerr << " -m --mismatch Mismatch score used (default is -1)" << endl;
cerr << " -M --match Match score used (default is 1)" << endl;
cerr << " -X --X-dropoff X-dropoff value" << endl;
cerr << " -d --gap-open Gap open score used for gapped alignment (default is -2)" << endl;
cerr << " -e --gap-extension Gap extension score used for gapped alignment (default is -2)" << endl;
cerr << " -n --max-results Maximum number of alignments to be outputted (default is " << DEFAULT_NB_RES << ")" << endl;
cerr << " -d --strand DNA strands to consider during search. Can be '+','-' (default is both)" << endl;
cerr << " -o --strand DNA strands to consider during search. Can be '+','-' (default is both)" << endl;
cerr << " -l --lambda Statistical value lambda for ungapped extension" << endl;
cerr << " -L --lambda-gap Statistical value lambda for gapped extension" << endl;
cerr << " -c --stat-C Statistical value C for ungapped extension" << endl;
cerr << " -C --stat-C-gap Statistical value C for gapped extension" << endl;
cerr << " -e --e-value Expectation value threshold for a hit to be considered (default is 10)" << endl << endl;
cerr << " -T --e-val-thres Expectation value threshold for a hit to be considered (default is 10)" << endl << endl;
cerr << " >Optional without argument:" << endl << endl;
cerr << " -r --report-colors Enable alignment color coverage output" << endl;
}
......@@ -68,10 +74,10 @@ void loadQueries(const string &filename, vector<string> &qList);
void repAlgn(const Hit *res);
//This function outputs the color sets of a given result
void outpColSets(ColoredCDBG<seedlist> &cdbg, const Hit *res);
void outpColSets(ColoredCDBG<UnitigInfo> &cdbg, const Hit *res);
//This function iterates over an UnitigColors object and returns the set of all color names
inline const vector<string> formColSet(ColoredCDBG<seedlist> &cdbg, const UnitigColorMap<seedlist> &uni){
inline const vector<string> formColSet(ColoredCDBG<UnitigInfo> &cdbg, const UnitigColorMap<UnitigInfo> &uni){
string color;
//Initialize color set
vector<string> set = vector<string>();
......
......@@ -52,8 +52,10 @@ uint32_t* loadQProf(ifstream &iFile, const uint32_t size){//TODO: Can we make th
}
//This function saves PLAST's index data structures as binary
void saveIndexBin(const char *filename, const uint32_t &profSize, uint32_t* qProfile, const uint32_t numUni, UnitigColorMap<seedlist>* uniArr, struct s_mer_pos*& pos, size_t &seedNum, const int32_t &k){
void saveIndexBin(const char *filename, const int32_t& sdLen, const uint32_t& profSize, uint32_t* qProfile, const uint32_t numUni, UnitigColorMap<UnitigInfo>* uniArr, struct S_mer_pos*& pos, const size_t &seedNum, const int32_t &k, const bool& advIdx){
uint16_t qrm;
size_t i;
int32_t brd;
//Open a file to write
ofstream destFile(filename, ios::out | ios::binary);
......@@ -62,8 +64,16 @@ void saveIndexBin(const char *filename, const uint32_t &profSize, uint32_t* qPro
if(!destFile.is_open()){
cerr << "ERROR: Destination file to save index could not be opened" << endl;
exit(EXIT_FAILURE);
}
}
//Save general meta data//
//Write index version
destFile.write(INDEX_VERSION, sizeof(INDEX_VERSION));
//Write minimum seed length used for this index
destFile.write((char*) &sdLen, sizeof(int32_t));
//Write if this index contains quorum information
destFile.write((char*) &advIdx, sizeof(bool));
//Save the q-gram profile
saveProfile(destFile, profSize, qProfile);
......@@ -73,8 +83,26 @@ void saveIndexBin(const char *filename, const uint32_t &profSize, uint32_t* qPro
for(i = 0; i < numUni; ++i){
//Write the compacted representation of current unitig's starting k-mer to file
destFile.write(cmprSeq(uniArr[i].referenceUnitigToString().substr(0, k), nBytes), nBytes);
//Check if we have to save precalculated quorum information
if(advIdx){
//Write it to file
qrm = uniArr[i].getData()->getData(uniArr[i])->getGlobQrm();
destFile.write((char*) &qrm, sizeof(uint16_t));
qrm = uniArr[i].getData()->getData(uniArr[i])->getLBrdQrm();
destFile.write((char*) &qrm, sizeof(uint16_t));
qrm = uniArr[i].getData()->getData(uniArr[i])->getRBrdQrm();
destFile.write((char*) &qrm, sizeof(uint16_t));
brd = uniArr[i].getData()->getData(uniArr[i])->getPrecLBrd();
destFile.write((char*) &brd, sizeof(int32_t));
brd = uniArr[i].getData()->getData(uniArr[i])->getPrecRBrd();
destFile.write((char*) &brd, sizeof(int32_t));
}
}
//Free unitig array
free(uniArr);
//Save pos array
//Walk through pos array
......@@ -85,13 +113,16 @@ void saveIndexBin(const char *filename, const uint32_t &profSize, uint32_t* qPro
destFile.write((char*) &pos[i].offset, sizeof(uint32_t));
}
//Free pos array
free(pos);
//Close the file
destFile.close();
}
//This function loads the indices from a binary file
void loadIndexesBin(const char* fName, const uint32_t &qProfSize, uint32_t *qProf, ColoredCDBG<seedlist>& graph, UnitigColorMap<seedlist>*& uniArr, size_t &posSize, struct s_mer_pos*& pos){
char *memBlock;
void loadIndexesBin(const char* fName, int32_t& sdLen, uint32_t &qProfSize, uint32_t *&qProf, ColoredCDBG<UnitigInfo>& graph, UnitigColorMap<UnitigInfo>*& uniArr, size_t &posSize, struct S_mer_pos*& pos, bool& advIdx){
char *memBlock, *qrmBlock = NULL, *brdBlock = NULL;