/* BWTConstruct.c BWT-Index Construction This module constructs BWT and auxiliary data structures. Copyright (C) 2004, Wong Chi Kwong. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include #include "bwt_gen.h" #include "QSufSort.h" static unsigned int TextLengthFromBytePacked(unsigned int bytePackedLength, unsigned int bitPerChar, unsigned int lastByteLength) { if (bytePackedLength > ALL_ONE_MASK / (BITS_IN_BYTE / bitPerChar)) { fprintf(stderr, "TextLengthFromBytePacked(): text length > 2^32!\n"); exit(1); } return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength; } static void initializeVAL(unsigned int *startAddr, const unsigned int length, const unsigned int initValue) { unsigned int i; for (i=0; i>= 2; } } } // for BWTIncCreate() static unsigned int BWTOccValueMajorSizeInWord(const unsigned int numChar) { unsigned int numOfOccValue; unsigned int numOfOccIntervalPerMajor; numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE; } // for BWTIncCreate() static unsigned int BWTOccValueMinorSizeInWord(const unsigned int numChar) { unsigned int numOfOccValue; numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE; } // for BWTIncCreate() static unsigned int BWTResidentSizeInWord(const unsigned int numChar) { unsigned int numCharRoundUpToOccInterval; // The $ in BWT at the position of inverseSa0 is not encoded numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL; return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD; } static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) { unsigned int maxBuildSize; if (bwtInc->bwt->textLength == 0) { // initial build // Minus 2 because n+1 entries of seq and rank needed for n char maxBuildSize = (bwtInc->availableWord - 2 - OCC_INTERVAL / CHAR_PER_WORD) / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD; if (bwtInc->initialMaxBuildSize > 0) { bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize); } else { bwtInc->buildSize = maxBuildSize; } } else { // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char // Minus numberOfIterationDone because bwt slightly shift to left in each iteration maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3 - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) / 3; if (maxBuildSize < CHAR_PER_WORD) { fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); exit(1); } if (bwtInc->incMaxBuildSize > 0) { bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize); } else { bwtInc->buildSize = maxBuildSize; } if (bwtInc->buildSize < CHAR_PER_WORD) { bwtInc->buildSize = CHAR_PER_WORD; } } if (bwtInc->buildSize < CHAR_PER_WORD) { fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); exit(1); } bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1); bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + bwtInc->buildSize + 1); } // for ceilLog2() unsigned int leadingZero(const unsigned int input) { unsigned int l; const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; if (input & 0xFFFF0000) { if (input & 0xFF000000) { l = leadingZero8bit[input >> 24]; } else { l = 8 + leadingZero8bit[input >> 16]; } } else { if (input & 0x0000FF00) { l = 16 + leadingZero8bit[input >> 8]; } else { l = 24 + leadingZero8bit[input]; } } return l; } // for BitPerBytePackedChar() static unsigned int ceilLog2(const unsigned int input) { if (input <= 1) return 0; return BITS_IN_WORD - leadingZero(input - 1); } // for ConvertBytePackedToWordPacked() static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize) { unsigned int bitPerChar; bitPerChar = ceilLog2(alphabetSize); // Return the largest number of bit that does not affect packing efficiency if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar) bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar); return bitPerChar; } // for ConvertBytePackedToWordPacked() static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize) { return ceilLog2(alphabetSize); } static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize, const unsigned int textLength) { unsigned int i, j, k; unsigned int c; unsigned int bitPerBytePackedChar; unsigned int bitPerWordPackedChar; unsigned int charPerWord; unsigned int charPerByte; unsigned int bytePerIteration; unsigned int byteProcessed = 0; unsigned int wordProcessed = 0; unsigned int mask, shift; unsigned int buffer[BITS_IN_WORD]; bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize); bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize); charPerByte = BITS_IN_BYTE / bitPerBytePackedChar; charPerWord = BITS_IN_WORD / bitPerWordPackedChar; bytePerIteration = charPerWord / charPerByte; mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar); shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar; while ((wordProcessed + 1) * charPerWord < textLength) { k = 0; for (i=0; i> bitPerWordPackedChar * i; } output[wordProcessed] = c; wordProcessed++; } k = 0; for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) { c = (unsigned int)input[byteProcessed] << shift; for (j=0; j> bitPerWordPackedChar * i; } output[wordProcessed] = c; } BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable) { BWT *bwt; bwt = (BWT*)calloc(1, sizeof(BWT)); bwt->textLength = 0; bwt->inverseSa = 0; bwt->cumulativeFreq = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int*)); initializeVAL(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); bwt->bwtSizeInWord = 0; bwt->saValueOnBoundary = NULL; // Generate decode tables if (decodeTable == NULL) { bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int)); GenerateDNAOccCountTable(bwt->decodeTable); } else { bwt->decodeTable = decodeTable; } bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength); bwt->occValueMajor = (unsigned*)calloc(bwt->occMajorSizeInWord, sizeof(unsigned int)); bwt->occSizeInWord = 0; bwt->occValue = NULL; bwt->saInterval = ALL_ONE_MASK; bwt->saValueSize = 0; bwt->saValue = NULL; bwt->inverseSaInterval = ALL_ONE_MASK; bwt->inverseSaSize = 0; bwt->inverseSa = NULL; return bwt; } BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit, const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize) { BWTInc *bwtInc; unsigned int i; if (targetNBit == 0) { fprintf(stderr, "BWTIncCreate() : targetNBit = 0!\n"); exit(1); } bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc)); bwtInc->numberOfIterationDone = 0; bwtInc->bwt = BWTCreate(textLength, NULL); bwtInc->initialMaxBuildSize = initialMaxBuildSize; bwtInc->incMaxBuildSize = incMaxBuildSize; bwtInc->targetNBit = targetNBit; bwtInc->cumulativeCountInCurrentBuild = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int)); initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); // Build frequently accessed data bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int)); for (i=0; ipackedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; } bwtInc->targetTextLength = textLength; bwtInc->availableWord = (unsigned int)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit); if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) { fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n"); exit(1); } bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD); return bwtInc; } // for BWTIncConstruct() static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned int* __restrict rank, unsigned int* __restrict cumulativeCount, const unsigned int numChar) { unsigned int i, j; unsigned int c, t; unsigned int packedMask; unsigned int rankIndex; unsigned int lastWord, numCharInLastWord; lastWord = (numChar - 1) / CHAR_PER_WORD; numCharInLastWord = numChar - lastWord * CHAR_PER_WORD; packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR); rankIndex = numChar - 1; t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR); for (i=0; i>= BIT_PER_CHAR; } for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 t = packedText[i]; for (j=0; j>= BIT_PER_CHAR; } } // Convert occurrence to cumulativeCount cumulativeCount[2] += cumulativeCount[1]; cumulativeCount[3] += cumulativeCount[2]; cumulativeCount[4] += cumulativeCount[3]; } static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigned int index, unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable) { static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; unsigned int iteration, wordToCount, charToCount; unsigned int i, j, c; unsigned int sum; occCount[0] = 0; occCount[1] = 0; occCount[2] = 0; occCount[3] = 0; iteration = index / 256; wordToCount = (index - iteration * 256) / 16; charToCount = index - iteration * 256 - wordToCount * 16; for (i=0; i> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; dna++; } if (!DNA_OCC_SUM_EXCEPTION(sum)) { occCount[0] += sum & 0x000000FF; sum >>= 8; occCount[1] += sum & 0x000000FF; sum >>= 8; occCount[2] += sum & 0x000000FF; sum >>= 8; occCount[3] += sum; } else { // only some or all of the 3 bits are on // in reality, only one of the four cases are possible if (sum == 0x00000100) { occCount[0] += 256; } else if (sum == 0x00010000) { occCount[1] += 256; } else if (sum == 0x01000000) { occCount[2] += 256; } else if (sum == 0x00000000) { occCount[3] += 256; } else { fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n"); exit(1); } } } sum = 0; for (j=0; j> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; dna++; } if (charToCount > 0) { c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0xFFFF]; sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess } occCount[0] += sum & 0x000000FF; sum >>= 8; occCount[1] += sum & 0x000000FF; sum >>= 8; occCount[2] += sum & 0x000000FF; sum >>= 8; occCount[3] += sum; } static void BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int* __restrict bwt, const unsigned int numChar, const unsigned int *cumulativeCount, const unsigned int *packedShift) { unsigned int i, c, r; unsigned int previousRank, currentRank; unsigned int wordIndex, charIndex; unsigned int inverseSa0; inverseSa0 = previousRank = relativeRank[0]; for (i=1; i<=numChar; i++) { currentRank = relativeRank[i]; // previousRank > cumulativeCount[c] because $ is one of the char c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2]) + (previousRank > cumulativeCount[3]); // set bwt for currentRank if (c > 0) { // c <> 'a' r = currentRank; if (r > inverseSa0) { // - 1 because $ at inverseSa0 is not encoded r--; } wordIndex = r / CHAR_PER_WORD; charIndex = r - wordIndex * CHAR_PER_WORD; bwt[wordIndex] |= c << packedShift[charIndex]; } previousRank = currentRank; } } static inline unsigned int BWTOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit, const unsigned int character) { unsigned int occIndexMajor; occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR; if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) { return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16); } else { return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF); } } static unsigned int ForwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable) { static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; unsigned int wordToCount, charToCount; unsigned int i, c; unsigned int sum = 0; wordToCount = index / 16; charToCount = index - wordToCount * 16; for (i=0; i> 16]; sum += dnaDecodeTable[dna[i] & 0x0000FFFF]; } if (charToCount > 0) { c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0xFFFF]; sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess } return (sum >> (character * 8)) & 0x000000FF; } static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable) { static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F, 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF, 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF, 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF }; unsigned int wordToCount, charToCount; unsigned int i, c; unsigned int sum = 0; wordToCount = index / 16; charToCount = index - wordToCount * 16; dna -= wordToCount + 1; if (charToCount > 0) { c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0xFFFF]; sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess } for (i=0; i> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; } return (sum >> (character * 8)) & 0x000000FF; } unsigned int BWTOccValue(const BWT *bwt, unsigned int index, const unsigned int character) { unsigned int occValue; unsigned int occExplicitIndex, occIndex; // $ is supposed to be positioned at inverseSa0 but it is not encoded // therefore index is subtracted by 1 for adjustment if (index > bwt->inverseSa0) { index--; } occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding occIndex = occExplicitIndex * OCC_INTERVAL; occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character); if (occIndex == index) { return occValue; } if (occIndex < index) { return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable); } else { return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable); } } static unsigned int BWTIncGetAbsoluteRank(BWT *bwt, unsigned int* __restrict absoluteRank, unsigned int* __restrict seq, const unsigned int *packedText, const unsigned int numChar, const unsigned int* cumulativeCount, const unsigned int firstCharInLastIteration) { unsigned int saIndex; unsigned int lastWord; unsigned int packedMask; unsigned int i, j; unsigned int c, t; unsigned int rankIndex; unsigned int shift; unsigned int seqIndexFromStart[ALPHABET_SIZE]; unsigned int seqIndexFromEnd[ALPHABET_SIZE]; for (i=0; i> shift; saIndex = bwt->inverseSa0; rankIndex = numChar - 1; lastWord = numChar / CHAR_PER_WORD; for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 t = packedText[i]; for (j=0; jcumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1; // A counting sort using the first character of suffix is done here // If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0 if (saIndex > bwt->inverseSa0) { seq[seqIndexFromEnd[c]] = rankIndex; absoluteRank[seqIndexFromEnd[c]] = saIndex; seqIndexFromEnd[c]--; } else { seq[seqIndexFromStart[c]] = rankIndex; absoluteRank[seqIndexFromStart[c]] = saIndex; seqIndexFromStart[c]++; } rankIndex--; t >>= BIT_PER_CHAR; } } absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0; // representing the substring of all preceding characters seq[seqIndexFromStart[firstCharInLastIteration]] = numChar; return seqIndexFromStart[firstCharInLastIteration]; } static void BWTIncSortKey(unsigned int* __restrict key, unsigned int* __restrict seq, const unsigned int numItem) { #define EQUAL_KEY_THRESHOLD 4 // Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD int lowIndex, highIndex, midIndex; int lowPartitionIndex, highPartitionIndex; int lowStack[32], highStack[32]; int stackDepth; int i, j; unsigned int tempSeq, tempKey; int numberOfEqualKey; if (numItem < 2) return; stackDepth = 0; lowIndex = 0; highIndex = numItem - 1; for (;;) { for (;;) { // Sort small array of data if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) { // Insertion sort on smallest arrays for (i=lowIndex+1; i<=highIndex; i++) { tempSeq = seq[i]; tempKey = key[i]; for (j = i; j > lowIndex && key[j-1] > tempKey; j--) { seq[j] = seq[j-1]; key[j] = key[j-1]; } if (j != i) { seq[j] = tempSeq; key[j] = tempKey; } } break; } // Choose pivot as median of the lowest, middle, and highest data; sort the three data midIndex = average(lowIndex, highIndex); if (key[lowIndex] > key[midIndex]) { tempSeq = seq[lowIndex]; tempKey = key[lowIndex]; seq[lowIndex] = seq[midIndex]; key[lowIndex] = key[midIndex]; seq[midIndex] = tempSeq; key[midIndex] = tempKey; } if (key[lowIndex] > key[highIndex]) { tempSeq = seq[lowIndex]; tempKey = key[lowIndex]; seq[lowIndex] = seq[highIndex]; key[lowIndex] = key[highIndex]; seq[highIndex] = tempSeq; key[highIndex] = tempKey; } if (key[midIndex] > key[highIndex]) { tempSeq = seq[midIndex]; tempKey = key[midIndex]; seq[midIndex] = seq[highIndex]; key[midIndex] = key[highIndex]; seq[highIndex] = tempSeq; key[highIndex] = tempKey; } // Partition data numberOfEqualKey = 0; lowPartitionIndex = lowIndex + 1; highPartitionIndex = highIndex - 1; for (;;) { while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= key[midIndex]) { numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]); lowPartitionIndex++; } while (lowPartitionIndex < highPartitionIndex) { if (key[midIndex] >= key[highPartitionIndex]) { numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]); break; } highPartitionIndex--; } if (lowPartitionIndex >= highPartitionIndex) { break; } tempSeq = seq[lowPartitionIndex]; tempKey = key[lowPartitionIndex]; seq[lowPartitionIndex] = seq[highPartitionIndex]; key[lowPartitionIndex] = key[highPartitionIndex]; seq[highPartitionIndex] = tempSeq; key[highPartitionIndex] = tempKey; if (highPartitionIndex == midIndex) { // partition key has been moved midIndex = lowPartitionIndex; } lowPartitionIndex++; highPartitionIndex--; } // Adjust the partition index highPartitionIndex = lowPartitionIndex; lowPartitionIndex--; // move the partition key to end of low partition tempSeq = seq[midIndex]; tempKey = key[midIndex]; seq[midIndex] = seq[lowPartitionIndex]; key[midIndex] = key[lowPartitionIndex]; seq[lowPartitionIndex] = tempSeq; key[lowPartitionIndex] = tempKey; if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) { // Many keys = partition key; separate the equal key data from the lower partition midIndex = lowIndex; for (;;) { while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) { midIndex++; } while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) { lowPartitionIndex--; } if (midIndex >= lowPartitionIndex) { break; } tempSeq = seq[midIndex]; tempKey = key[midIndex]; seq[midIndex] = seq[lowPartitionIndex - 1]; key[midIndex] = key[lowPartitionIndex - 1]; seq[lowPartitionIndex - 1] = tempSeq; key[lowPartitionIndex - 1] = tempKey; midIndex++; lowPartitionIndex--; } } if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) { // put the larger partition to stack lowStack[stackDepth] = lowIndex; highStack[stackDepth] = lowPartitionIndex - 1; stackDepth++; // sort the smaller partition first lowIndex = highPartitionIndex; } else { // put the larger partition to stack lowStack[stackDepth] = highPartitionIndex; highStack[stackDepth] = highIndex; stackDepth++; // sort the smaller partition first if (lowPartitionIndex > lowIndex) { highIndex = lowPartitionIndex - 1; } else { // all keys in the partition equals to the partition key break; } } continue; } // Pop a range from stack if (stackDepth > 0) { stackDepth--; lowIndex = lowStack[stackDepth]; highIndex = highStack[stackDepth]; continue; } else return; } } static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigned int* __restrict seq, unsigned int* __restrict relativeRank, const unsigned int numItem, unsigned int oldInverseSa0, const unsigned int *cumulativeCount) { unsigned int i, c; unsigned int s, r; unsigned int lastRank, lastIndex; unsigned int oldInverseSa0RelativeRank = 0; unsigned int freq; lastIndex = numItem; lastRank = sortedRank[numItem]; if (lastRank > oldInverseSa0) { sortedRank[numItem]--; // to prepare for merging; $ is not encoded in bwt } s = seq[numItem]; relativeRank[s] = numItem; if (lastRank == oldInverseSa0) { oldInverseSa0RelativeRank = numItem; oldInverseSa0++; // so that this segment of code is not run again lastRank++; // so that oldInverseSa0 become a sorted group with 1 item } c = ALPHABET_SIZE - 1; freq = cumulativeCount[c]; for (i=numItem; i--;) { // from numItem - 1 to 0 r = sortedRank[i]; if (r > oldInverseSa0) { sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt } s = seq[i]; if (i < freq) { if (lastIndex >= freq) { lastRank++; // to trigger the group across alphabet boundary to be split } c--; freq = cumulativeCount[c]; } if (r == lastRank) { relativeRank[s] = lastIndex; } else { if (i == lastIndex - 1) { if (lastIndex < numItem && (int)seq[lastIndex + 1] < 0) { seq[lastIndex] = seq[lastIndex + 1] - 1; } else { seq[lastIndex] = (unsigned int)-1; } } lastIndex = i; lastRank = r; relativeRank[s] = i; if (r == oldInverseSa0) { oldInverseSa0RelativeRank = i; oldInverseSa0++; // so that this segment of code is not run again lastRank++; // so that oldInverseSa0 become a sorted group with 1 item } } } } static void BWTIncBuildBwt(unsigned int* seq, const unsigned int *relativeRank, const unsigned int numChar, const unsigned int *cumulativeCount) { unsigned int i, c; unsigned int previousRank, currentRank; previousRank = relativeRank[0]; for (i=1; i<=numChar; i++) { currentRank = relativeRank[i]; c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2]) + (previousRank >= cumulativeCount[3]); seq[currentRank] = c; previousRank = currentRank; } } static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt, unsigned int* __restrict mergedBwt, const unsigned int numOldBwt, const unsigned int numInsertBwt) { unsigned int bitsInWordMinusBitPerChar; unsigned int leftShift, rightShift; unsigned int o; unsigned int oIndex, iIndex, mIndex; unsigned int mWord, mChar, oWord, oChar; unsigned int numInsert; bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR; oIndex = 0; iIndex = 0; mIndex = 0; mWord = 0; mChar = 0; mergedBwt[0] = 0; // this can be cleared as merged Bwt slightly shift to the left in each iteration while (oIndex < numOldBwt) { // copy from insertBwt while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) { if (sortedRank[iIndex] != 0) { // special value to indicate that this is for new inverseSa0 mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); mIndex++; mChar++; if (mChar == CHAR_PER_WORD) { mChar = 0; mWord++; mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary } } iIndex++; } // Copy from oldBwt to mergedBwt if (iIndex <= numInsertBwt) { o = sortedRank[iIndex]; } else { o = numOldBwt; } numInsert = o - oIndex; oWord = oIndex / CHAR_PER_WORD; oChar = oIndex - oWord * CHAR_PER_WORD; if (oChar > mChar) { leftShift = (oChar - mChar) * BIT_PER_CHAR; rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR; mergedBwt[mWord] = mergedBwt[mWord] | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)) | (oldBwt[oWord+1] >> rightShift); oIndex += min(numInsert, CHAR_PER_WORD - mChar); while (o > oIndex) { oWord++; mWord++; mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift); oIndex += CHAR_PER_WORD; } } else if (oChar < mChar) { rightShift = (mChar - oChar) * BIT_PER_CHAR; leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR; mergedBwt[mWord] = mergedBwt[mWord] | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)); oIndex += min(numInsert, CHAR_PER_WORD - mChar); while (o > oIndex) { oWord++; mWord++; mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift); oIndex += CHAR_PER_WORD; } } else { // oChar == mChar mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR); oIndex += min(numInsert, CHAR_PER_WORD - mChar); while (o > oIndex) { oWord++; mWord++; mergedBwt[mWord] = oldBwt[oWord]; oIndex += CHAR_PER_WORD; } } oIndex = o; mIndex += numInsert; // Clear the trailing garbage in mergedBwt mWord = mIndex / CHAR_PER_WORD; mChar = mIndex - mWord * CHAR_PER_WORD; if (mChar == 0) { mergedBwt[mWord] = 0; } else { mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR)); } } // copy from insertBwt while (iIndex <= numInsertBwt) { if (sortedRank[iIndex] != 0) { mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); mIndex++; mChar++; if (mChar == CHAR_PER_WORD) { mChar = 0; mWord++; mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary } } iIndex++; } } void BWTClearTrailingBwtCode(BWT *bwt) { unsigned int bwtResidentSizeInWord; unsigned int wordIndex, offset; unsigned int i; bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength); wordIndex = bwt->textLength / CHAR_PER_WORD; offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR; if (offset > 0) { bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset); } else { if (wordIndex < bwtResidentSizeInWord) { bwt->bwtCode[wordIndex] = 0; } } for (i=wordIndex+1; ibwtCode[i] = 0; } } void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue, unsigned int* __restrict occValueMajor, const unsigned int textLength, const unsigned int* decodeTable) { unsigned int numberOfOccValueMajor, numberOfOccValue; unsigned int wordBetweenOccValue; unsigned int numberOfOccIntervalPerMajor; unsigned int c; unsigned int i, j; unsigned int occMajorIndex; unsigned int occIndex, bwtIndex; unsigned int sum; unsigned int tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE]; wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD; // Calculate occValue // [lh3] by default: OCC_INTERVAL_MAJOR=65536, OCC_INTERVAL=256 numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor; tempOccValue0[0] = 0; tempOccValue0[1] = 0; tempOccValue0[2] = 0; tempOccValue0[3] = 0; occValueMajor[0] = 0; occValueMajor[1] = 0; occValueMajor[2] = 0; occValueMajor[3] = 0; occIndex = 0; bwtIndex = 0; for (occMajorIndex=1; occMajorIndex> 16]; sum += decodeTable[c & 0x0000FFFF]; bwtIndex++; } if (!DNA_OCC_SUM_EXCEPTION(sum)) { tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[3] += sum; } else { if (sum == 0x00000100) { tempOccValue1[0] += 256; } else if (sum == 0x00010000) { tempOccValue1[1] += 256; } else if (sum == 0x01000000) { tempOccValue1[2] += 256; } else { tempOccValue1[3] += 256; } } occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; tempOccValue0[0] = tempOccValue1[0]; tempOccValue0[1] = tempOccValue1[1]; tempOccValue0[2] = tempOccValue1[2]; tempOccValue0[3] = tempOccValue1[3]; sum = 0; occIndex++; for (j=0; j> 16]; sum += decodeTable[c & 0x0000FFFF]; bwtIndex++; } if (!DNA_OCC_SUM_EXCEPTION(sum)) { tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; tempOccValue0[3] += sum; } else { if (sum == 0x00000100) { tempOccValue0[0] += 256; } else if (sum == 0x00010000) { tempOccValue0[1] += 256; } else if (sum == 0x01000000) { tempOccValue0[2] += 256; } else { tempOccValue0[3] += 256; } } } occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0]; occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1]; occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2]; occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3]; tempOccValue0[0] = 0; tempOccValue0[1] = 0; tempOccValue0[2] = 0; tempOccValue0[3] = 0; } while (occIndex < (numberOfOccValue-1)/2) { sum = 0; tempOccValue1[0] = tempOccValue0[0]; tempOccValue1[1] = tempOccValue0[1]; tempOccValue1[2] = tempOccValue0[2]; tempOccValue1[3] = tempOccValue0[3]; for (j=0; j> 16]; sum += decodeTable[c & 0x0000FFFF]; bwtIndex++; } if (!DNA_OCC_SUM_EXCEPTION(sum)) { tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[3] += sum; } else { if (sum == 0x00000100) { tempOccValue1[0] += 256; } else if (sum == 0x00010000) { tempOccValue1[1] += 256; } else if (sum == 0x01000000) { tempOccValue1[2] += 256; } else { tempOccValue1[3] += 256; } } occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; tempOccValue0[0] = tempOccValue1[0]; tempOccValue0[1] = tempOccValue1[1]; tempOccValue0[2] = tempOccValue1[2]; tempOccValue0[3] = tempOccValue1[3]; sum = 0; occIndex++; for (j=0; j> 16]; sum += decodeTable[c & 0x0000FFFF]; bwtIndex++; } if (!DNA_OCC_SUM_EXCEPTION(sum)) { tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; tempOccValue0[3] += sum; } else { if (sum == 0x00000100) { tempOccValue0[0] += 256; } else if (sum == 0x00010000) { tempOccValue0[1] += 256; } else if (sum == 0x01000000) { tempOccValue0[2] += 256; } else { tempOccValue0[3] += 256; } } } sum = 0; tempOccValue1[0] = tempOccValue0[0]; tempOccValue1[1] = tempOccValue0[1]; tempOccValue1[2] = tempOccValue0[2]; tempOccValue1[3] = tempOccValue0[3]; if (occIndex * 2 < numberOfOccValue - 1) { for (j=0; j> 16]; sum += decodeTable[c & 0x0000FFFF]; bwtIndex++; } if (!DNA_OCC_SUM_EXCEPTION(sum)) { tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[3] += sum; } else { if (sum == 0x00000100) { tempOccValue1[0] += 256; } else if (sum == 0x00010000) { tempOccValue1[1] += 256; } else if (sum == 0x01000000) { tempOccValue1[2] += 256; } else { tempOccValue1[3] += 256; } } } occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; } static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) { unsigned int i; unsigned int mergedBwtSizeInWord, mergedOccSizeInWord; unsigned int firstCharInThisIteration; unsigned int *relativeRank, *seq, *sortedRank, *insertBwt, *mergedBwt; unsigned int newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0; #ifdef DEBUG if (numChar > bwtInc->buildSize) { fprintf(stderr, "BWTIncConstruct(): numChar > buildSize!\n"); exit(1); } #endif mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar); mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar); initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); if (bwtInc->bwt->textLength == 0) { // Initial build // Set address seq = bwtInc->workingMemory; relativeRank = seq + bwtInc->buildSize + 1; mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar); firstCharInThisIteration = relativeRank[0]; relativeRank[numChar] = 0; // Sort suffix QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)ALPHABET_SIZE - 1, 0, FALSE); newInverseSa0 = relativeRank[0]; // Clear BWT area initializeVAL(insertBwt, mergedBwtSizeInWord, 0); // Build BWT BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift); // so that the cumulativeCount is not deducted bwtInc->firstCharInLastIteration = ALPHABET_SIZE; } else { // Incremental build // Set address sortedRank = bwtInc->workingMemory; seq = sortedRank + bwtInc->buildSize + 1; insertBwt = seq; relativeRank = seq + bwtInc->buildSize + 1; // Store the first character of this iteration firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR); // Count occurrence of input text ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable); // Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++; bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1]; bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2]; bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3]; // Get rank of new suffix among processed suffix // The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration); // Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group) for (i=0; icumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank || bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) { BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]); } else { if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) { BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]); } if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) { BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1); } } } // build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt // the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups) BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild); #ifdef DEBUG if (relativeRank[numChar] != oldInverseSa0RelativeRank) { fprintf(stderr, "BWTIncConstruct(): relativeRank[numChar] != oldInverseSa0RelativeRank!\n"); exit(1); } #endif // Sort suffix QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)numChar, 1, TRUE); newInverseSa0RelativeRank = relativeRank[0]; newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank; sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt // Build BWT BWTIncBuildBwt(seq, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild); // Merge BWT mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR; // minus numberOfIteration * occInterval to create a buffer for merging BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar); } // Build auxiliary structure and update info and pointers in BWT bwtInc->bwt->textLength += numChar; bwtInc->bwt->bwtCode = mergedBwt; bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord; bwtInc->bwt->occSizeInWord = mergedOccSizeInWord; if (mergedBwt < bwtInc->workingMemory + mergedOccSizeInWord) { fprintf(stderr, "BWTIncConstruct() : Not enough memory allocated!\n"); exit(1); } bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord; BWTClearTrailingBwtCode(bwtInc->bwt); BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor, bwtInc->bwt->textLength, bwtInc->bwt->decodeTable); bwtInc->bwt->inverseSa0 = newInverseSa0; bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0); bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1); bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2); bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3); bwtInc->firstCharInLastIteration = firstCharInThisIteration; // Set build size and text address for the next build BWTIncSetBuildSizeAndTextAddr(bwtInc); bwtInc->numberOfIterationDone++; } BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetNBit, const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize) { FILE *packedFile; unsigned int packedFileLen; unsigned int totalTextLength; unsigned int textToLoad, textSizeInByte; unsigned int processedTextLength; unsigned char lastByteLength; BWTInc *bwtInc; packedFile = (FILE*)fopen(inputFileName, "rb"); if (packedFile == NULL) { fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open inputFileName!\n"); exit(1); } fseek(packedFile, -1, SEEK_END); packedFileLen = ftell(packedFile); if ((int)packedFileLen < 0) { fprintf(stderr, "BWTIncConstructFromPacked: Cannot determine file length!\n"); exit(1); } fread(&lastByteLength, sizeof(unsigned char), 1, packedFile); totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); bwtInc = BWTIncCreate(totalTextLength, targetNBit, initialMaxBuildSize, incMaxBuildSize); BWTIncSetBuildSizeAndTextAddr(bwtInc); if (bwtInc->buildSize > totalTextLength) { textToLoad = totalTextLength; } else { textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD); } textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte fseek(packedFile, -2, SEEK_CUR); fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile); fseek(packedFile, -((int)textSizeInByte + 1), SEEK_CUR); ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); BWTIncConstruct(bwtInc, textToLoad); processedTextLength = textToLoad; while (processedTextLength < totalTextLength) { textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; if (textToLoad > totalTextLength - processedTextLength) { textToLoad = totalTextLength - processedTextLength; } textSizeInByte = textToLoad / CHAR_PER_BYTE; fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile); fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); BWTIncConstruct(bwtInc, textToLoad); processedTextLength += textToLoad; if (bwtInc->numberOfIterationDone % 10 == 0) { printf("[BWTIncConstructFromPacked] %u iterations done. %u characters processed.\n", bwtInc->numberOfIterationDone, processedTextLength); } } return bwtInc; } void BWTFree(BWT *bwt) { if (bwt == 0) return; free(bwt->cumulativeFreq); free(bwt->bwtCode); free(bwt->occValue); free(bwt->occValueMajor); free(bwt->saValue); free(bwt->inverseSa); free(bwt->decodeTable); free(bwt->saIndexRange); free(bwt->saValueOnBoundary); free(bwt); } void BWTIncFree(BWTInc *bwtInc) { if (bwtInc == 0) return; free(bwtInc->bwt); free(bwtInc->workingMemory); free(bwtInc); } static unsigned int BWTFileSizeInWord(const unsigned int numChar) { // The $ in BWT at the position of inverseSa0 is not encoded return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD; } void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName) { FILE *bwtFile; /* FILE *occValueFile; */ unsigned int bwtLength; bwtFile = (FILE*)fopen(bwtFileName, "wb"); if (bwtFile == NULL) { fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open BWT code file!\n"); exit(1); } fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, bwtFile); fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, bwtFile); bwtLength = BWTFileSizeInWord(bwt->textLength); fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile); fclose(bwtFile); /* occValueFile = (FILE*)fopen(occValueFileName, "wb"); if (occValueFile == NULL) { fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open occ value file!\n"); exit(1); } fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, occValueFile); fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, occValueFile); fwrite(bwt->occValue, sizeof(unsigned int), bwt->occSizeInWord, occValueFile); fwrite(bwt->occValueMajor, sizeof(unsigned int), bwt->occMajorSizeInWord, occValueFile); fclose(occValueFile); */ } void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) { BWTInc *bwtInc; bwtInc = BWTIncConstructFromPacked(fn_pac, 2.5, 10000000, 10000000); printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone); BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0); BWTIncFree(bwtInc); } int bwt_bwtgen_main(int argc, char *argv[]) { if (argc < 3) { fprintf(stderr, "Usage: bwtgen \n"); return 1; } bwt_bwtgen(argv[1], argv[2]); return 0; } #ifdef MAIN_BWT_GEN int main(int argc, char *argv[]) { return bwt_bwtgen_main(argc, argv); } #endif