/*

   BWTConstruct.c		BWT-Index Construction

   This module constructs BWT and auxiliary data structures.

   Copyright (C) 2004, Wong Chi Kwong.

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation; either version 2
   of the License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "bwt_gen.h"
#include "QSufSort.h"

static unsigned int TextLengthFromBytePacked(unsigned int bytePackedLength, unsigned int bitPerChar,
											 unsigned int lastByteLength)
{
	if (bytePackedLength > ALL_ONE_MASK / (BITS_IN_BYTE / bitPerChar)) {
		fprintf(stderr, "TextLengthFromBytePacked(): text length > 2^32!\n");
		exit(1);
	}
	return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength;
}

static void initializeVAL(unsigned int *startAddr, const unsigned int length, const unsigned int initValue)
{
	unsigned int i;
	for (i=0; i<length; i++) startAddr[i] = initValue;
}

static void GenerateDNAOccCountTable(unsigned int *dnaDecodeTable)
{
	unsigned int i, j, c, t;

	for (i=0; i<DNA_OCC_CNT_TABLE_SIZE_IN_WORD; i++) {
		dnaDecodeTable[i] = 0;
		c = i;
		for (j=0; j<8; j++) {
			t = c & 0x00000003;
			dnaDecodeTable[i] += 1 << (t * 8);
			c >>= 2;
		}
	}

}
// for BWTIncCreate()
static unsigned int BWTOccValueMajorSizeInWord(const unsigned int numChar)
{
	unsigned int numOfOccValue;
	unsigned int numOfOccIntervalPerMajor;
	numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
	numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
	return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE;
}
// for BWTIncCreate()
static unsigned int BWTOccValueMinorSizeInWord(const unsigned int numChar)
{
	unsigned int numOfOccValue;
	numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;		// Value at both end for bi-directional encoding
	return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE;
}
// for BWTIncCreate()
static unsigned int BWTResidentSizeInWord(const unsigned int numChar) {

	unsigned int numCharRoundUpToOccInterval;

	// The $ in BWT at the position of inverseSa0 is not encoded
	numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL;

	return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD;

}

static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc)
{
	unsigned int maxBuildSize;

	if (bwtInc->bwt->textLength == 0) {
		// initial build
		// Minus 2 because n+1 entries of seq and rank needed for n char
		maxBuildSize = (bwtInc->availableWord - 2 - OCC_INTERVAL / CHAR_PER_WORD)
							/ (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD;
		if (bwtInc->initialMaxBuildSize > 0) {
			bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize);
		} else {
			bwtInc->buildSize = maxBuildSize;
		}
	} else {
		// Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char
		// Minus numberOfIterationDone because bwt slightly shift to left in each iteration
		maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3
							 - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) 
							 / 3;
		if (maxBuildSize < CHAR_PER_WORD) {
			fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
			exit(1);
		}
		if (bwtInc->incMaxBuildSize > 0) {
            bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize);
		} else {
			bwtInc->buildSize = maxBuildSize;
		}
		if (bwtInc->buildSize < CHAR_PER_WORD) {
			bwtInc->buildSize = CHAR_PER_WORD;
		}
	}

	if (bwtInc->buildSize < CHAR_PER_WORD) {
		fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
		exit(1);
	}

	bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;

	bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1);
	bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + bwtInc->buildSize + 1);

}

// for ceilLog2()
unsigned int leadingZero(const unsigned int input)
{
	unsigned int l;
	const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
											 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
											 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
											 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
											 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
											 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
											 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
											 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

	if (input & 0xFFFF0000) {
		if (input & 0xFF000000) {
			l = leadingZero8bit[input >> 24];
		} else {
			l = 8 + leadingZero8bit[input >> 16];
		}
	} else {
		if (input & 0x0000FF00) {
			l = 16 + leadingZero8bit[input >> 8];
		} else {
			l = 24 + leadingZero8bit[input];
		}
	}
	return l;

}
// for BitPerBytePackedChar()
static unsigned int ceilLog2(const unsigned int input)
{
	if (input <= 1) return 0;
	return BITS_IN_WORD - leadingZero(input - 1);

}
// for ConvertBytePackedToWordPacked()
static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize)
{
	unsigned int bitPerChar;
	bitPerChar = ceilLog2(alphabetSize);
	// Return the largest number of bit that does not affect packing efficiency
	if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar)
		bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar);
	return bitPerChar;
}
// for ConvertBytePackedToWordPacked()
static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize)
{
	return ceilLog2(alphabetSize);
}

static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize,
										  const unsigned int textLength)
{
	unsigned int i, j, k;
	unsigned int c;
	unsigned int bitPerBytePackedChar;
	unsigned int bitPerWordPackedChar;
	unsigned int charPerWord;
	unsigned int charPerByte;
	unsigned int bytePerIteration;
	unsigned int byteProcessed = 0;
	unsigned int wordProcessed = 0;
	unsigned int mask, shift;
	
	unsigned int buffer[BITS_IN_WORD];

	bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize);
	bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize);
	charPerByte = BITS_IN_BYTE / bitPerBytePackedChar;
	charPerWord = BITS_IN_WORD / bitPerWordPackedChar;

	bytePerIteration = charPerWord / charPerByte;
	mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar);
	shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar;

	while ((wordProcessed + 1) * charPerWord < textLength) {

		k = 0;
		for (i=0; i<bytePerIteration; i++) {
			c = (unsigned int)input[byteProcessed] << shift;
			for (j=0; j<charPerByte; j++) {
				buffer[k] = c & mask;
				c <<= bitPerBytePackedChar;
				k++;
			}
			byteProcessed++;
		}

		c = 0;
		for (i=0; i<charPerWord; i++) {
			c |= buffer[i] >> bitPerWordPackedChar * i;
		}
		output[wordProcessed] = c;
		wordProcessed++;

	}

	k = 0;
	for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) {
		c = (unsigned int)input[byteProcessed] << shift;
		for (j=0; j<charPerByte; j++) {
			buffer[k] = c & mask;
			c <<= bitPerBytePackedChar;
			k++;
		}
		byteProcessed++;
	}

	c = 0;
	for (i=0; i<textLength - wordProcessed * charPerWord; i++) {
		c |= buffer[i] >> bitPerWordPackedChar * i;
	}
	output[wordProcessed] = c;
}

BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable)
{
	BWT *bwt;

	bwt = (BWT*)calloc(1, sizeof(BWT));

	bwt->textLength = 0;
	bwt->inverseSa = 0;

	bwt->cumulativeFreq = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int*));
	initializeVAL(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0);

	bwt->bwtSizeInWord = 0;
	bwt->saValueOnBoundary = NULL;

	// Generate decode tables
	if (decodeTable == NULL) {
		bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int));
		GenerateDNAOccCountTable(bwt->decodeTable);
	} else {
		bwt->decodeTable = decodeTable;
	}

	bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength);
	bwt->occValueMajor = (unsigned*)calloc(bwt->occMajorSizeInWord, sizeof(unsigned int));

	bwt->occSizeInWord = 0;
	bwt->occValue = NULL;

	bwt->saInterval = ALL_ONE_MASK;
	bwt->saValueSize = 0;
	bwt->saValue = NULL;

	bwt->inverseSaInterval = ALL_ONE_MASK;
	bwt->inverseSaSize = 0;
	bwt->inverseSa = NULL;

	return bwt;
}

BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit,
					 const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize)
{
	BWTInc *bwtInc;
	unsigned int i;

	if (targetNBit == 0) {
		fprintf(stderr, "BWTIncCreate() : targetNBit = 0!\n");
		exit(1);
	}
	
	bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc));
	bwtInc->numberOfIterationDone = 0;
	bwtInc->bwt = BWTCreate(textLength, NULL);
	bwtInc->initialMaxBuildSize = initialMaxBuildSize;
	bwtInc->incMaxBuildSize = incMaxBuildSize;
	bwtInc->targetNBit = targetNBit;
	bwtInc->cumulativeCountInCurrentBuild = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int));
	initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);

	// Build frequently accessed data
	bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int));
	for (i=0; i<CHAR_PER_WORD; i++) {
		bwtInc->packedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR;
	}

	bwtInc->targetTextLength = textLength;
	bwtInc->availableWord = (unsigned int)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit);
	if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) {
		fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n");
		exit(1);
	}
	bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD);

	return bwtInc;

}
// for BWTIncConstruct()
static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned int* __restrict rank,
									  unsigned int* __restrict cumulativeCount, const unsigned int numChar)
{
	unsigned int i, j;
	unsigned int c, t;
	unsigned int packedMask;
	unsigned int rankIndex;
	unsigned int lastWord, numCharInLastWord;

	lastWord = (numChar - 1) / CHAR_PER_WORD;
	numCharInLastWord = numChar - lastWord * CHAR_PER_WORD;

	packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR);
	rankIndex = numChar - 1;

	t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR);
	for (i=0; i<numCharInLastWord; i++) {
		c = t & packedMask;
		cumulativeCount[c+1]++;
		rank[rankIndex] = c;
		rankIndex--;
		t >>= BIT_PER_CHAR;
	}

	for (i=lastWord; i--;) {	// loop from lastWord - 1 to 0
		t = packedText[i];
		for (j=0; j<CHAR_PER_WORD; j++) {
			c = t & packedMask;
			cumulativeCount[c+1]++;
			rank[rankIndex] = c;
			rankIndex--;
			t >>= BIT_PER_CHAR;
		}
	}

	// Convert occurrence to cumulativeCount
	cumulativeCount[2] += cumulativeCount[1];
	cumulativeCount[3] += cumulativeCount[2];
	cumulativeCount[4] += cumulativeCount[3];
}


static void ForwardDNAAllOccCountNoLimit(const unsigned int*  dna, const unsigned int index,
										 unsigned int* __restrict occCount, const unsigned int*  dnaDecodeTable)
{
	static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
											   0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
											   0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
											   0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };

	unsigned int iteration, wordToCount, charToCount;
	unsigned int i, j, c;
	unsigned int sum;

	occCount[0] = 0;
	occCount[1] = 0;
	occCount[2] = 0;
	occCount[3] = 0;

	iteration = index / 256;
	wordToCount = (index - iteration * 256) / 16;
	charToCount = index - iteration * 256 - wordToCount * 16;

	for (i=0; i<iteration; i++) {

		sum = 0;
		for (j=0; j<16; j++) {
			sum += dnaDecodeTable[*dna >> 16];
			sum += dnaDecodeTable[*dna & 0x0000FFFF];
			dna++;
		}
		if (!DNA_OCC_SUM_EXCEPTION(sum)) {
			occCount[0] += sum & 0x000000FF;	sum >>= 8;
			occCount[1] += sum & 0x000000FF;	sum >>= 8;
			occCount[2] += sum & 0x000000FF;	sum >>= 8;
			occCount[3] += sum;
		} else {
			// only some or all of the 3 bits are on
			// in reality, only one of the four cases are possible
			if (sum == 0x00000100) {
				occCount[0] += 256;
			} else if (sum == 0x00010000) {
				occCount[1] += 256;
			} else if (sum == 0x01000000) {
				occCount[2] += 256;
			} else if (sum == 0x00000000) {
				occCount[3] += 256;
			} else {
				fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n");
				exit(1);
			}
		}

	}

	sum = 0;
	for (j=0; j<wordToCount; j++) {
		sum += dnaDecodeTable[*dna >> 16];
		sum += dnaDecodeTable[*dna & 0x0000FFFF];
		dna++;
	}

	if (charToCount > 0) {
		c = *dna & truncateRightMask[charToCount];	// increase count of 'a' by 16 - c;
		sum += dnaDecodeTable[c >> 16];
		sum += dnaDecodeTable[c & 0xFFFF];
		sum += charToCount - 16;	// decrease count of 'a' by 16 - positionToProcess
	}

	occCount[0] += sum & 0x000000FF;	sum >>= 8;
	occCount[1] += sum & 0x000000FF;	sum >>= 8;
	occCount[2] += sum & 0x000000FF;	sum >>= 8;
	occCount[3] += sum;
}

static void BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int* __restrict bwt, const unsigned int numChar,
								 const unsigned int *cumulativeCount, const unsigned int *packedShift) {

	unsigned int i, c, r;
	unsigned int previousRank, currentRank;
	unsigned int wordIndex, charIndex;
	unsigned int inverseSa0;

	inverseSa0 = previousRank = relativeRank[0];

	for (i=1; i<=numChar; i++) {
		currentRank = relativeRank[i];
		// previousRank > cumulativeCount[c] because $ is one of the char
		c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2]) 
											    + (previousRank > cumulativeCount[3]);
		// set bwt for currentRank
		if (c > 0) {
			// c <> 'a'
			r = currentRank;
			if (r > inverseSa0) {
				// - 1 because $ at inverseSa0 is not encoded			
				r--;
			}
			wordIndex = r / CHAR_PER_WORD;
			charIndex = r - wordIndex * CHAR_PER_WORD;
			bwt[wordIndex] |= c << packedShift[charIndex];
		}
		previousRank = currentRank;
	}
}

static inline unsigned int BWTOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit,
											   const unsigned int character)
{
	unsigned int occIndexMajor;

	occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR;

	if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) {
		return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
			   (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16);

	} else {
		return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
			   (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF);
	}
}


static unsigned int ForwardDNAOccCount(const unsigned int*  dna, const unsigned int index, const unsigned int character,
									   const unsigned int*  dnaDecodeTable)
{
	static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
											   0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
											   0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
											   0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };

	unsigned int wordToCount, charToCount;
	unsigned int i, c;
	unsigned int sum = 0;

	wordToCount = index / 16;
	charToCount = index - wordToCount * 16;

	for (i=0; i<wordToCount; i++) {
		sum += dnaDecodeTable[dna[i] >> 16];
		sum += dnaDecodeTable[dna[i] & 0x0000FFFF];
	}

	if (charToCount > 0) {
		c = dna[i] & truncateRightMask[charToCount];	// increase count of 'a' by 16 - c;
		sum += dnaDecodeTable[c >> 16];
		sum += dnaDecodeTable[c & 0xFFFF];
		sum += charToCount - 16;	// decrease count of 'a' by 16 - positionToProcess
	}

	return (sum >> (character * 8)) & 0x000000FF;

}

static unsigned int BackwardDNAOccCount(const unsigned int*  dna, const unsigned int index, const unsigned int character,
										const unsigned int*  dnaDecodeTable)
{
	static const unsigned int truncateLeftMask[16] =  { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F,
											   0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF,
											   0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF,
											   0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF };

	unsigned int wordToCount, charToCount;
	unsigned int i, c;
	unsigned int sum = 0;

	wordToCount = index / 16;
	charToCount = index - wordToCount * 16;

	dna -= wordToCount + 1;

	if (charToCount > 0) {
		c = *dna & truncateLeftMask[charToCount];	// increase count of 'a' by 16 - c;
		sum += dnaDecodeTable[c >> 16];
		sum += dnaDecodeTable[c & 0xFFFF];
		sum += charToCount - 16;	// decrease count of 'a' by 16 - positionToProcess
	}
	
	for (i=0; i<wordToCount; i++) {
		dna++;
		sum += dnaDecodeTable[*dna >> 16];
		sum += dnaDecodeTable[*dna & 0x0000FFFF];
	}

	return (sum >> (character * 8)) & 0x000000FF;

}

unsigned int BWTOccValue(const BWT *bwt, unsigned int index, const unsigned int character) {

	unsigned int occValue;
	unsigned int occExplicitIndex, occIndex;

	// $ is supposed to be positioned at inverseSa0 but it is not encoded
	// therefore index is subtracted by 1 for adjustment
	if (index > bwt->inverseSa0) {
		index--;
	}

	occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL;	// Bidirectional encoding
	occIndex = occExplicitIndex * OCC_INTERVAL;
	occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character);

	if (occIndex == index) {
		return occValue;
	}

	if (occIndex < index) {
		return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable);
	} else {
		return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable);
	}

}

static unsigned int BWTIncGetAbsoluteRank(BWT *bwt, unsigned int* __restrict absoluteRank, unsigned int* __restrict seq,
										  const unsigned int *packedText, const unsigned int numChar,
										  const unsigned int* cumulativeCount, const unsigned int firstCharInLastIteration)
{
	unsigned int saIndex;
	unsigned int lastWord;
	unsigned int packedMask;
	unsigned int i, j;
	unsigned int c, t;
	unsigned int rankIndex;
	unsigned int shift;
	unsigned int seqIndexFromStart[ALPHABET_SIZE];
	unsigned int seqIndexFromEnd[ALPHABET_SIZE];

	for (i=0; i<ALPHABET_SIZE; i++) {
		seqIndexFromStart[i] = cumulativeCount[i];
		seqIndexFromEnd[i] = cumulativeCount[i+1] - 1;
	}

	shift = BITS_IN_WORD - BIT_PER_CHAR;
	packedMask = ALL_ONE_MASK >> shift;
	saIndex = bwt->inverseSa0;
	rankIndex = numChar - 1;

	lastWord = numChar / CHAR_PER_WORD;
	for (i=lastWord; i--;) {	// loop from lastWord - 1 to 0
		t = packedText[i];
		for (j=0; j<CHAR_PER_WORD; j++) {
			c = t & packedMask;
			saIndex = bwt->cumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1;
			// A counting sort using the first character of suffix is done here
			// If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0
			if (saIndex > bwt->inverseSa0) {
				seq[seqIndexFromEnd[c]] = rankIndex;
				absoluteRank[seqIndexFromEnd[c]] = saIndex;
				seqIndexFromEnd[c]--;
			} else {
				seq[seqIndexFromStart[c]] = rankIndex;
				absoluteRank[seqIndexFromStart[c]] = saIndex;
				seqIndexFromStart[c]++;
			}
			rankIndex--;
			t >>= BIT_PER_CHAR;
		}
	}

	absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0;	// representing the substring of all preceding characters
	seq[seqIndexFromStart[firstCharInLastIteration]] = numChar;

	return seqIndexFromStart[firstCharInLastIteration];
}

static void BWTIncSortKey(unsigned int* __restrict key, unsigned int* __restrict seq, const unsigned int numItem)
{
	#define EQUAL_KEY_THRESHOLD	4	// Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD

	int lowIndex, highIndex, midIndex;
	int lowPartitionIndex, highPartitionIndex;
	int lowStack[32], highStack[32];
	int stackDepth;
	int i, j;
	unsigned int tempSeq, tempKey;
	int numberOfEqualKey;

	if (numItem < 2) return;

	stackDepth = 0;

    lowIndex = 0;
    highIndex = numItem - 1;

	for (;;) {

		for (;;) {

			// Sort small array of data
			if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) {	 // Insertion sort on smallest arrays
				for (i=lowIndex+1; i<=highIndex; i++) {
					tempSeq = seq[i];
					tempKey = key[i];
					for (j = i; j > lowIndex && key[j-1] > tempKey; j--) {
						seq[j] = seq[j-1];
						key[j] = key[j-1];
					}
					if (j != i) {
						seq[j] = tempSeq;
						key[j] = tempKey;
					}
				}
				break;
			}

			// Choose pivot as median of the lowest, middle, and highest data; sort the three data

			midIndex = average(lowIndex, highIndex);
			if (key[lowIndex] > key[midIndex]) {
				tempSeq = seq[lowIndex];
				tempKey = key[lowIndex];
				seq[lowIndex] = seq[midIndex];
				key[lowIndex] = key[midIndex];
				seq[midIndex] = tempSeq;
				key[midIndex] = tempKey;
			}
			if (key[lowIndex] > key[highIndex]) {
				tempSeq = seq[lowIndex];
				tempKey = key[lowIndex];
				seq[lowIndex] = seq[highIndex];
				key[lowIndex] = key[highIndex];
				seq[highIndex] = tempSeq;
				key[highIndex] = tempKey;
			}
			if (key[midIndex] > key[highIndex]) {
				tempSeq = seq[midIndex];
				tempKey = key[midIndex];
				seq[midIndex] = seq[highIndex];
				key[midIndex] = key[highIndex];
				seq[highIndex] = tempSeq;
				key[highIndex] = tempKey;
			}

			// Partition data

			numberOfEqualKey = 0;

			lowPartitionIndex = lowIndex + 1;
			highPartitionIndex = highIndex - 1;

			for (;;) {
				while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= key[midIndex]) {
					numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]);
					lowPartitionIndex++;
				}
				while (lowPartitionIndex < highPartitionIndex) {
					if (key[midIndex] >= key[highPartitionIndex]) {
						numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]);
						break;
					}
					highPartitionIndex--;
				}
				if (lowPartitionIndex >= highPartitionIndex) {
					break;
				}
				tempSeq = seq[lowPartitionIndex];
				tempKey = key[lowPartitionIndex];
				seq[lowPartitionIndex] = seq[highPartitionIndex];
				key[lowPartitionIndex] = key[highPartitionIndex];
				seq[highPartitionIndex] = tempSeq;
				key[highPartitionIndex] = tempKey;
				if (highPartitionIndex == midIndex) {
					// partition key has been moved
					midIndex = lowPartitionIndex;
				}
				lowPartitionIndex++;
				highPartitionIndex--;
			}

			// Adjust the partition index
			highPartitionIndex = lowPartitionIndex;
			lowPartitionIndex--;

			// move the partition key to end of low partition
			tempSeq = seq[midIndex];
			tempKey = key[midIndex];
			seq[midIndex] = seq[lowPartitionIndex];
			key[midIndex] = key[lowPartitionIndex];
			seq[lowPartitionIndex] = tempSeq;
			key[lowPartitionIndex] = tempKey;

			if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) {

				// Many keys = partition key; separate the equal key data from the lower partition
		
				midIndex = lowIndex;

				for (;;) {
					while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) {
						midIndex++;
					}
					while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) {
						lowPartitionIndex--;
					}
					if (midIndex >= lowPartitionIndex) {
						break;
					}
					tempSeq = seq[midIndex];
					tempKey = key[midIndex];
					seq[midIndex] = seq[lowPartitionIndex - 1];
					key[midIndex] = key[lowPartitionIndex - 1];
					seq[lowPartitionIndex - 1] = tempSeq;
					key[lowPartitionIndex - 1] = tempKey;
					midIndex++;
					lowPartitionIndex--;
				}

			}

			if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) {
				// put the larger partition to stack
				lowStack[stackDepth] = lowIndex;
				highStack[stackDepth] = lowPartitionIndex - 1;
				stackDepth++;
				// sort the smaller partition first
				lowIndex = highPartitionIndex;
			} else {
				// put the larger partition to stack
				lowStack[stackDepth] = highPartitionIndex;
				highStack[stackDepth] = highIndex;
				stackDepth++;
				// sort the smaller partition first
				if (lowPartitionIndex > lowIndex) {
					highIndex = lowPartitionIndex - 1;
				} else {
					// all keys in the partition equals to the partition key
					break;
				}
			}
			continue;
		}

		// Pop a range from stack
		if (stackDepth > 0) {
			stackDepth--;
			lowIndex = lowStack[stackDepth];
			highIndex = highStack[stackDepth];
			continue;
		} else return;
	}
}


static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigned int* __restrict seq,
									unsigned int* __restrict relativeRank, const unsigned int numItem,
									unsigned int oldInverseSa0, const unsigned int *cumulativeCount)
{
	unsigned int i, c;
	unsigned int s, r;
	unsigned int lastRank, lastIndex;
	unsigned int oldInverseSa0RelativeRank = 0;
	unsigned int freq;

	lastIndex = numItem;
	lastRank = sortedRank[numItem];
	if (lastRank > oldInverseSa0) {
		sortedRank[numItem]--;	// to prepare for merging; $ is not encoded in bwt
	}
	s = seq[numItem];
	relativeRank[s] = numItem;
	if (lastRank == oldInverseSa0) {
		oldInverseSa0RelativeRank = numItem;
		oldInverseSa0++;	// so that this segment of code is not run again
		lastRank++;			// so that oldInverseSa0 become a sorted group with 1 item
	}

	c = ALPHABET_SIZE - 1;
	freq = cumulativeCount[c];

	for (i=numItem; i--;) {	// from numItem - 1 to 0
		r = sortedRank[i];
		if (r > oldInverseSa0) {
			sortedRank[i]--;	// to prepare for merging; $ is not encoded in bwt
		}
		s = seq[i];
		if (i < freq) {
			if (lastIndex >= freq) {
				lastRank++;	// to trigger the group across alphabet boundary to be split
			}
			c--;
			freq = cumulativeCount[c];
		}
		if (r == lastRank) {
			relativeRank[s] = lastIndex;
		} else {
			if (i == lastIndex - 1) {
				if (lastIndex < numItem && (int)seq[lastIndex + 1] < 0) {
					seq[lastIndex] = seq[lastIndex + 1] - 1;
				} else {
					seq[lastIndex] = (unsigned int)-1;
				}
			}
			lastIndex = i;
			lastRank = r;
			relativeRank[s] = i;
			if (r == oldInverseSa0) {
				oldInverseSa0RelativeRank = i;
				oldInverseSa0++;	// so that this segment of code is not run again
				lastRank++;			// so that oldInverseSa0 become a sorted group with 1 item
			}
		}
	}

}

static void BWTIncBuildBwt(unsigned int*  seq, const unsigned int *relativeRank, const unsigned int numChar,
						   const unsigned int *cumulativeCount)
{
	unsigned int i, c;
	unsigned int previousRank, currentRank;

	previousRank = relativeRank[0];

	for (i=1; i<=numChar; i++) {
		currentRank = relativeRank[i];
		c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2])
											  	 + (previousRank >= cumulativeCount[3]);
		seq[currentRank] = c;
		previousRank = currentRank;
	}
}

static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt,
						   unsigned int* __restrict mergedBwt, const unsigned int numOldBwt, const unsigned int numInsertBwt)
{
	unsigned int bitsInWordMinusBitPerChar;
	unsigned int leftShift, rightShift;
	unsigned int o;
	unsigned int oIndex, iIndex, mIndex;
	unsigned int mWord, mChar, oWord, oChar;
	unsigned int numInsert;

	bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR;

	oIndex = 0;
	iIndex = 0;
	mIndex = 0;

	mWord = 0;
	mChar = 0;

	mergedBwt[0] = 0;	// this can be cleared as merged Bwt slightly shift to the left in each iteration

	while (oIndex < numOldBwt) {

		// copy from insertBwt
		while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) {
			if (sortedRank[iIndex] != 0) {	// special value to indicate that this is for new inverseSa0
				mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
				mIndex++;
				mChar++;
				if (mChar == CHAR_PER_WORD) {
					mChar = 0;
					mWord++;
					mergedBwt[mWord] = 0;	// no need to worry about crossing mergedBwt boundary
				}
			}
			iIndex++;
		}

		// Copy from oldBwt to mergedBwt
		if (iIndex <= numInsertBwt) {
			o = sortedRank[iIndex];
		} else {
			o = numOldBwt;
		}
		numInsert = o - oIndex;

		oWord = oIndex / CHAR_PER_WORD;
		oChar = oIndex - oWord * CHAR_PER_WORD;
		if (oChar > mChar) {
			leftShift = (oChar - mChar) * BIT_PER_CHAR;
			rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR;
			mergedBwt[mWord] = mergedBwt[mWord]
								| (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR))
								| (oldBwt[oWord+1] >> rightShift);
			oIndex += min(numInsert, CHAR_PER_WORD - mChar);
			while (o > oIndex) {
				oWord++;
				mWord++;
				mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift);
				oIndex += CHAR_PER_WORD;
			}
		} else if (oChar < mChar) {
			rightShift = (mChar - oChar) * BIT_PER_CHAR;
			leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR;
			mergedBwt[mWord] = mergedBwt[mWord] 
								| (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR));
			oIndex += min(numInsert, CHAR_PER_WORD - mChar);
			while (o > oIndex) {
				oWord++;
				mWord++;
				mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift);
				oIndex += CHAR_PER_WORD;
			}
		} else { // oChar == mChar
			mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR);
			oIndex += min(numInsert, CHAR_PER_WORD - mChar);
			while (o > oIndex) {
				oWord++;
				mWord++;
				mergedBwt[mWord] = oldBwt[oWord];
				oIndex += CHAR_PER_WORD;
			}
		}
		oIndex = o;
		mIndex += numInsert;

		// Clear the trailing garbage in mergedBwt
		mWord = mIndex / CHAR_PER_WORD;
		mChar = mIndex - mWord * CHAR_PER_WORD;
		if (mChar == 0) {
			mergedBwt[mWord] = 0;
		} else {
			mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR));
		}

	}

	// copy from insertBwt
	while (iIndex <= numInsertBwt) {
		if (sortedRank[iIndex] != 0) {
			mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
			mIndex++;
			mChar++;
			if (mChar == CHAR_PER_WORD) {
				mChar = 0;
				mWord++;
				mergedBwt[mWord] = 0;	// no need to worry about crossing mergedBwt boundary
			}
		}
		iIndex++;
	}
}

void BWTClearTrailingBwtCode(BWT *bwt)
{
	unsigned int bwtResidentSizeInWord;
	unsigned int wordIndex, offset;
	unsigned int i;

	bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength);

	wordIndex = bwt->textLength / CHAR_PER_WORD;
	offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR;
	if (offset > 0) {
		bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset);
	} else {
		if (wordIndex < bwtResidentSizeInWord) {
			bwt->bwtCode[wordIndex] = 0;
		}
	}

	for (i=wordIndex+1; i<bwtResidentSizeInWord; i++) {
		bwt->bwtCode[i] = 0;
	}
}


void BWTGenerateOccValueFromBwt(const unsigned int*  bwt, unsigned int* __restrict occValue,
								unsigned int* __restrict occValueMajor,
								const unsigned int textLength, const unsigned int*  decodeTable)
{
	unsigned int numberOfOccValueMajor, numberOfOccValue;
	unsigned int wordBetweenOccValue;
	unsigned int numberOfOccIntervalPerMajor;
	unsigned int c;
	unsigned int i, j;
	unsigned int occMajorIndex;
	unsigned int occIndex, bwtIndex;
	unsigned int sum;
	unsigned int tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE];

	wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD;

	// Calculate occValue
	// [lh3] by default: OCC_INTERVAL_MAJOR=65536, OCC_INTERVAL=256
	numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;				// Value at both end for bi-directional encoding
	numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
	numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor;

	tempOccValue0[0] = 0;
	tempOccValue0[1] = 0;
	tempOccValue0[2] = 0;
	tempOccValue0[3] = 0;
	occValueMajor[0] = 0;
	occValueMajor[1] = 0;
	occValueMajor[2] = 0;
	occValueMajor[3] = 0;

	occIndex = 0;
	bwtIndex = 0;
	for (occMajorIndex=1; occMajorIndex<numberOfOccValueMajor; occMajorIndex++) {

		for (i=0; i<numberOfOccIntervalPerMajor/2; i++) {

			sum = 0;
			tempOccValue1[0] = tempOccValue0[0];
			tempOccValue1[1] = tempOccValue0[1];
			tempOccValue1[2] = tempOccValue0[2];
			tempOccValue1[3] = tempOccValue0[3];

			for (j=0; j<wordBetweenOccValue; j++) {
				c = bwt[bwtIndex];
				sum += decodeTable[c >> 16];
				sum += decodeTable[c & 0x0000FFFF];
				bwtIndex++;
			}
			if (!DNA_OCC_SUM_EXCEPTION(sum)) {
				tempOccValue1[0] += (sum & 0x000000FF);	sum >>= 8;
				tempOccValue1[1] += (sum & 0x000000FF);	sum >>= 8;
				tempOccValue1[2] += (sum & 0x000000FF);	sum >>= 8;
				tempOccValue1[3] += sum;
			} else {
				if (sum == 0x00000100) {
					tempOccValue1[0] += 256;
				} else if (sum == 0x00010000) {
					tempOccValue1[1] += 256;
				} else if (sum == 0x01000000) {
					tempOccValue1[2] += 256;
				} else {
					tempOccValue1[3] += 256;
				}
			}
			occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
			occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
			occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
			occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
			tempOccValue0[0] = tempOccValue1[0];
			tempOccValue0[1] = tempOccValue1[1];
			tempOccValue0[2] = tempOccValue1[2];
			tempOccValue0[3] = tempOccValue1[3];
			sum = 0;

			occIndex++;

			for (j=0; j<wordBetweenOccValue; j++) {
				c = bwt[bwtIndex];
				sum += decodeTable[c >> 16];
				sum += decodeTable[c & 0x0000FFFF];
				bwtIndex++;
			}
			if (!DNA_OCC_SUM_EXCEPTION(sum)) {
				tempOccValue0[0] += (sum & 0x000000FF);	sum >>= 8;
				tempOccValue0[1] += (sum & 0x000000FF);	sum >>= 8;
				tempOccValue0[2] += (sum & 0x000000FF);	sum >>= 8;
				tempOccValue0[3] += sum;
			} else {
				if (sum == 0x00000100) {
					tempOccValue0[0] += 256;
				} else if (sum == 0x00010000) {
					tempOccValue0[1] += 256;
				} else if (sum == 0x01000000) {
					tempOccValue0[2] += 256;
				} else {
					tempOccValue0[3] += 256;
				}
			}
		}

		occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0];
		occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1];
		occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2];
		occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3];
		tempOccValue0[0] = 0;
		tempOccValue0[1] = 0;
		tempOccValue0[2] = 0;
		tempOccValue0[3] = 0;

	}

	while (occIndex < (numberOfOccValue-1)/2) {
		sum = 0;
		tempOccValue1[0] = tempOccValue0[0];
		tempOccValue1[1] = tempOccValue0[1];
		tempOccValue1[2] = tempOccValue0[2];
		tempOccValue1[3] = tempOccValue0[3];
		for (j=0; j<wordBetweenOccValue; j++) {
			c = bwt[bwtIndex];
			sum += decodeTable[c >> 16];
			sum += decodeTable[c & 0x0000FFFF];
			bwtIndex++;
		}
		if (!DNA_OCC_SUM_EXCEPTION(sum)) {
			tempOccValue1[0] += (sum & 0x000000FF);	sum >>= 8;
			tempOccValue1[1] += (sum & 0x000000FF);	sum >>= 8;
			tempOccValue1[2] += (sum & 0x000000FF);	sum >>= 8;
			tempOccValue1[3] += sum;
		} else {
			if (sum == 0x00000100) {
				tempOccValue1[0] += 256;
			} else if (sum == 0x00010000) {
				tempOccValue1[1] += 256;
			} else if (sum == 0x01000000) {
				tempOccValue1[2] += 256;
			} else {
				tempOccValue1[3] += 256;
			}
		}
		occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
		occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
		occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
		occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
		tempOccValue0[0] = tempOccValue1[0];
		tempOccValue0[1] = tempOccValue1[1];
		tempOccValue0[2] = tempOccValue1[2];
		tempOccValue0[3] = tempOccValue1[3];
		sum = 0;
		occIndex++;

		for (j=0; j<wordBetweenOccValue; j++) {
			c = bwt[bwtIndex];
			sum += decodeTable[c >> 16];
			sum += decodeTable[c & 0x0000FFFF];
			bwtIndex++;
		}
		if (!DNA_OCC_SUM_EXCEPTION(sum)) {
			tempOccValue0[0] += (sum & 0x000000FF);	sum >>= 8;
			tempOccValue0[1] += (sum & 0x000000FF);	sum >>= 8;
			tempOccValue0[2] += (sum & 0x000000FF);	sum >>= 8;
			tempOccValue0[3] += sum;
		} else {
			if (sum == 0x00000100) {
				tempOccValue0[0] += 256;
			} else if (sum == 0x00010000) {
				tempOccValue0[1] += 256;
			} else if (sum == 0x01000000) {
				tempOccValue0[2] += 256;
			} else {
				tempOccValue0[3] += 256;
			}
		}
	}

	sum = 0;
	tempOccValue1[0] = tempOccValue0[0];
	tempOccValue1[1] = tempOccValue0[1];
	tempOccValue1[2] = tempOccValue0[2];
	tempOccValue1[3] = tempOccValue0[3];

	if (occIndex * 2 < numberOfOccValue - 1) {
		for (j=0; j<wordBetweenOccValue; j++) {
			c = bwt[bwtIndex];
			sum += decodeTable[c >> 16];
			sum += decodeTable[c & 0x0000FFFF];
			bwtIndex++;
		}
		if (!DNA_OCC_SUM_EXCEPTION(sum)) {
			tempOccValue1[0] += (sum & 0x000000FF);	sum >>= 8;
			tempOccValue1[1] += (sum & 0x000000FF);	sum >>= 8;
			tempOccValue1[2] += (sum & 0x000000FF);	sum >>= 8;
			tempOccValue1[3] += sum;
		} else {
			if (sum == 0x00000100) {
				tempOccValue1[0] += 256;
			} else if (sum == 0x00010000) {
				tempOccValue1[1] += 256;
			} else if (sum == 0x01000000) {
				tempOccValue1[2] += 256;
			} else {
				tempOccValue1[3] += 256;
			}
		}
	}

	occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
	occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
	occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
	occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];

}

static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar)
{
	unsigned int i;
	unsigned int mergedBwtSizeInWord, mergedOccSizeInWord;
	unsigned int firstCharInThisIteration;

	unsigned int *relativeRank, *seq, *sortedRank, *insertBwt, *mergedBwt;
	unsigned int newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0;

	#ifdef DEBUG
	if (numChar > bwtInc->buildSize) {
		fprintf(stderr, "BWTIncConstruct(): numChar > buildSize!\n");
		exit(1);
	}
	#endif

	mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar);
	mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar);

	initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);

	if (bwtInc->bwt->textLength == 0) {		// Initial build

		// Set address
		seq = bwtInc->workingMemory;
		relativeRank = seq + bwtInc->buildSize + 1;
		mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord;	// build in place

		BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar);

		firstCharInThisIteration = relativeRank[0];
		relativeRank[numChar] = 0;

		// Sort suffix
		QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)ALPHABET_SIZE - 1, 0, FALSE);
		newInverseSa0 = relativeRank[0];

		// Clear BWT area
		initializeVAL(insertBwt, mergedBwtSizeInWord, 0);

		// Build BWT
		BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift);

		// so that the cumulativeCount is not deducted
		bwtInc->firstCharInLastIteration = ALPHABET_SIZE;

	} else {		// Incremental build
		// Set address
		sortedRank = bwtInc->workingMemory;
		seq = sortedRank + bwtInc->buildSize + 1;
		insertBwt = seq;
		relativeRank = seq + bwtInc->buildSize + 1;

		// Store the first character of this iteration
		firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR);

		// Count occurrence of input text
		ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable);
		// Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration
		bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++;
		bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1];
		bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2];
		bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3];

		// Get rank of new suffix among processed suffix
		// The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group
		oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText, 
														  numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration);

		// Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group)
		for (i=0; i<ALPHABET_SIZE; i++) {
			if (bwtInc->cumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank ||
				bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) {
				BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]);
			} else {
				if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) {
					BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]);
				}
				if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) {
					BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1);
				}
			}
		}

		// build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt
		// the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups)
		BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild);
#ifdef DEBUG
		if (relativeRank[numChar] != oldInverseSa0RelativeRank) {
			fprintf(stderr, "BWTIncConstruct(): relativeRank[numChar] != oldInverseSa0RelativeRank!\n");
			exit(1);
		}
#endif

		// Sort suffix
		QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)numChar, 1, TRUE);

		newInverseSa0RelativeRank = relativeRank[0];
		newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank;

		sortedRank[newInverseSa0RelativeRank] = 0;	// a special value so that this is skipped in the merged bwt

		// Build BWT
		BWTIncBuildBwt(seq, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild);

		// Merge BWT
		mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord 
				    - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR;
					// minus numberOfIteration * occInterval to create a buffer for merging
		BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar);

	}

	// Build auxiliary structure and update info and pointers in BWT
	bwtInc->bwt->textLength += numChar;
	bwtInc->bwt->bwtCode = mergedBwt;
	bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord;
	bwtInc->bwt->occSizeInWord = mergedOccSizeInWord;
	if (mergedBwt < bwtInc->workingMemory + mergedOccSizeInWord) {
		fprintf(stderr, "BWTIncConstruct() : Not enough memory allocated!\n");
		exit(1);
	}

	bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord;

	BWTClearTrailingBwtCode(bwtInc->bwt);
	BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor,
							   bwtInc->bwt->textLength, bwtInc->bwt->decodeTable);

	bwtInc->bwt->inverseSa0 = newInverseSa0;
	
	bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0);
	bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1);
	bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2);
	bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3);

	bwtInc->firstCharInLastIteration = firstCharInThisIteration;

	// Set build size and text address for the next build
	BWTIncSetBuildSizeAndTextAddr(bwtInc);
	bwtInc->numberOfIterationDone++;

}

BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetNBit,
								  const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize)
{

	FILE *packedFile;
	unsigned int packedFileLen;
	unsigned int totalTextLength;
	unsigned int textToLoad, textSizeInByte;
	unsigned int processedTextLength;
	unsigned char lastByteLength;

	BWTInc *bwtInc;

	packedFile = (FILE*)fopen(inputFileName, "rb");

	if (packedFile == NULL) {
		fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open inputFileName!\n");
		exit(1);
	}

	fseek(packedFile, -1, SEEK_END);
	packedFileLen = ftell(packedFile);
	if ((int)packedFileLen < 0) {
		fprintf(stderr, "BWTIncConstructFromPacked: Cannot determine file length!\n");
		exit(1);
	}
	fread(&lastByteLength, sizeof(unsigned char), 1, packedFile);
	totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength);

	bwtInc = BWTIncCreate(totalTextLength, targetNBit, initialMaxBuildSize, incMaxBuildSize);

	BWTIncSetBuildSizeAndTextAddr(bwtInc);

	if (bwtInc->buildSize > totalTextLength) {
		textToLoad = totalTextLength;
	} else {
		textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD);
	}
	textSizeInByte = textToLoad / CHAR_PER_BYTE;	// excluded the odd byte

	fseek(packedFile, -2, SEEK_CUR);
	fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
	fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile);
	fseek(packedFile, -((int)textSizeInByte + 1), SEEK_CUR);

	ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
	BWTIncConstruct(bwtInc, textToLoad);

	processedTextLength = textToLoad;

	while (processedTextLength < totalTextLength) {
		textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
		if (textToLoad > totalTextLength - processedTextLength) {
			textToLoad = totalTextLength - processedTextLength;
		}
		textSizeInByte = textToLoad / CHAR_PER_BYTE;
		fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
		fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile);
		fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
		ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
		BWTIncConstruct(bwtInc, textToLoad);
		processedTextLength += textToLoad;
		if (bwtInc->numberOfIterationDone % 10 == 0) {
			printf("[BWTIncConstructFromPacked] %u iterations done. %u characters processed.\n",
				   bwtInc->numberOfIterationDone, processedTextLength);
		}
	}
	return bwtInc;
}

void BWTFree(BWT *bwt)
{
	if (bwt == 0) return;
	free(bwt->cumulativeFreq);
	free(bwt->bwtCode);
	free(bwt->occValue);
	free(bwt->occValueMajor);
	free(bwt->saValue);
	free(bwt->inverseSa);
	free(bwt->decodeTable);
	free(bwt->saIndexRange);
	free(bwt->saValueOnBoundary);
	free(bwt);
}

void BWTIncFree(BWTInc *bwtInc)
{
	if (bwtInc == 0) return;
	free(bwtInc->bwt);
	free(bwtInc->workingMemory);
	free(bwtInc);
}

static unsigned int BWTFileSizeInWord(const unsigned int numChar)
{
	// The $ in BWT at the position of inverseSa0 is not encoded
	return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
}

void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName)
{
	FILE *bwtFile;
/*	FILE *occValueFile; */
	unsigned int bwtLength;

	bwtFile = (FILE*)fopen(bwtFileName, "wb");
	if (bwtFile == NULL) {
		fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open BWT code file!\n");
		exit(1);
	}

	fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, bwtFile);
	fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, bwtFile);
	bwtLength = BWTFileSizeInWord(bwt->textLength);
	fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile);
	fclose(bwtFile);
/*
	occValueFile = (FILE*)fopen(occValueFileName, "wb");
	if (occValueFile == NULL) {
		fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open occ value file!\n");
		exit(1);
	}

	fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, occValueFile);
	fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, occValueFile);
	fwrite(bwt->occValue, sizeof(unsigned int), bwt->occSizeInWord, occValueFile);
	fwrite(bwt->occValueMajor, sizeof(unsigned int), bwt->occMajorSizeInWord, occValueFile);
	fclose(occValueFile);
*/
}

void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
{
	BWTInc *bwtInc;
	bwtInc = BWTIncConstructFromPacked(fn_pac, 2.5, 10000000, 10000000);
	printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone);
	BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0);
	BWTIncFree(bwtInc);
}

int bwt_bwtgen_main(int argc, char *argv[])
{
	if (argc < 3) {
		fprintf(stderr, "Usage: bwtgen <in.pac> <out.bwt>\n");
		return 1;
	}
	bwt_bwtgen(argv[1], argv[2]);
	return 0;
}

#ifdef MAIN_BWT_GEN

int main(int argc, char *argv[])
{
	return bwt_bwtgen_main(argc, argv);
}

#endif