/*************************************************************************** * Copyright (C) 2006 by Veli Mäkinen and Niko Välimäki * * * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * ***************************************************************************/ #include "CSA.h" //////////////////////////////////////////////////////////////////////////// // Class CSA::THuffAlphabetRank CSA::THuffAlphabetRank::THuffAlphabetRank(alpha_t *s, trace_t n, TCodeEntry *codetable, unsigned level) { left = NULL; right = NULL; bitrank = NULL; ch = s[0]; leaf = false; this->codetable = codetable; bool *B = new bool[n]; trace_t sum=0, i; /* for (i=0; i 100) return; */ for (i=0; i 0) sfirst = new alpha_t[n-sum]; //if (sum > 0) ssecond = new alpha_t[sum]; unsigned j=0, k=0; for (i=0; i 0) { left = new THuffAlphabetRank(sfirst, j, codetable, level + 1); delete [] sfirst; //} //if (k > 0) { right = new THuffAlphabetRank(ssecond, k, codetable, level + 1); delete [] ssecond; //} } bool CSA::THuffAlphabetRank::Test(alpha_t *s, trace_t n) { // testing that the code works correctly int C[ALPHABETSIZE]; bool correct=true; for (salpha_t j=0; j%d\n",(unsigned)i,(int)s[i]-128,C[s[i]],(int)rank(s[i],i)); return false; } } return correct; } CSA::THuffAlphabetRank::~THuffAlphabetRank() { if (left!=NULL) delete left; if (right!=NULL) delete right; if (bitrank!=NULL) delete bitrank; } //////////////////////////////////////////////////////////////////////////// // Class CSA CSA::CSA(alpha_t *text, trace_t n, unsigned samplerate, const char *loadFromFile, const char *saveToFile) { this->n = n; this->samplerate = samplerate; //cout << "CSA:" << endl; //////////////////// // http://en.wikipedia.org/wiki/FM-index alpha_t *bwt; if (loadFromFile != 0) bwt = LoadFromFile(loadFromFile); else bwt = BWT(text); if (saveToFile != 0) SaveToFile(saveToFile, bwt); /* cout << "\n bwt:"; for (trace_t i=0; i 0) { min = i; break; } for (i=ALPHABETSIZE-1; i>=min; --i) if (C[i] > 0) { max = i; break; } */ trace_t prev = C[0], temp; C[0] = 0; for (i=1; icodetable = node::makecodetable(bwt, n); alphabetrank = new THuffAlphabetRank(bwt, n, this->codetable, 0); //if (alphabetrank->Test(bwt, n)) printf("alphabetrank ok\n"); delete [] bwt; // Make tables maketables(); // to avoid running out of unsigned, the sizes are computed in specific order (large/small)*small // |class CSA|+ALPHABETSIZE*|TCodeEntry|+|C[]|+|suffixes[]+positions[]|+... //printf("FMindex takes %d B\n", // 6*W/8+ALPHABETSIZE*3*W/8+ALPHABETSIZE*W/8+ (2*n/(samplerate*8))*W+sampled->SpaceRequirementInBits()/8+alphabetrank->SpaceRequirementInBits()/8+W/8); } trace_t CSA::lookup(trace_t i) // Time complexity: O(samplerate log \sigma) { trace_t dist=0; while (!sampled->IsBitSet(i)) { int c = alphabetrank->charAtPos(i); i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping ++dist; } return suffixes[sampled->rank(i)-1]+dist; } trace_t CSA::Psi(trace_t i) // Time complexity: O(samplerate log \sigma) { // Return 0 if SA[i] = n if (lookup(i) == n - 1) return 0; // Search sampled position so that SA[j] less than or equal to SA[i] int c; unsigned j = i; while (!sampled->IsBitSet(j)) { c = alphabetrank->charAtPos(j); j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping } // Move to j = inverse SA[ SA[j] + samplerate ] // or to the end of the BWT if (suffixes[sampled->rank(j)-1] / samplerate + 1 >= n / samplerate) j = bwtEndPos; else j = positions[suffixes[sampled->rank(j)-1] / samplerate + 1]; // Search position SA[prev] = SA[i] + 1 unsigned prev; do { prev = j; c = alphabetrank->charAtPos(j); j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping } while (j != i); // Return the previous j value prev return prev; } alpha_t * CSA::substring(trace_t i, trace_t l) { alpha_t *result = new alpha_t[l + 1]; if (l == 0) { result[0] = 0u; return result; } trace_t dist; trace_t k = i + l - 1; // Check for end of the string if (k > n - 1) { l -= k - n + 1; k = n - 1; } trace_t skip = samplerate - k % samplerate - 1; trace_t j; if (k / samplerate + 1 >= n / samplerate) { j = bwtEndPos; skip = n - k - 1; } else { j = positions[k/samplerate+1]; //cout << samplerate << ' ' << j << '\n'; } for (dist=0; distcharAtPos(j); j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping if (dist >= skip) result[l + skip - dist - 1] = c; } result[l] = 0u; return result; } trace_t CSA::inverse(trace_t i) { trace_t skip = samplerate - i % samplerate; trace_t j; if (i / samplerate + 1 >= n / samplerate) { j = bwtEndPos; skip = n - i; } else { j = positions[i/samplerate+1]; //cout << samplerate << ' ' << j << '\n'; } while (skip > 0) { int c = alphabetrank->charAtPos(j); j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping skip --; } return j; } trace_t CSA::Search(alpha_t *pattern, trace_t m, trace_t *spResult, trace_t *epResult) { // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) int c = (int)pattern[m-1]; int i = m-1; int sp = C[c]; int ep = C[c+1]-1; while (sp <= ep && i >= 1) { c = (int)pattern[--i]; sp = C[c]+alphabetrank->rank(c,sp-1); ep = C[c]+alphabetrank->rank(c,ep)-1; } *spResult = sp; *epResult = ep; if (sp <= ep) return ep - sp + 1; else return 0; } CSA::~CSA() { delete alphabetrank; delete sampled; delete [] suffixes; delete [] positions; delete [] codetable; } void CSA::maketables() { trace_t sampleLength = (n % samplerate == 0) ? n/samplerate : n/samplerate+1; trace_t *sampledpositions = new trace_t[n/W+1]; suffixes = new trace_t[sampleLength]; positions = new trace_t[sampleLength]; trace_t i, j=0; for (i=0; iGetPos(i) x = (i == n-1) ? 0 : i+1; if (x % samplerate == 0) { Tools::SetField(sampledpositions,1,p,1); positions[x/samplerate] = p; //printf("positions[%lu] = %lu\n", x/samplerate, p); } //p= wt->LFmapping(p+1)-1; alpha_t c = alphabetrank->charAtPos(p); p = C[c]+alphabetrank->rank(c, p)-1; } //printf("Sampled positions:\n0123456789012345678901234567890123456789\n"); //Tools::PrintBitSequence(sampledpositions,n); sampled = new BitRank(sampledpositions,n,true); /* printf("Is bit set test:\n"); for (i = 0; i< n; i++) if ((*sampled)->IsBitSet(i)) printf("1"); else printf("0"); printf("\n"); */ // suffixes: for (i=0; irank(positions[i]); if (j == 0) j = sampleLength; suffixes[j-1] = (i*samplerate == n) ? 0 : i*samplerate; //printf("suffixes[%lu] = %lu\n", j-1, (i*samplerate==n)?0:i*samplerate); } } alpha_t * CSA::LoadFromFile(const char *filename) { alpha_t *s; std::ifstream file (filename, ios::in|ios::binary); if (file.is_open()) { std::cerr << "Loading CSA from file: " << filename << std::endl; file.read((char *)&bwtEndPos, sizeof(trace_t)); s = new alpha_t[n]; for (trace_t offset=0; offsetgetBWT(); for (trace_t i=0; i, std::greater> q; // First I push all the leaf nodes into the queue for (salpha_t i=0; i 1) { node *child0 = new node(q.top()); q.pop(); node *child1 = new node(q.top()); q.pop(); q.push(node(child0, child1)); } // Now I compute and return the codetable q.top().maketable(0u, 0u, result); q.pop(); /* cout << "\n makecodetable:"; for (trace_t i=0; imaketable(SetBit(code, bits, 0), bits + 1, codetable); child1->maketable(SetBit(code, bits, 1), bits + 1, codetable); delete child0; delete child1; } else { codetable[value].code = code; codetable[value].bits = bits; //cout << "codetable[" << hex << (trace_t)value << "].bits = " << bits << endl; //cout << "codetable[" << hex << (trace_t)value << "].code = " << code << endl; } } void CSA::node::count_chars(alpha_t *text, trace_t n, TCodeEntry *counts ) { for (salpha_t i=0; i