WordBitCompress.cc

Go to the documentation of this file.
00001 //
00002 // WordBitCompress.cc
00003 //
00004 // Part of the ht://Dig package   <http://www.htdig.org/>
00005 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group
00006 // For copyright details, see the file COPYING in your distribution
00007 // or the GNU General Public License version 2 or later
00008 // <http://www.gnu.org/copyleft/gpl.html>
00009 //
00010 // $Id: WordBitCompress_8cc-source.html,v 1.1 2008/06/08 10:12:57 sebdiaz Exp $
00011 //
00012 
00013 #ifdef HAVE_CONFIG_H
00014 #include "config.h"
00015 #endif /* HAVE_CONFIG_H */
00016 
00017 #include <stdlib.h>
00018 #include <string.h>
00019 
00020 #include"WordBitCompress.h"
00021 #include"HtMaxMin.h"
00022 
00023 inline static int bitcount(unsigned int maxval)
00024 {
00025     unsigned int mv = maxval;
00026     int nbits;
00027     for (nbits = 0; mv; nbits++)
00028         mv >>= 1;
00029     return nbits;
00030 }
00031 
00032 // log in base 2 of v
00033 // log2(0) -> -1
00034 // log2(1) ->  0
00035 // log2(2) ->  1
00036 // log2(4) ->  2
00037 // ...
00038 // log2(8) ->  3
00039 // log2(7) ->  2
00040 int
00041 log2(unsigned int v)
00042 {
00043     int res;
00044     for(res=-1;v;res++){v>>=1;}
00045     return(res);
00046 }
00047 
00048 // compute 2^x
00049 #define pow2(x) (1<<(x))
00050 
00051 // **************************************************
00052 // *************** VlengthCoder   *******************
00053 // **************************************************
00054 //
00055 // Compress values into a bitstream based on their probability distribution
00056 // The probability distribution is reduced to a number of intervals.
00057 // Each  interval (generally)  has the same probability of occuring
00058 // values are then coded by:  interval_number position_inside_interval
00059 // this can be seen as modified version of shanon-fanno encoding
00060 //
00061 // Here are some aproximate calculation for estimating final coded size: 
00062 //
00063 // n number of entries to code
00064 // nbits maximum size in bits of entries to code
00065 //
00066 // SUM_interval_bit_sizes -> depends on probability dist
00067 // total_size = table_size + coded_size
00068 // table_size = 2^interval_bits * NBITS_NBITS_VAL
00069 // coded_size = n * (interval_bits + SUM_interval_bit_sizes / 2^interval_bits )
00070 //
00071 // example1: flat probability distribution :
00072 // SUM_interval_bit_sizes = 2^interval_bits * log2( 2^nbits / 2^interval_bits) = 2^interval_bits * ( nbits - interval_bits )
00073 // => coded_size = n * ( interval_bits + nbits - interval_bits ) = n*nbits !!
00074 // => coded_size is the same as if we used no compression
00075 //    this is normal, because it is not possible to compress random data
00076 //
00077 // example2: probability all focused in first interval except for one entry
00078 // SUM_interval_bit_sizes  = 1 + nbits
00079 // the computations above are not valid because of integer roundofs
00080 // => coded_size would actually be = n *  1 + nbits 
00081 // (but the code needs a few cleanups to obtain this value) 
00082 //
00083 
00084 //
00085 // Representation of an unsigned int interval [low low+size]
00086 //
00087 class WordInterval {
00088 public:
00089   inline void SizeFromBits() {
00090     size = ((nbits > 0 ? pow2(nbits - 1) : 0));
00091   }
00092 
00093   //
00094   // Number of bits to code values in the range
00095   // [0 size]
00096   //
00097   int nbits;
00098   //
00099   // Size of the interval
00100   //
00101   unsigned int size;
00102   //
00103   // Lowest bound of the interval
00104   //
00105   unsigned int low;
00106 };
00107 
00108 class VlengthCoder {
00109 public:
00110   //
00111   // Constructor. 
00112   //
00113   VlengthCoder(WordBitStream & nbs);
00114 
00115   ~VlengthCoder() {
00116     if(intervals) delete [] intervals;
00117   }
00118 
00119   void PutUints(unsigned int *vals, int n);
00120   void PutUintsPrepare(unsigned int *vals, int n);
00121   void GetUints(unsigned int *vals, int n);
00122 
00123   // compress and insert a value into the bitstream
00124   inline void PutUint(unsigned int val) {
00125     unsigned int low = 0;
00126     int interval = 0;
00127     FindInterval(val, interval, low);
00128 
00129     bs.PutUint(interval, interval_bits);        // store interval
00130 
00131     const int bitsremaining = (intervals[interval].nbits > 0 ? intervals[interval].nbits - 1 : 0);
00132     val -= low;
00133     bs.PutUint(val, bitsremaining);
00134   }
00135 
00136   // get and uncompress  a value from  the bitstream
00137   inline unsigned int GetUint() {
00138     int interval = bs.GetUint(interval_bits);   // get interval
00139     const int bitsremaining =
00140       (intervals[interval].nbits > 0 ? intervals[interval].nbits - 1 : 0);
00141     unsigned int val = bs.GetUint(bitsremaining);
00142     val += intervals[interval].low;
00143     return (val);
00144   }
00145 
00146   // find interval where value v resides
00147   // (very unusual implementation of binary search)
00148   inline void FindInterval(const unsigned int v, int &interval, unsigned int &low) {
00149     int i0 = 0;
00150     int i1 = nintervals;
00151     int i;
00152     for (;;) {
00153       if (i1 == i0 + 1) {
00154         break;
00155       }
00156       i = (i0 + i1) >> 1;
00157       low = intervals[i].low;
00158       if (v < low) {
00159         i1 = i;
00160         continue;
00161       } else {
00162         i0 = i;
00163         continue;
00164       }
00165 
00166     }
00167 
00168     low = intervals[i0].low;
00169     interval = i0;
00170   }
00171 
00172   void GenerateLowBoundaries(WordInterval *intervals, int nintervals);
00173 
00174 private:
00175   int interval_bits;            // number of bits needed to code an index
00176                                 // of the intervals data member (maximum 
00177                                 // value of index is nintervals - 1).
00178 
00179   WordInterval *intervals;
00180   int nintervals;               // number of intervals
00181 
00182   WordBitStream & bs;
00183 };
00184 
00185 VlengthCoder::VlengthCoder(WordBitStream & nbs):bs(nbs)
00186 {
00187     interval_bits = 0;
00188     nintervals = 0;
00189     intervals = NULL;
00190 }
00191 
00192 // quick sort compare function (for unsigned int's)
00193 static int qsort_uint_cmp(const void *a, const void *b)
00194 {
00195     if ((*((unsigned int *) a)) > (*((unsigned int *) b)))
00196         return 1;
00197     else if ((*((unsigned int *) a)) < (*((unsigned int *) b)))
00198         return -1;
00199     else
00200         return 0;
00201 //      return 
00202 //      (*((unsigned int *)a)) -
00203 //      (*((unsigned int *)b))   ;
00204 }
00205 
00206 // quick sort an array of unsigned int's
00207 static void qsort_uint(unsigned int *v, int n)
00208 {
00209     qsort((void *) v, (unsigned int) n, sizeof(unsigned int), &qsort_uint_cmp);
00210 }
00211 
00212 void VlengthCoder::PutUints(unsigned int *vals, int n)
00213 {
00214   PutUintsPrepare(vals, n);
00215 
00216   int i;
00217   bs.PutUint(interval_bits, WORD_CMPR_LOG32_BITS);
00218   for (i = 0; i < nintervals; i++) {
00219     bs.PutUint(intervals[i].nbits, WORD_CMPR_LOG32_BITS);
00220   }
00221 
00222   for (i = 0; i < n; i++)
00223     PutUint(vals[i]);
00224 }
00225 
00226 void VlengthCoder::PutUintsPrepare(unsigned int *vals, int n)
00227 {
00228     unsigned int *sorted = new unsigned int[n];
00229     memcpy((void *) sorted, (void *) vals, n * sizeof(unsigned int));
00230     qsort_uint(sorted, n);
00231 
00232     int nbits = bitcount(sorted[n - 1]);
00233 
00234     // **** heuristics to determine best interval_bits
00235     // The interval table should not be larger than 1/10 of the 
00236     // data. force table size to be less than 1/10 of the maximum coded size
00237     interval_bits = bitcount((n * nbits) / (10 * WORD_CMPR_LOG32_BITS));
00238     // sanity
00239     if (interval_bits >= nbits) {
00240         interval_bits = nbits - 1;
00241     }
00242     // interval_bits at least 1
00243     if (interval_bits < 1) {
00244         interval_bits = 1;
00245     }
00246 
00247     nintervals = (1 << interval_bits);
00248     int i;
00249 
00250     // + 1 because .low will be set for the past-to-last element
00251     // algorithmic glitch ? 
00252     intervals = new WordInterval[nintervals + 1];
00253 
00254     // find split boundaires
00255     unsigned int lboundary = 0;
00256     unsigned int boundary;
00257     for (i = 0; i < nintervals - 1; i++) {
00258         boundary = sorted[(n * (i + 1)) / nintervals];
00259         intervals[i].nbits = 1 + log2(boundary - lboundary);
00260         intervals[i].SizeFromBits();
00261         lboundary += intervals[i].size;
00262     }
00263     boundary = sorted[n - 1];
00264     intervals[i].nbits = 1 + log2(boundary - lboundary) + 1;
00265     intervals[i].SizeFromBits();
00266 
00267     GenerateLowBoundaries(intervals, nintervals);
00268 
00269     delete [] sorted;
00270 }
00271 
00272 void VlengthCoder::GetUints(unsigned int *vals, int n)
00273 {
00274     int i;
00275     interval_bits = bs.GetUint(WORD_CMPR_LOG32_BITS);
00276     nintervals = pow2(interval_bits);
00277 
00278     // + 1 because .low will be set for the past-to-last element
00279     // algorithmic glitch ? 
00280     intervals = new WordInterval[nintervals + 1];
00281 
00282     for (i = 0; i < nintervals; i++) {
00283         intervals[i].nbits = bs.GetUint(WORD_CMPR_LOG32_BITS);
00284         intervals[i].SizeFromBits();
00285     }
00286     GenerateLowBoundaries(intervals, nintervals);
00287 
00288     for (i = 0; i < n; i++)
00289         vals[i] = GetUint();
00290 }
00291 
00292 void VlengthCoder::GenerateLowBoundaries(WordInterval *intervals, int nintervals)
00293 {
00294     unsigned int low = 0;
00295     for (int j = 0; j <= nintervals; j++) {
00296         intervals[j].low = low;
00297         if (j < nintervals) {
00298             low += intervals[j].size;
00299         }
00300     }
00301 }
00302 
00303 // **************************************************
00304 // *************** WordBitStream  ***********************
00305 // **************************************************
00306 
00307 void 
00308 WordBitStream::PutUint(unsigned int v, int n)
00309 {
00310   if(freezeon) {
00311     freeze_bitcount += n;
00312     return;
00313   }
00314     
00315   if(n <= 0) return;
00316 
00317   int relative_bitpos = bitpos & 0x07;
00318   // simplest case it all fits in the current byte
00319   if(relative_bitpos + n < 8) {
00320     // would be safer to mask v to make
00321     // sure no spurious bits are inserted if there is garbadge in v
00322     buff[buff_idx] |= (v << relative_bitpos);
00323     BitposIncr(n);
00324     return;
00325   } else {
00326     const int central = ((relative_bitpos + n) >> 3) - 1;
00327 
00328     // put first
00329     const int bits_in_first_byte = 8 - relative_bitpos;
00330     buff[buff_idx] |= ((v & 0xff) << relative_bitpos) & 0xff;
00331     BitposIncr(bits_in_first_byte);
00332     v >>= bits_in_first_byte;
00333 
00334     // put central
00335     for(int i = central; i ;i--) {
00336       buff[buff_idx] = v & 0xff ;
00337       BitposIncr(8);
00338       v >>= 8;      
00339     }
00340 
00341     // put last
00342     const int bits_in_last_byte = n - ((central << 3) + bits_in_first_byte);
00343 
00344     if(bits_in_last_byte > 0) {
00345       buff[buff_idx] = v & ((1 << bits_in_last_byte) - 1);
00346       BitposIncr(bits_in_last_byte);
00347     }
00348   }
00349 }
00350 
00351 unsigned int 
00352 WordBitStream::GetUint(int n)
00353 {
00354   if(n <= 0) return 0;
00355 
00356   unsigned int res = 0;
00357 
00358   int relative_bitpos = bitpos & 0x07;
00359 
00360   if(relative_bitpos + n < 8) {
00361     // simplest case it all fits in the current byte
00362     res = (buff[bitpos >> 3] >> relative_bitpos) & ((1 << n) - 1);
00363     bitpos += n;
00364     return res;
00365   } else {
00366     int bytepos = bitpos >> 3;
00367     const int central = ((relative_bitpos + n) >> 3) - 1;
00368 
00369     // put first
00370     res = (buff[bytepos] >> relative_bitpos) & 0xff;
00371     const int bits_in_first_byte = 8 - relative_bitpos;
00372     bytepos++;
00373 
00374     // put central
00375     if(central) {
00376       unsigned int tmp_res = 0;
00377       for(int i = central - 1; i >= 0; i--) {
00378         tmp_res |= buff[bytepos + i] & 0xff;
00379         if(i) tmp_res <<= 8;
00380       }
00381       bytepos += central;
00382       res |= tmp_res << bits_in_first_byte;
00383     }
00384     
00385     // put last
00386     const int bits_in_last_byte = n - ((central << 3) + bits_in_first_byte);
00387     if(bits_in_last_byte > 0) {
00388       res |= ((unsigned int)(buff[bytepos] & ((1 << bits_in_last_byte) - 1))) << (bits_in_first_byte + ((bytepos - (bitpos>>3) - 1) << 3));
00389     }
00390 
00391     bitpos+=n;
00392     return res;
00393   }
00394 }
00395 
00396 void 
00397 WordBitStream::PutZone(unsigned char *vals, int n)
00398 {
00399   int limit = (n + 7) / 8;
00400   for(int i = 0; i < limit; i++) {
00401     const int remains = n - (8 * i);
00402     PutUint(vals[i], remains > 8 ? 8 : remains);
00403   }
00404 }
00405 
00406 void 
00407 WordBitStream::GetZone(unsigned char *vals, int n)
00408 {
00409   int limit = (n + 7) / 8;
00410   for(int i = 0; i < limit; i++) {
00411     const int remains = n - (8 * i);
00412     vals[i] = GetUint(remains > 8 ? 8 : remains);
00413   }
00414 }
00415 
00416 // **************************************************
00417 // *************** WordBitCompress ***********************
00418 // **************************************************
00419 
00420 //
00421 // Count the minimum number of bits to consider to code
00422 // the value in binary.
00423 // 0 -> 0
00424 // 1 -> 1
00425 // 2 -> 2
00426 // 3 -> 2
00427 // 4 -> 3
00428 // 5 -> 3
00429 // 6 -> 3
00430 // ...
00431 // 
00432 
00433 void 
00434 WordBitCompress::PutUint(unsigned int v, int n)
00435 {
00436     int nbits = bitcount(v);
00437     WordBitStream::PutUint(nbits, bitcount(n));
00438     if(nbits)
00439       WordBitStream::PutUint(v, nbits);
00440 }
00441 
00442 unsigned int 
00443 WordBitCompress::GetUint(int n)
00444 {
00445     int nbits = WordBitStream::GetUint(bitcount(n));
00446     if(!nbits)
00447       return 0;
00448     else
00449       return WordBitStream::GetUint(nbits);
00450 }
00451 
00452 int 
00453 WordBitCompress::PutUints(unsigned int *vals, int n)
00454 {
00455   int cpos = Length();
00456     
00457   if(n >= (1 << WORD_CMPR_NBITS_NVALS)) {
00458     fprintf(stderr, "WordBitCompress::PutUints: : overflow: n >= %d\n", (1 << WORD_CMPR_NBITS_NVALS));
00459     abort();
00460   }
00461 
00462   PutUint(n, WORD_CMPR_NBITS_NVALS);
00463   if(n == 0)
00464     return Length() - cpos;
00465 
00466   int max_nbits = bitcount(HtMaxMin::max_v(vals, n));
00467 
00468   int sdecr;
00469   int sfixed;
00470 
00471   //
00472   // Only bother to compare the Decr and Fixed compression
00473   // performances if there are more than 15 values involved and that
00474   // the max of these values is encoded on more than 3 bits.
00475   //
00476   if(n > 15 && max_nbits > 3) {
00477     Freeze();
00478     PutUintsDecr(vals, n);
00479     sdecr = Length();
00480     UnFreeze();
00481 
00482     Freeze();
00483     PutUintsFixed(vals, n);
00484     sfixed = Length();
00485     UnFreeze();
00486   } else {
00487     //
00488     // Set to arbitrary values so that sdecr > sfixed, hence
00489     // Fixed scheme will be chosen;
00490     //
00491     sdecr = 4242;
00492     sfixed = 0;
00493   }
00494 
00495   //
00496   // Encode the array using the best model (the one that
00497   // takes less space).
00498   //
00499   if(sdecr < sfixed) {
00500     WordBitStream::PutUint(WORD_CMPR_MODEL_DECR, WORD_CMPR_MODEL_BITS);
00501     PutUintsDecr(vals, n);
00502   } else {
00503     WordBitStream::PutUint(WORD_CMPR_MODEL_FIXED, WORD_CMPR_MODEL_BITS);
00504     PutUintsFixed(vals,n);
00505   }
00506 
00507   return Length() - cpos;
00508 }
00509 
00510 int 
00511 WordBitCompress::GetUints(unsigned int **valsp)
00512 {
00513     int n = GetUint(WORD_CMPR_NBITS_NVALS);
00514 
00515     if(!n) {
00516       *valsp = NULL;
00517       return 0;
00518     }
00519 
00520     unsigned int *vals = new unsigned int[n];
00521 
00522     int model = WordBitStream::GetUint(WORD_CMPR_MODEL_BITS);
00523 
00524     switch(model) {
00525     case WORD_CMPR_MODEL_DECR:
00526       GetUintsDecr(vals, n);
00527       break;
00528     case WORD_CMPR_MODEL_FIXED:
00529       GetUintsFixed(vals, n);
00530       break;
00531     default:
00532       fprintf(stderr, "WordBitCompress::GetUints invalid compression model %d\n", model);
00533       abort();
00534       break;
00535     }
00536 
00537     *valsp = vals;
00538     
00539     return n;
00540 }
00541 
00542 int 
00543 WordBitCompress::GetUints(unsigned int **valsp, int* vals_sizep)
00544 {
00545     int n = GetUint(WORD_CMPR_NBITS_NVALS);
00546 
00547     if(!n) {
00548       return 0;
00549     }
00550 
00551     while(n >= *vals_sizep) {
00552       (*vals_sizep) *= 2;
00553       (*valsp) = (unsigned int*)realloc(*valsp, (*vals_sizep) * sizeof(unsigned int));
00554     }
00555 
00556     int model = WordBitStream::GetUint(WORD_CMPR_MODEL_BITS);
00557 
00558     switch(model) {
00559     case WORD_CMPR_MODEL_DECR:
00560       GetUintsDecr(*valsp, n);
00561       break;
00562     case WORD_CMPR_MODEL_FIXED:
00563       GetUintsFixed(*valsp, n);
00564       break;
00565     default:
00566       fprintf(stderr, "WordBitCompress::GetUints invalid compression model %d\n", model);
00567       abort();
00568       break;
00569     }
00570 
00571     return n;
00572 }
00573 
00574 int WordBitCompress::PutUchars(unsigned char *vals, int n)
00575 {
00576     int cpos = Length();
00577 
00578     if(n >= (1 << WORD_CMPR_NBITS_NVALS)) {
00579       fprintf(stderr, "WordBitCompress::PutUchars: : overflow: n >= %d\n", (1 << WORD_CMPR_NBITS_NVALS));
00580       abort();
00581     }
00582 
00583     PutUint(n, WORD_CMPR_NBITS_NVALS);
00584     if (n == 0) {
00585         return 0;
00586     }
00587 
00588     int max_nbits = bitcount(HtMaxMin::max_v(vals, n));
00589 
00590     if(max_nbits >= (1 << WORD_CMPR_LOG8_BITS)) {
00591       fprintf(stderr, "WordBitCompress::PutUchars: : overflow: max_nbits >= %d\n", (1 << WORD_CMPR_LOG8_BITS));
00592       abort();
00593     }
00594 
00595     WordBitStream::PutUint(max_nbits, WORD_CMPR_LOG8_BITS);
00596     
00597     for(int i = 0; i < n; i++) {
00598         WordBitStream::PutUint(vals[i] & 0xff, max_nbits);
00599 #if 0
00600         unsigned char v = vals[i];
00601         for (int j = 0; j < max_nbits; j++) {
00602           Put(v & (1 << j));
00603         }
00604 #endif
00605     }
00606     return Length() - cpos;
00607 }
00608 
00609 int 
00610 WordBitCompress::GetUchars(unsigned char **valsp)
00611 {
00612     int n = GetUint(WORD_CMPR_NBITS_NVALS);
00613     if (!n) {
00614         *valsp = NULL;
00615         return 0;
00616     }
00617     
00618     int nbits = WordBitStream::GetUint(WORD_CMPR_LOG8_BITS);
00619     int i;
00620     unsigned char *vals = new unsigned char[n];
00621 
00622     for (i = 0; i < n; i++)
00623         vals[i] = WordBitStream::GetUint(nbits);
00624 
00625     *valsp = vals;
00626     return n;
00627 }
00628 
00629 int 
00630 WordBitCompress::GetUchars(unsigned char **valsp, int *vals_sizep)
00631 {
00632     int n = GetUint(WORD_CMPR_NBITS_NVALS);
00633     if (!n) {
00634         return 0;
00635     }
00636     
00637     while(n >= *vals_sizep) {
00638       (*vals_sizep) *= 2;
00639       (*valsp) = (unsigned char*)realloc(*valsp, (*vals_sizep) * sizeof(unsigned char));
00640     }
00641 
00642     int nbits = WordBitStream::GetUint(WORD_CMPR_LOG8_BITS);
00643     int i;
00644 
00645     for (i = 0; i < n; i++)
00646         (*valsp)[i] = WordBitStream::GetUint(nbits);
00647 
00648     return n;
00649 }
00650 
00651 void WordBitCompress::PutUintsFixed(unsigned int *vals, int n)
00652 {
00653     int max_nbits = bitcount(HtMaxMin::max_v(vals, n));
00654 
00655     if(max_nbits > (1 << WORD_CMPR_LOG32_BITS)) {
00656       fprintf(stderr, "WordBitCompress::PutUintsFixed: : overflow: %d > %d\n", max_nbits, (1 << WORD_CMPR_LOG32_BITS));
00657       abort();
00658     }
00659 
00660     PutUint(max_nbits, WORD_CMPR_LOG32_BITS);
00661 
00662     for (int i = 0; i < n; i++)
00663         WordBitStream::PutUint(vals[i], max_nbits);
00664 }
00665 
00666 void WordBitCompress::GetUintsFixed(unsigned int *vals, int n)
00667 {
00668     int nbits = GetUint(WORD_CMPR_LOG32_BITS);
00669 
00670     int i;
00671     for (i = 0; i < n; i++)
00672         vals[i] = WordBitStream::GetUint(nbits);
00673 }
00674 
00675 void WordBitCompress::PutUintsDecr(unsigned int *vals, int n)
00676 {
00677     VlengthCoder coder(*this);
00678     coder.PutUints(vals, n);
00679 }
00680 
00681 void WordBitCompress::GetUintsDecr(unsigned int *vals, int n)
00682 {
00683     VlengthCoder coder(*this);
00684     coder.GetUints(vals, n);
00685 }

Generated on Sun Jun 8 10:56:39 2008 for GNUmifluz by  doxygen 1.5.5