WordDict.h

Go to the documentation of this file.
00001 //
00002 // WordDict.h
00003 //
00004 // NAME
00005 // 
00006 // manage and use an inverted index dictionary.
00007 //
00008 // SYNOPSIS
00009 // 
00010 // #include <mifluz.h>
00011 // 
00012 // WordList* words = ...;
00013 // WordDict* dict = words->Dict();
00014 // 
00015 // DESCRIPTION
00016 // 
00017 // WordDict maps strings to unique identifiers and frequency in the 
00018 // inverted index. Whenever a new word is found, the WordDict class 
00019 // can be asked to assign it a serial number. When doing so, an entry
00020 // is created in the dictionary with a frequency of zero. The application
00021 // may then increment or decrement the frequency to reflect the inverted
00022 // index content.
00023 //
00024 // The serial numbers range from 1 to 2^32 - 1 inclusive.
00025 //
00026 // A WordDict object is automatically created by the WordList object and
00027 // should not be created directly by the application.
00028 //
00029 // END
00030 //
00031 // Part of the ht://Dig package   <http://www.htdig.org/>
00032 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group
00033 // For copyright details, see the file COPYING in your distribution
00034 // or the GNU General Public License version 2 or later
00035 // <http://www.gnu.org/copyleft/gpl.html>
00036 //
00037 // $Id: WordDict_8h-source.html,v 1.1 2008/06/08 10:13:09 sebdiaz Exp $
00038 //
00039 
00040 #ifndef _WordDict_h_
00041 #define _WordDict_h_
00042 
00043 #include <stdio.h>
00044 
00045 #ifndef SWIG
00046 #include "htString.h"
00047 #include "WordDB.h"
00048 
00049 class WordList;
00050 class WordDictCursor;
00051 
// Serial value that stands for "no serial": it is the id given to a
// freshly constructed WordDictRecord and the value SerialExists() is
// documented to report for a word that is not in the dictionary.
00052 #define WORD_DICT_SERIAL_INVALID        0
00053 
00054 class WordDictRecord {
00055  public:
00056   inline WordDictRecord() { count = 0; id = WORD_DICT_SERIAL_INVALID; }
00057 
00058   inline int Unpack(const String& coded) {
00059     int offset = 0;
00060     coded.ber_shift(offset, count);
00061     coded.ber_shift(offset, id);
00062     return OK;
00063   }
00064 
00065   inline int Pack(String& coded) const {
00066     int offset = 0;
00067     coded.ber_push(offset, count);
00068     coded.ber_push(offset, id);
00069     return OK;
00070   }
00071 
00072   inline int Get(WordDB* db, const String& word) {
00073     String tmp_word = word;
00074     String coded(BER_MAX_BYTES * 2);
00075     int ret;
00076     if((ret = db->Get(0, tmp_word, coded, 0)) != 0) return ret;
00077 
00078     Unpack(coded);
00079 
00080     return ret;
00081   }
00082   
00083   inline int Put(WordDB* db, const String& word) {
00084     String coded(BER_MAX_BYTES * 2);
00085     Pack(coded);
00086     return db->Put(0, word, coded, 0);
00087   }
00088 
00089   inline int Del(WordDB* db, const String& word) {
00090     return db->Del(0, word);
00091   }
00092 
00093   inline unsigned int Count() { return count; }
00094   inline unsigned int Id() { return id; }
00095 
00096   unsigned int count;
00097   unsigned int id;
00098 };
00099 #endif /* SWIG */
00100 
00101 class WordDict 
00102 {
00103  public:
00104 #ifndef SWIG
00105   //-
00106   // Private constructor. 
00107   //
00108   WordDict() { words = 0; db = 0; }
00109   ~WordDict();
00110 
00111   //-
00112   // Bind the object a WordList inverted index. Return OK on success,
00113   // NOTOK otherwise.
00114   //
00115   int Initialize(WordList* words);
00116 
00117   //-
00118   // Open the underlying Berkeley DB sub-database. The enclosing 
00119   // file is given by the <i>words</i> data member. Return OK on success,
00120   // NOTOK otherwise.
00121   //
00122   int Open();
00123   //-
00124   // Destroy the underlying Berkeley DB sub-database. Return OK on success,
00125   // NOTOK otherwise.
00126   //
00127   int Remove();
00128   //-
00129   // Close the underlying Berkeley DB sub-database. Return OK on success,
00130   // NOTOK otherwise.
00131   //
00132   int Close();
00133     
00134   //-
00135   // If the <b>word</b> argument exists in the dictionnary, return its
00136   // serial number in the <b>serial</b> argument. If it does not already
00137   // exists, assign it a serial number, create an entry with a frequency
00138   // of zero and return the new serial in the <b>serial</b> argument.
00139   // Return OK on success, NOTOK otherwise.
00140   //
00141   int Serial(const String& word, unsigned int& serial);
00142   //-
00143   // If the <b>word</b> argument exists in the dictionnary, return its
00144   // serial number in the <b>serial</b> argument. If it does not exists
00145   // set the <b>serial</b> argument to WORD_DICT_SERIAL_INVALID.
00146   // Return OK on success, NOTOK otherwise.
00147   //
00148   int SerialExists(const String& word, unsigned int& serial);
00149   //-
00150   // Short hand for Serial() followed by Ref().
00151   // Return OK on success, NOTOK otherwise.
00152   //
00153   int SerialRef(const String& word, unsigned int& serial);
00154   //-
00155   // Return the frequency of the <b>word</b> argument
00156   // in the <b>noccurrence</b> argument. 
00157   // Return OK on success, NOTOK otherwise.
00158   //
00159   int Noccurrence(const String& word, unsigned int& noccurrence) const;
00160 #endif /* SWIG */
00161 
00162   //-
00163   // Short hand for words->GetContext()->GetType()->Normalize(word).
00164   // Return OK on success, NOTOK otherwise.
00165   // 
00166   int Normalize(String& word) const;
00167 
00168   //-
00169   // Short hand for Incr(word, 1)
00170   //
00171   int Ref(const String& word) { return Incr(word, 1); }
00172   //-
00173   // Add <b>incr</b> to the frequency of the <b>word</b>. 
00174   // Return OK on success, NOTOK otherwise.
00175   //
00176   int Incr(const String& word, unsigned int incr);
00177   //-
00178   // Short hand for Decr(word, 1)
00179   //
00180   int Unref(const String& word) { return Decr(word, 1); }
00181   //-
00182   // Subtract <b>decr</b> to the frequency of the <b>word</b>. If
00183   // the frequency becomes lower or equal to zero, remove the entry
00184   // from the dictionnary and lose the association between the word and its
00185   // serial number.
00186   // Return OK on success, NOTOK otherwise.
00187   //
00188   int Decr(const String& word, unsigned int decr);
00189   //-
00190   // Set the frequency of <b>word</b> with the value of the <b>noccurrence</b>
00191   // argument.
00192   //
00193   int Put(const String& word, unsigned int noccurrence);
00194 
00195   //-
00196   // Return true if <b>word</b> exists in the dictionnary, false otherwise.
00197   //
00198   int Exists(const String& word) const;
00199 
00200 #ifndef SWIG
00201   //-
00202   // Return a pointer to the associated WordList object.
00203   //
00204   List* Words() const;
00205 
00206   //-
00207   // Return a cursor to sequentially walk the dictionnary using the 
00208   // <b>Next</b> method. 
00209   //
00210   WordDictCursor* Cursor() const;
00211   //-
00212   // Return the next entry in the dictionnary. The <b>cursor</b> argument
00213   // must have been created using the <i>Cursor</i> method. The word is
00214   // returned in the <b>word</b> argument and the record is returned in
00215   // the <b>record</b> argument. 
00216   // On success the function returns 0, at the end of the dictionnary it
00217   // returns DB_NOTFOUND. The <b>cursor</b> argument is deallocated when
00218   // the function hits the end of the dictionnary or an error occurs.
00219   // 
00220   int Next(WordDictCursor* cursor, String& word, WordDictRecord& record);
00221 
00222   //-
00223   // Return a cursor to sequentially walk the entries of the dictionnary
00224   // that start with the <b>prefix</b> argument, using the 
00225   // <b>NextPrefix</b> method. 
00226   //
00227   WordDictCursor* CursorPrefix(const String& prefix) const;
00228   //-
00229   // Return the next prefix from the dictionnary. The <b>cursor</b> argument
00230   // must have been created using the <i>CursorPrefix</i> method. The word is
00231   // returned in the <b>word</b> argument and the record is returned in
00232   // the <b>record</b> argument. The <b>word</b> is guaranteed to start with
00233   // the prefix specified to the <b>CursorPrefix</b> method.
00234   // On success the function returns 0, at the end of the dictionnary it
00235   // returns DB_NOTFOUND. The <b>cursor</b> argument is deallocated when
00236   // the function hits the end of the dictionnary or an error occurs.
00237   // 
00238   int NextPrefix(WordDictCursor* cursor, String& word, WordDictRecord& record);
00239 
00240   //-
00241   // Dump the complete dictionary in the file descriptor <b>f.</b> The
00242   // format of the dictionary is <i>word serial frequency</i>, one by
00243   // line. 
00244   //
00245   int Write(FILE* f);
00246 
00247  private:
00248   WordList*                     words;
00249   WordDB*                       db;
00250 #endif /* SWIG */
00251 };
00252 #endif /* _WordDict_h_ */

Generated on Sun Jun 8 10:56:40 2008 for GNUmifluz by  doxygen 1.5.5