WordList.h

Go to the documentation of this file.
00001 //
00002 // WordList.h
00003 //
00004 // NAME
00005 // 
00006 // abstract class to manage and use an inverted index file.
00007 //
00008 // SYNOPSIS
00009 // 
00010 // #include <mifluz.h>
00011 // 
00012 // WordContext context;
00013 //
00014 // WordList* words = context.List();
00015 // 
00016 // delete words;
00017 // 
00018 // DESCRIPTION
00019 // 
00020 // WordList is the <i>mifluz</i> equivalent of a database handler. Each
00021 // WordList object is bound to an inverted index file and implements the
00022 // operations to create it, fill it with word occurrences and search 
00023 // for an entry matching a given criterion.
00024 // 
00025 // WordList is an abstract class and cannot be instantiated. 
00026 // The <b>List</b> method of the class WordContext will create 
00027 // an instance using the appropriate derived class, either WordListOne
00028 // or WordListMulti. Refer to the corresponding manual pages for
00029 // more information on their specific semantics.
00030 //
00031 // When doing bulk insertions, mifluz creates temporary files that
00032 // contain the entries to be inserted in the index. Those files are
00033 // typically named <i>indexC00000000</i>. The maximum size of the 
00034 // temporary file is <b>wordlist_cache_size</b> / 2. When the maximum
00035 // size of the temporary file is reached, mifluz creates another temporary
00036 // file named <i>indexC00000001</i>. The process continues until mifluz
00037 // has created 50 temporary files. At this point it merges all temporary files
00038 // into one that replaces the first <i>indexC00000000</i>. Then it continues
00039 // to create temporary files again and keeps following this algorithm until
00040 // the bulk insertion is finished. When the bulk insertion is finished,
00041 // mifluz has one big file named <i>indexC00000000</i> that contains
00042 // all the entries to be inserted in the index. mifluz inserts all the
00043 // entries from <i>indexC00000000</i> into the index and deletes the 
00044 // temporary file when done. The insertion will be fast since all the
00045 // entries in <i>indexC00000000</i> are already sorted. 
00046 //
00047 // The parameter <b>wordlist_cache_max</b> can be used to prevent the
00048 // temporary files from growing indefinitely. If the total cumulative size of
00049 // the <i>indexC*</i> files grows beyond this parameter, they are merged
00050 // into the main index and deleted. For instance setting this parameter
00051 // value to 500Mb guarantees that the total size of the <i>indexC*</i> 
00052 // files will not grow above 500Mb.
00053 //
00054 // CONFIGURATION
00055 // 
00056 // wordlist_extend {true|false} (default false)
00057 //   If <b>true</b> maintain reference count of unique 
00058 //   words. The <b>Noccurrence</b> method gives access to this count.
00059 // 
00060 // wordlist_verbose <number> (default 0)
00061 //   Set the verbosity level of the WordList class. 
00062 //   <br>
00063 //   1 walk logic
00064 //   <br>
00065 //   2 walk logic details
00066 //   <br>
00067 //   3 walk logic lots of details
00068 // 
00069 // wordlist_page_size <bytes> (default 8192)
00070 //   Berkeley DB page size (see Berkeley DB documentation)
00071 // 
00072 // wordlist_cache_size <bytes> (default 500K)
00073 //   Berkeley DB cache size (see Berkeley DB documentation)
00074 //   Cache makes a huge difference in performance. It must be at least 2%
00075 //   of the expected total data size. Note that if compression is activated
00076 //   the data size is eight times larger than the actual file size. In this
00077 //   case the cache must be scaled to 2% of the data size, not 2% 
00078 //   of the file size. See <b>Cache tuning</b> in the mifluz guide for
00079 //   more hints.
00080 //   See WordList(3) for the rationale behind cache file handling.
00081 // 
00082 // wordlist_cache_max <bytes> (default 0)
00083 //   Maximum size of the cumulated cache files generated when doing bulk
00084 //   insertion with the <b>BatchStart()</b> function. When this limit is
00085 //   reached, the cache files are all merged into the inverted index. 
00086 //   The value 0 means infinite size allowed.
00087 //   See WordList(3) for the rationale behind cache file handling.
00088 //
00089 // wordlist_cache_inserts {true|false} (default false)
00090 //   If true all <b>Insert</b> calls are cached in memory. When the 
00091 //   WordList object is closed or a different access method is called
00092 //   the cached entries are flushed in the inverted index.
00093 //
00094 // wordlist_compress {true|false} (default false)
00095 //   Activate compression of the index. The resulting index is eight times
00096 //   smaller than the uncompressed index.
00097 // 
00098 //
00099 // END
00100 //
00101 // Part of the ht://Dig package   <http://www.htdig.org/>
00102 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group
00103 // For copyright details, see the file COPYING in your distribution
00104 // or the GNU General Public License version 2 or later
00105 // <http://www.gnu.org/copyleft/gpl.html>
00106 //
00107 // $Id: WordList_8h-source.html,v 1.1 2008/06/08 10:13:15 sebdiaz Exp $
00108 //
00109 
00110 #ifndef _WordList_h_
00111 #define _WordList_h_
00112 
00113 #include <fcntl.h>
00114 #include <stdlib.h>
00115 #include <stdio.h>
00116 
00117 #ifndef SWIG
00118 #include "Dictionary.h"
00119 #include "List.h"
00120 #include "htString.h"
00121 #include "WordRecord.h"
00122 #include "WordReference.h"
00123 #include "WordType.h"
00124 #include "WordDB.h"
00125 #include "WordDBCompress.h"
00126 #include "Configuration.h"
00127 #include "WordCursor.h"
00128 #include "WordDict.h"
00129 #endif /* SWIG */
00130 
00131 class List;
00132 class WordList;
00133 class WordDBCursor;
00134 class WordContext;
00135 class WordDBCaches;
00136 class WordMeta;
00137 class WordDead;
00138 
00139 // 
00140 // Inverted index interface
00141 //
00142 class WordList
00143 {
00144  public:
00145     virtual ~WordList() {}
00146 
00147     //-
00148     // Return a pointer to the WordContext object used to create
00149     // this instance.
00150     //
00151     inline WordContext* GetContext() { return context; }
00152 #ifndef SWIG
00153     //-
00154     // Return a pointer to the WordContext object used to create
00155     // this instance as a const.
00156     //
00157     inline const WordContext* GetContext() const { return context; }
00158 #endif /* SWIG */
00159 
00160     //-
00161     // Insert <b>wordRef</b> in index. If the <i>Key()</i> part of
00162     // the <b>wordRef</b> exists in the index, override it.
00163     // Returns OK on success, NOTOK on error.
00164     //
00165     virtual inline int Override(const WordReference& wordRef) { NotImplemented(); return NOTOK; }
00166 
00167     //-
00168     // Returns OK if <b>wordRef</b> exists in the index, NOTOK otherwise.
00169     //
00170     virtual int Exists(const WordReference& wordRef) { NotImplemented(); return NOTOK; }
00171 #ifndef SWIG
00172     //-
00173     // Returns OK if <b>word</b> exists in the index, NOTOK otherwise.
00174     //
00175     inline int Exists(const String& word) { return Dict()->Exists(word) ? OK : NOTOK; }
00176 #endif /* SWIG */
00177 
00178     //
00179     // Delete permanently
00180     //
00181     //-
00182     // Delete all entries in the index whose key matches the 
00183     // <i>Key()</i> part of <b>wordRef</b>, using the <i>Walk</i>
00184     // method.
00185     // Returns the number of entries successfully deleted.
00186     //
00187     virtual int WalkDelete(const WordReference& wordRef) { NotImplemented(); return NOTOK; }
00188     //-
00189     // Delete the entry in the index that exactly matches the
00190     // <i>Key()</i> part of <b>wordRef.</b>
00191     // Returns OK if deletion is successfull, NOTOK otherwise.
00192     //
00193     virtual int Delete(const WordReference& wordRef) { NotImplemented(); return NOTOK; }
00194 
00195     //-
00196     // Open inverted index <b>filename.</b> <b>mode</b>
00197     // may be <i>O_RDONLY</i> or <i>O_RDWR.</i> If mode is 
00198     // <i>O_RDWR</i> it can be or'ed with <i>O_TRUNC</i> to reset
00199     // the content of an existing inverted index.
00200     // Return OK on success, NOTOK otherwise.
00201     //
00202     virtual int Open(const String& filename, int mode) { NotImplemented(); return NOTOK; }
00203     //-
00204     // Close inverted index.
00205     // Return OK on success, NOTOK otherwise.
00206     // 
00207     virtual int Close() { NotImplemented(); return NOTOK; }
00208     //-
00209     // Return the size of the index in pages.
00210     //
00211     virtual unsigned int Size() const { NotImplemented(); return 0; }
00212     //-
00213     // Return the page size
00214     //
00215     virtual int Pagesize() const { NotImplemented(); return 0; }
00216     //-
00217     // Return a pointer to the inverted index dictionnary.
00218     //
00219     virtual WordDict *Dict() { NotImplemented(); return 0; }
00220     virtual WordMeta *Meta() { NotImplemented(); return 0; }
00221     virtual WordDead *Dead() { NotImplemented(); return 0; }
00222     //-
00223     // Return the filename given to the last call to Open.
00224     //
00225     const String& Filename() const { return filename; }
00226     //-
00227     // Return the mode given to the last call to Open.
00228     //
00229     int Flags() const { return flags; }
00230 
00231     //
00232     // These returns a list of all the WordReference * matching 
00233     // the constraint.
00234     //-
00235     // Returns the list of word occurrences exactly matching the
00236     // <i>Key()</i> part of <b>wordRef.</b> The <i>List</i> returned
00237     // contains pointers to <i>WordReference</i> objects. It is
00238     // the responsibility of the caller to free the list. See List.h
00239     // header for usage.
00240     //
00241     inline List *Find(const WordReference& wordRef) { return (*this)[wordRef]; }
00242     //-
00243     // Returns the list of word occurrences exactly matching the
00244     // <b>word.</b> The <i>List</i> returned
00245     // contains pointers to <i>WordReference</i> objects. It is
00246     // the responsibility of the caller to free the list. See List.h
00247     // header for usage.
00248     //
00249     inline List *FindWord(const String& word) { return (*this)[word]; }
00250 #ifndef SWIG
00251     //-
00252     // Alias to the <b>Find</b> method.
00253     //
00254     virtual List *operator [] (const WordReference& wordRef) { NotImplemented(); return 0; }
00255     //-
00256     // Alias to the <b>FindWord</b> method.
00257     //
00258     inline List *operator [] (const String& word)  {
00259       WordReference wordRef(context, word);
00260       unsigned int wordid;
00261       Dict()->SerialExists(word, wordid);
00262       if(wordid != WORD_DICT_SERIAL_INVALID) {
00263         wordRef.Key().Set(WORD_KEY_WORD, wordid);
00264         return (*this)[wordRef];
00265       } else {
00266         return new List;
00267       }
00268     }
00269 #endif /* SWIG */
00270     //-
00271     // Returns the list of word occurrences matching the <i>Key()</i>
00272     // part of <b>wordRef.</b> In the <i>Key()</i>, the string
00273     // (accessed with <i>GetWord()</i>) matches any string that begins
00274     // with it. The <i>List</i> returned contains pointers to
00275     // <i>WordReference</i> objects. It is the responsibility of the
00276     // caller to free the list.
00277     //
00278     virtual List *Prefix (const WordReference& prefix) { NotImplemented(); return 0; }
00279 #ifndef SWIG
00280     //-
00281     // Returns the list of word occurrences matching the
00282     // <b>word.</b> In the <i>Key()</i>, the string (accessed with
00283     // <i>GetWord()</i>) matches any string that begins with it. The
00284     // <i>List</i> returned contains pointers to <i>WordReference</i>
00285     // objects. It is the responsibility of the caller to free the
00286     // list.
00287     //
00288     inline List *Prefix (const String& prefix) { return this->Prefix(WordReference(context, prefix)); }
00289 #endif /* SWIG */
00290 
00291     //
00292     // Iterate over the complete database.
00293     //
00294 #ifndef SWIG
00295     //- 
00296     // Returns a list of all unique words contained in the inverted
00297     // index. The <i>List</i> returned contains pointers to
00298     // <i>String</i> objects. It is the responsibility of the caller
00299     // to free the list. See List.h header for usage.
00300     //
00301     virtual List *Words() { NotImplemented(); return 0; }
00302 #endif /* SWIG */
00303     //- 
00304     // Returns a list of all entries contained in the
00305     // inverted index. The <i>List</i> returned contains pointers to
00306     // <i>WordReference</i> objects. It is the responsibility of
00307     // the caller to free the list. See List.h header for usage.
00308     //
00309     virtual List *WordRefs() { NotImplemented(); return 0; }
00310 
00311 #ifndef SWIG
00312     //-
00313     // Create a cursor that searches all the occurrences in the
00314     // inverted index and call <b>ncallback</b> with
00315     // <b>ncallback_data</b> for every match.
00316     //
00317     virtual WordCursor *Cursor(wordlist_walk_callback_t callback, Object *callback_data) { NotImplemented(); return 0; }
00318     //- 
00319     // Create a cursor that searches all the occurrences in the
00320     // inverted index and that match <b>nsearchKey.</b> If
00321     // <b>naction</b> is set to HTDIG_WORDLIST_WALKER calls
00322     // <b>searchKey.callback</b> with <b>searchKey.callback_data</b>
00323     // for every match. If <b>naction</b> is set to
00324     // HTDIG_WORDLIST_COLLECT push each match in <b>searchKey.collectRes</b>
00325     // data member as a <b>WordReference</b> object. It is the responsibility
00326     // of the caller to free the <b>searchKey.collectRes</b> list.
00327     //
00328     virtual WordCursor *Cursor(const WordKey &searchKey, int action = HTDIG_WORDLIST_WALKER) { NotImplemented(); return 0; }
00329     //-
00330     // Create a cursor that searches all the occurrences in the
00331     // inverted index and that match <b>nsearchKey</b> and calls
00332     // <b>ncallback</b> with <b>ncallback_data</b> for every match.
00333     //
00334     virtual WordCursor *Cursor(const WordKey &searchKey, wordlist_walk_callback_t callback, Object * callback_data) { NotImplemented(); return 0; }
00335 #endif /* SWIG */
00336 
00337     //-
00338     // Create a WordKey object and return it. The <b>bufferin</b> argument
00339     // is used to initialize the key, as in the WordKey::Set method. 
00340     // The first component of <b>bufferin</b> must be a word that is translated
00341     // to the corresponding numerical id using the WordDict::Serial
00342     // method.
00343     //
00344     virtual WordKey Key(const String& bufferin) { NotImplemented(); return WordKey(0); }
00345     //-
00346     // Create a WordReference object and return it. The
00347     // <b>bufferin</b> argument is used to initialize the structure,
00348     // as in the WordReference::Set method.  The first component of
00349     // <b>bufferin</b> must be a word that is translated to the
00350     // corresponding numerical id using the WordDict::Serial method.
00351     // If the <b>exists</b> argument is set to 1, the method 
00352     // WordDict::SerialExists is used instead, that is no serial is
00353     // assigned to the word if it does not already have one.
00354     // Before translation the word is normalized using the
00355     // WordType::Normalize method. The word is saved using the
00356     // WordReference::SetWord method.
00357     //
00358     virtual WordReference Word(const String& bufferin, int exists = 0) { NotImplemented(); return WordReference(0); }
00359     //-
00360     // Alias for Word(bufferin, 1).
00361     //
00362     virtual WordReference WordExists(const String& bufferin) { return Word(bufferin, 1); }
00363     
00364     //-
00365     // Accelerate bulk insertions in the inverted index. All 
00366     // insertion done with the <b>Override</b> method are batched
00367     // instead of being updating the inverted index immediately.
00368     // No update of the inverted index file is done before the
00369     // <b>BatchEnd</b> method is called.
00370     // 
00371     virtual void BatchStart();
00372     //- 
00373     // Terminate a bulk insertion started with a call to the
00374     // <b>BatchStart</b> method. When all insertions are done
00375     // the <b>AllRef</b> method is called to restore statistics.
00376     //
00377     virtual void BatchEnd();
00378 
00379 #ifndef SWIG
00380     //-
00381     // Return in <b>noccurrence</b> the number of occurrences of the
00382     // string contained in the <i>GetWord()</i> part of <b>key.</b>
00383     // Returns OK on success, NOTOK otherwise.
00384     //
00385     virtual int Noccurrence(const String& key, unsigned int& noccurrence) const { NotImplemented(); return NOTOK; }
00386 
00387     //
00388     // Input/Output
00389     //
00390     //-
00391     // Write on file descriptor <b>f</b> an ASCII description of the
00392     // index. Each line of the file contains a <i>WordReference</i>
00393     // ASCII description.
00394     // Return OK on success, NOTOK otherwise.
00395     //
00396     virtual int Write(FILE* f) { NotImplemented(); return NOTOK; }
00397     //-
00398     // Write on file descriptor <b>f</b> the complete dictionnary 
00399     // with statistics.
00400     // Return OK on success, NOTOK otherwise.
00401     //
00402     virtual int WriteDict(FILE* f) { NotImplemented(); return NOTOK; }
00403     //
00404     //-
00405     // Read <i>WordReference</i> ASCII descriptions from <b>f</b>,
00406     // returns the number of inserted WordReference or < 0 if an error
00407     // occurs. Invalid descriptions are ignored as well as empty
00408     // lines.
00409     //
00410     virtual int Read(FILE* f) { NotImplemented(); return NOTOK; }
00411 
00412 #endif /* SWIG */
00413     //
00414     // Retrieve WordReferences from the database. 
00415     // Backend of WordRefs, operator[], Prefix...
00416     //
00417     virtual List *Collect(const WordReference& word) { NotImplemented(); return 0; }
00418 #ifndef SWIG
00419     //
00420     // Compressor object accessors
00421     //
00422     inline WordDBCompress *GetCompressor() { return compressor; }
00423     inline void SetCompressor(WordDBCompress* compressor_arg) { compressor = compressor_arg; }
00424 
00425     inline void NotImplemented() const {
00426       fprintf(stderr, "WordList::NotImplemented\n");
00427       abort();
00428     }
00429 
00430     WordContext*                context;
00431 
00432     int                         isopen;
00433     int                         flags;
00434     String                      filename;
00435 
00436     //
00437     // If true enable extended functionalities of WordList such
00438     // as per-word statistics. Read from wordlist_extended configuration
00439     // parameter.
00440     //
00441     int                         extended;
00442 
00443 
00444     WordDBCompress             *compressor;
00445     int                         verbose;
00446 
00447     WordDBCaches*               caches;
00448 #endif /* SWIG */
00449 };
00450 
00451 #endif /* _WordList_h_ */

Generated on Sun Jun 8 10:56:40 2008 for GNUmifluz by  doxygen 1.5.5