WordListMulti.h

Go to the documentation of this file.
00001 //
00002 // WordListMulti.h
00003 //
00004 // NAME
00005 // 
00006 // manage and use an inverted index file.
00007 //
00008 // SYNOPSIS
00009 // 
00010 // #include <mifluz.h>
00011 // 
00012 // Configuration* config;
00013 // WordReference wordRef;
00014 // ...
00015 // WordList* words = new WordList(config);
00016 // 
00017 // delete words;
00018 // 
00019 // DESCRIPTION
00020 // 
00021 // WordList is the <i>mifluz</i> equivalent of a database handler. Each
00022 // WordList object is bound to an inverted index file and implements the
00023 // operations to create it, fill it with word occurrences and search 
00024 // for an entry matching a given criterion.
00025 // 
00026 // CONFIGURATION
00027 // 
00028 // wordlist_extend {true|false} (default false)
00029 //   If <b>true</b> maintain reference count of unique 
00030 //   words. The <b>Noccurrence</b> method gives access to this count.
00031 // 
00032 // wordlist_verbose <number> (default 0)
00033 //   Set the verbosity level of the WordList class. 
00034 //   <br>
00035 //   1 walk logic
00036 //   <br>
00037 //   2 walk logic details
00038 //   <br>
00039 //   3 walk logic lots of details
00040 // 
00041 // wordlist_page_size <bytes> (default 8192)
00042 //   Berkeley DB page size (see Berkeley DB documentation)
00043 // 
00044 // wordlist_cache_size <bytes> (default 500K)
00045 //   Berkeley DB cache size (see Berkeley DB documentation)
00046 //   Cache makes a huge difference in performance. It must be at least 2%
00047 //   of the expected total data size. Note that if compression is activated
00048 //   the data size is eight times larger than the actual file size. In this
00049 //   case the cache must be scaled to 2% of the data size, not 2% 
00050 //   of the file size. See <b>Cache tuning</b> in the mifluz guide for
00051 //   more hints.
00052 // 
00053 // wordlist_compress {true|false} (default false)
00054 //   Activate compression of the index. The resulting index is eight times
00055 //   smaller than the uncompressed index.
00056 // 
00057 //
00058 // END
00059 //
00060 // Part of the ht://Dig package   <http://www.htdig.org/>
00061 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group
00062 // For copyright details, see the file COPYING in your distribution
00063 // or the GNU General Public License version 2 or later
00064 // <http://www.gnu.org/copyleft/gpl.html>
00065 //
00066 // $Id: WordListMulti_8h-source.html,v 1.1 2008/06/08 10:13:14 sebdiaz Exp $
00067 //
00068 
00069 #ifndef _WordListMulti_h_
00070 #define _WordListMulti_h_
00071 
00072 #include <fcntl.h>
00073 #include <stdio.h>
00074 
00075 #ifndef SWIG
00076 #include "WordList.h"
00077 #include "WordCursorOne.h"
00078 //#include "WordCursorMulti.h"
00079 #endif /* SWIG */
00080 
00081 class WordContext;
00082 
00083 // Inverted index interface. WordListMulti derives from WordList and
00084 // declares the index operations below as virtual methods; the Cursor
00085 // factories defined inline return WordCursorOne objects.
00086 class WordListMulti : public WordList
00087 {
00088  public:
00089     //-
00090     // Constructor. Build inverted index handling object using
00091     // run time configuration parameters listed in the <b>CONFIGURATION</b>
00092     // section.
00093     //
00094     WordListMulti(WordContext* ncontext);
00095     virtual ~WordListMulti();
00096 
00097 #ifndef SWIG
00098     virtual int Override(const WordReference& wordRef); // NOTE(review): undocumented in this header; semantics live in the implementation - confirm
00099 #endif /* SWIG */
00100 
00101     //-
00102     // Returns OK if <b>wordRef</b> exists in the index, NOTOK otherwise.
00103     //
00104     virtual int Exists(const WordReference& wordRef);
00105 
00106     //
00107     // Delete permanently
00108     //
00109     //-
00110     // Delete all entries in the index whose key matches the 
00111     // <i>Key()</i> part of <b>wordRef</b>, using the <i>Walk</i>
00112     // method.
00113     // Returns the number of entries successfully deleted.
00114     //
00115     virtual int WalkDelete(const WordReference& wordRef);
00116     //-
00117     // Delete the entry in the index that exactly matches the
00118     // <i>Key()</i> part of <b>wordRef.</b>
00119     // Returns OK if deletion is successful, NOTOK otherwise.
00120     //
00121     virtual int Delete(const WordReference& wordRef);
00122 
00123     //-
00124     // Open inverted index <b>filename.</b> <b>mode</b>
00125     // may be <i>O_RDONLY</i> or <i>O_RDWR.</i> If mode is 
00126     // <i>O_RDWR</i> it can be or'ed with <i>O_TRUNC</i> to reset
00127     // the content of an existing inverted index.
00128     // Return OK on success, NOTOK otherwise.
00129     //
00130     virtual int Open(const String& filename, int mode);
00131     //-
00132     // Close inverted index.
00133     // Return OK on success, NOTOK otherwise.
00134     // 
00135     virtual int Close();
00136     //-
00137     // Return the size of the index in pages.
00138     //
00139     virtual unsigned int Size() const;
00140     int AddIndex(); // NOTE(review): non-virtual and undocumented; purpose not visible in this header - confirm
00141     int Merge(); // NOTE(review): non-virtual and undocumented; purpose not visible in this header - confirm
00142 
00143     //-
00144     // Alias to the <b>Find</b> method.
00145     //
00146     virtual List *operator [] (const WordReference& wordRef);
00147     //-
00148     // Returns the list of word occurrences matching the <i>Key()</i>
00149     // part of <b>wordRef.</b> In the <i>Key()</i>, the string
00150     // (accessed with <i>GetWord()</i>) matches any string that begins
00151     // with it. The <i>List</i> returned contains pointers to
00152     // <i>WordReference</i> objects. It is the responsibility of the
00153     // caller to free the list.
00154     //
00155     virtual List *Prefix (const WordReference& prefix);
00156 
00157     //
00158     // Iterate over the complete database.
00159     //
00160 #ifndef SWIG
00161     //- 
00162     // Returns a list of all unique words contained in the inverted
00163     // index. The <i>List</i> returned contains pointers to
00164     // <i>String</i> objects. It is the responsibility of the caller
00165     // to free the list. See List.h header for usage.
00166     //
00167     virtual List *Words();
00168 #endif /* SWIG */
00169     //- 
00170     // Returns a list of all entries contained in the
00171     // inverted index. The <i>List</i> returned contains pointers to
00172     // <i>WordReference</i> objects. It is the responsibility of
00173     // the caller to free the list. See List.h header for usage.
00174     //
00175     virtual List *WordRefs();
00176 
00177 #ifndef SWIG
00178     //-
00179     // Create a cursor that searches all the occurrences in the
00180     // inverted index and calls <b>callback</b> with
00181     // <b>callback_data</b> for every match.
00182     // The cursor returned is a WordCursorOne allocated with new.
00183     virtual inline WordCursor *Cursor(wordlist_walk_callback_t callback, Object *callback_data) { return new WordCursorOne(this, callback, callback_data); }
00184 #endif /* SWIG */
00185     //- 
00186     // Create a cursor that searches all the occurrences in the
00187     // inverted index and that match <b>searchKey.</b> If
00188     // <b>action</b> is set to HTDIG_WORDLIST_WALKER calls
00189     // <b>searchKey.callback</b> with <b>searchKey.callback_data</b>
00190     // for every match. If <b>action</b> is set to
00191     // HTDIG_WORDLIST_COLLECT push each match in <b>searchKey.collectRes</b>
00192     // data member as a <b>WordReference</b> object. It is the responsibility
00193     // of the caller to free the <b>searchKey.collectRes</b> list.
00194     // <b>action</b> defaults to HTDIG_WORDLIST_WALKER. The cursor returned is a WordCursorOne allocated with new.
00195     virtual inline WordCursor *Cursor(const WordKey &searchKey, int action = HTDIG_WORDLIST_WALKER) { return new WordCursorOne(this, searchKey, action); }
00196 #ifndef SWIG
00197     //-
00198     // Create a cursor that searches all the occurrences in the
00199     // inverted index and that match <b>searchKey</b> and calls
00200     // <b>callback</b> with <b>callback_data</b> for every match.
00201     // The cursor returned is a WordCursorOne allocated with new.
00202     virtual inline WordCursor *Cursor(const WordKey &searchKey, wordlist_walk_callback_t callback, Object * callback_data) { return new WordCursorOne(this, searchKey, callback, callback_data); }
00203 #endif /* SWIG */
00204 
00205     //
00206     // Update/get global word statistics
00207     //
00208     //-
00209     // Add one to the reference count for the string contained
00210     // in the <i>Key().GetWord()</i> part of <b>wordRef.</b>
00211     // Returns OK on success, NOTOK otherwise.
00212     //
00213     virtual int Ref(const WordReference& wordRef);
00214     //-
00215     // Subtract one from the reference count for the string contained
00216     // in the <i>Key().GetWord()</i> part of <b>wordRef.</b>
00217     // Returns OK on success, NOTOK otherwise.
00218     //
00219     virtual int Unref(const WordReference& wordRef);
00220     virtual int AllRef(); // NOTE(review): undocumented; purpose not visible in this header - confirm
00221 
00222 #ifndef SWIG
00223     //-
00224     // Return in <b>noccurrence</b> the number of occurrences of the
00225     // string contained in the <i>GetWord()</i> part of <b>key.</b>
00226     // Returns OK on success, NOTOK otherwise.
00227     //
00228     virtual int Noccurrence(const String& key, unsigned int& noccurrence) const;
00229     virtual int Write(FILE* f) { return NOTOK; } // Inline stub: ignores f and always returns NOTOK.
00230     virtual int Read(FILE* f) { return NOTOK; } // Inline stub: ignores f and always returns NOTOK.
00231 
00232     virtual WordKey Key(const String& bufferin) { abort(); return WordKey(0); } // Inline stub: calls abort() before the return statement.
00233 
00234     virtual WordReference Word(const String& bufferin, int exists = 0) { abort(); return WordReference(0); } // Inline stub: calls abort() before the return statement.
00235 
00236 #endif /* SWIG */
00237     //
00238     // Retrieve WordReferences from the database. 
00239     // Backend of WordRefs, operator[], Prefix...
00240     //
00241     virtual List *Collect(const WordReference& word);
00242 #ifndef SWIG
00243     List*               dbs; // NOTE(review): public data members below are undocumented; meaning not visible in this header - confirm
00244     int                 serial;
00245     int                 file_max;
00246     int                 file_min;
00247     unsigned int        put_max;
00248 #endif /* SWIG */
00249 };
00250 
00251 #endif /* _WordListMulti_h_ */
00252 

Generated on Sun Jun 8 10:56:40 2008 for GNUmifluz by  doxygen 1.5.5