WordListOne.cc

Go to the documentation of this file.
00001 //
00002 // WordListOne.cc
00003 //
00004 // Part of the ht://Dig package   <http://www.htdig.org/>
00005 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group
00006 // For copyright details, see the file COPYING in your distribution
00007 // or the GNU General Public License version 2 or later
00008 // <http://www.gnu.org/copyleft/gpl.html>
00009 //
00010 // $Id: WordListOne_8cc-source.html,v 1.1 2008/06/08 10:13:14 sebdiaz Exp $
00011 //
00012 
00013 #ifdef HAVE_CONFIG_H
00014 #include "config.h"
00015 #endif /* HAVE_CONFIG_H */
00016 
00017 #include "WordListOne.h"
00018 #include "WordReference.h"
00019 #include "WordRecord.h"
00020 #include "WordType.h"
00021 #include "WordContext.h"
00022 #include "Configuration.h"
00023 #include "htString.h"
00024 #include "HtTime.h"
00025 #include "WordDBCompress.h"
00026 #include "WordDBCache.h"
00027 #include "WordDead.h"
00028 #include "WordMeta.h"
00029 
00030 #include <stdio.h>
00031 #include <stdlib.h>
00032 #include <unistd.h>
00033 #include <ctype.h>
00034 #include <errno.h>
00035 
00036 // *****************************************************************************
00037 //
00038 WordListOne::WordListOne(WordContext* ncontext)
00039 {
00040   context = ncontext;
00041   db = new WordDB(ncontext->GetDBInfo());
00042   dict = new WordDict();
00043   dict->Initialize(this);
00044   meta = new WordMeta();
00045   meta->Initialize(this);
00046   dead = new WordDead();
00047   dead->Initialize(this);
00048 
00049   // The database itself hasn't been opened yet
00050   isopen = 0;
00051   Configuration& config = context->GetConfiguration();
00052   extended = config.Boolean("wordlist_extend");
00053   verbose =  config.Value("wordlist_verbose");
00054   compressor = 0;
00055   caches = 0;
00056   flags = 0;
00057 }
00058 
00059 // *****************************************************************************
00060 //
00061 WordListOne::~WordListOne()
00062 {
00063   BatchEnd();
00064   Close();
00065   delete dead;
00066   delete meta;
00067   delete dict;
00068   delete db;
00069 }
00070 
00071 static int word_db_qcmp(WordContext* context, const WordDBCacheEntry *a, const WordDBCacheEntry *b)
00072 {
00073   return WordKey::Compare(context, (const unsigned char*)a->key, a->key_size, (const unsigned char*)b->key, b->key_size);
00074 }
00075 
00076 // *****************************************************************************
00077 //
00078 int WordListOne::Open(const String& nfilename, int mode)
00079 {
00080   filename = nfilename;
00081 
00082   int usecompress = 0;
00083   Configuration& config = context->GetConfiguration();
00084 
00085   if(config.Boolean("wordlist_compress") == 1) {
00086     usecompress = DB_COMPRESS;
00087     WordDBCompress* compressor = new WordDBCompress(context);
00088     //      compressor->debug = config.Value("wordlist_compress_debug");
00089     SetCompressor(compressor);
00090 
00091     context->GetDBInfo().dbenv->mp_cmpr_info = compressor->CmprInfo();
00092     context->GetDBInfo().dbenv->flags |= DB_ENV_CMPR;
00093   }
00094 
00095   flags = (mode & O_RDWR) ? DB_CREATE : DB_RDONLY;
00096   flags |= usecompress;
00097   if(mode & O_TRUNC) {
00098     if(mode & O_RDWR) {
00099       unlink((char*)filename);
00100     } else
00101       fprintf(stderr, "WordListOne::Open: O_TRUNC | O_RDONLY is meaningless\n");
00102   }
00103 
00104   WordLock* lock;
00105   Meta()->Lock("open", lock);
00106 
00107   db->set_bt_compare(word_db_cmp, (void*)context);
00108 
00109   if(config.Boolean("wordlist_cache_inserts", 0)) {
00110     int size = config.Value("wordlist_cache_size", 0);
00111     if(size / 2 < WORD_DB_CACHE_MINIMUM)
00112       size = 0;
00113     else
00114       size /= 2;
00115 
00116     db->CacheOn(context, size);
00117     db->CacheCompare(word_db_qcmp);
00118   }
00119 
00120   db->set_pagesize(Pagesize());
00121 
00122   int ret = db->Open(filename, "index", DB_BTREE, flags, 0666, WORD_DB_INDEX) == 0 ? OK : NOTOK;
00123   if(ret == NOTOK) return ret;
00124   if(dict->Open() != OK) return NOTOK;
00125   if(meta->Open() != OK) return NOTOK;
00126   if(dead->Open() != OK) return NOTOK;
00127 
00128   isopen = 1;
00129 
00130   Meta()->Unlock("open", lock);
00131 
00132   return ret;
00133 }
00134 
00135 // *****************************************************************************
00136 //
00137 int WordListOne::Close()
00138 {
00139   if(isopen) {
00140     if(db->Close() != 0) return NOTOK;
00141     if(dict->Close() != 0) return NOTOK;
00142     if(meta->Close() != 0) return NOTOK;
00143     if(dead->Close() != 0) return NOTOK;
00144     isopen = 0;
00145   }
00146 
00147   {
00148     WordDBCompress* compressor = GetCompressor();
00149     if(compressor) {
00150       delete compressor;
00151       SetCompressor(0);
00152     }
00153     delete context->GetDBInfo().dbenv->mp_cmpr_info;
00154     context->GetDBInfo().dbenv->mp_cmpr_info = 0;
00155     context->GetDBInfo().dbenv->flags &= ~DB_ENV_CMPR;
00156   }
00157 
00158   return OK;
00159 }
00160 
00161 // ****************************************************************************
00162 //
00163 unsigned int WordListOne::Size() const 
00164 {
00165   return db->Size();
00166 }
00167 
00168 // ****************************************************************************
00169 //
00170 int WordListOne::Override(const WordReference& arg)
00171 {
00172   if (arg.GetWord().length() == 0) {
00173     fprintf(stderr, "WordListOne::Override(%s) word is zero length\n", (char*)arg.Get());
00174     return NOTOK;
00175   }
00176   if (!arg.Key().Filled()) {
00177     fprintf(stderr, "WordListOne::Override(%s) key is not fully defined\n", (char*)arg.Get());
00178     return NOTOK;
00179   }
00180 
00181   WordType& wtype = context->GetType();
00182   WordReference wordRef(arg);
00183   String        word = wordRef.GetWord();
00184   if(wtype.Normalize(word) & WORD_NORMALIZE_NOTOK)
00185     return NOTOK;
00186   wordRef.SetWord(word);
00187   unsigned int wordid = 0;
00188   if(dict->SerialRef(word, wordid) != OK) return NOTOK;
00189   wordRef.Key().Set(WORD_KEY_WORD, wordid);
00190 
00191   int ret = NOTOK;
00192 
00193   if(caches) {
00194     String key;
00195     String record;
00196     if(wordRef.Pack(key, record) != OK)
00197       return NOTOK;
00198     ret = caches->Add(key.get(), key.length(), record.get(), record.length()) == 0 ? OK : NOTOK;
00199     if(caches->Full()) caches->Merge(*db);
00200   } else {
00201     ret = db->Put(wordRef, 0) == 0 ? OK : NOTOK;
00202   }
00203 
00204   return ret;
00205 }
00206 
00207 
00208 // *****************************************************************************
00209 //
00210 List *WordListOne::operator [] (const WordReference& wordRef)
00211 {
00212   return Collect(wordRef);
00213 }
00214 
00215 // *****************************************************************************
00216 //
00217 List *WordListOne::Prefix (const WordReference& prefix)
00218 {
00219   List* result = new List();
00220   WordDictCursor* cursor = Dict()->CursorPrefix(prefix.GetWord());
00221   String word;
00222   WordDictRecord record;
00223   WordReference prefix2(prefix);
00224   while(Dict()->NextPrefix(cursor, word, record) == 0) {
00225     prefix2.Key().Set(WORD_KEY_WORD, record.Id());
00226     List* tmp_result = Collect(prefix2);
00227     while(tmp_result->Count() > 0) {
00228       WordReference* entry = (WordReference*)tmp_result->Shift(LIST_REMOVE_RELEASE);
00229       entry->SetWord(word);
00230       result->Push(entry);
00231     }
00232     delete tmp_result;
00233   }
00234   return result;
00235 }
00236 
00237 // *****************************************************************************
00238 //
00239 List *WordListOne::WordRefs()
00240 {
00241   return Collect(WordReference(context));
00242 }
00243 
00244 // *****************************************************************************
00245 //
00246 List *WordListOne::Collect(const WordReference& wordRef)
00247 {
00248   WordCursor *search = Cursor(wordRef.Key(), HTDIG_WORDLIST_COLLECTOR);
00249   if(search->Walk() != OK) return 0;
00250   List* result = search->GetResults();
00251   delete search;
00252   return result;
00253 }
00254 
00255 // *****************************************************************************
00256 //
00257 int 
00258 WordListOne::Read(FILE* f)
00259 {
00260   WordReference wordRef(context);
00261 #define WORD_BUFFER_SIZE        1024
00262   char buffer[WORD_BUFFER_SIZE + 1];
00263   String line;
00264   int line_number = 0;
00265   int inserted = 0;
00266 
00267   BatchStart();
00268 
00269   String key;
00270   String record;
00271 
00272   while(fgets(buffer, WORD_BUFFER_SIZE, f)) {
00273     line_number++;
00274     int buffer_length = strlen(buffer);
00275     int eol = buffer[buffer_length - 1] == '\n';
00276 
00277     if(eol) buffer[--buffer_length] = '\0';
00278     
00279     line.append(buffer, buffer_length);
00280     //
00281     // Join big lines
00282     //
00283     if(!eol) continue;
00284     //
00285     // If line ends with a \ continue
00286     //
00287     if(line.last() == '\\') {
00288       line.chop(1);
00289       continue;
00290     }
00291       
00292     if(!line.empty()) {
00293       StringList fields(line, "\t ");
00294 
00295       //
00296       // Convert the word to a wordid
00297       //
00298       String* word = (String*)fields.Get_First();
00299       unsigned int wordid;
00300       if(dict->SerialRef(*word, wordid) != OK) return NOTOK;
00301       word->trunc();
00302       (*word) << wordid;
00303 
00304       if(wordRef.SetList(fields) != OK) {
00305         fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line);
00306         fprintf(stderr, " cannot build WordReference (ignored)\n");
00307       } else {
00308         if(wordRef.Pack(key, record) != OK) {
00309           fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line);
00310           fprintf(stderr, " pack failed (ignored)\n");
00311         } else {
00312           caches->Add(key.get(), key.length(), record.get(), record.length());
00313           inserted++;
00314         }
00315         if(verbose && (inserted % 10000 == 0)) fprintf(stderr, "WordList::Read: inserted %d entries\n", inserted);
00316         if(verbose > 1) fprintf(stderr, "WordList::Read: inserting %s\n", (char*)wordRef.Get());
00317       }
00318 
00319       line.trunc();
00320     }
00321   }
00322 
00323   BatchEnd();
00324 
00325   return inserted;
00326 }
00327 
00328 // *****************************************************************************
00329 //
00330 // streaming operators for ascii dumping and reading a list
00331 class FileOutData : public Object
00332 {
00333 public:
00334   FILE* f;
00335   String word;
00336   FileOutData(FILE* f_arg) : f(f_arg) { }
00337 };
00338 
00339 // *****************************************************************************
00340 //
00341 static int
00342 wordlist_walk_callback_file_out(WordList *, WordDBCursor& , const WordReference *wordRef, Object &ndata)
00343 {
00344   FileOutData& data = (FileOutData&)ndata;
00345   ((WordReference*)wordRef)->SetWord(data.word);
00346   fprintf(data.f, "%s\n", (char*)wordRef->Get());
00347   return OK;
00348 }
00349 
00350 int WordListOne::Write(FILE* f)
00351 {
00352   FileOutData data(f);
00353   WordDictCursor* cursor = dict->Cursor();
00354   int ret;
00355   String word;
00356   WordDictRecord wordinfo;
00357   while((ret = dict->Next(cursor, word, wordinfo)) == 0) {
00358     WordKey key(context);
00359     key.Set(WORD_KEY_WORD, wordinfo.Id());
00360     data.word = word;
00361     WordCursor *search = Cursor(key, wordlist_walk_callback_file_out, (Object *)&data);
00362     search->Walk();
00363     delete search;
00364   }
00365   return ret == DB_NOTFOUND ? OK : NOTOK;
00366 }
00367 
00368 
00369 // *****************************************************************************
00370 //
00371 // Callback data dedicated to Dump and dump_word communication
00372 //
00373 class DeleteWordData : public Object
00374 {
00375 public:
00376   DeleteWordData() { count = 0; }
00377 
00378   int count;
00379 };
00380 
00381 // *****************************************************************************
00382 //
00383 //
00384 static int delete_word(WordList *words, WordDBCursor &cursor, const WordReference *word, Object &data)
00385 {
00386   WordListOne *words_one = (WordListOne*)words;
00387   if(words_one->DeleteCursor(cursor) == 0) {
00388     ((DeleteWordData&)data).count++;
00389     return OK;
00390   } else {
00391     fprintf(stderr, "WordList delete_word: deleting %s failed\n", (char*)word->Get());
00392     return NOTOK;
00393   }
00394 }
00395 
00396 // *****************************************************************************
00397 // 
00398 // Delete all records matching wordRef, return the number of 
00399 // deleted records.
00400 //
00401 int WordListOne::WalkDelete(const WordReference& wordRef)
00402 {
00403   DeleteWordData data;
00404   WordKey key = wordRef.Key();
00405 
00406   if(key.IsDefined(WORD_KEY_WORD)) {
00407     WordCursor *description = Cursor(key, delete_word, &data);
00408     description->Walk();
00409     delete description;
00410     dict->Decr(wordRef.GetWord(), data.count);
00411   } else {
00412     WordDictCursor* cursor = dict->Cursor();
00413     int ret;
00414     String word;
00415     WordDictRecord wordinfo;
00416     int total = 0;
00417     while((ret = dict->Next(cursor, word, wordinfo)) == 0) {
00418       key.Set(WORD_KEY_WORD, wordinfo.Id());
00419       WordCursor *search = Cursor(key, delete_word, &data);
00420       search->Walk();
00421       delete search;
00422       dict->Decr(word, data.count);
00423       total += data.count;
00424       data.count = 0;
00425     }
00426     data.count = total;
00427   }
00428   return data.count;
00429 }
00430 
00431 // *****************************************************************************
00432 //
00433 // Returns the reference count for word in <count> arg
00434 //
00435 int WordListOne::Noccurrence(const String& word, unsigned int& noccurrence) const
00436 {
00437   return dict->Noccurrence(word, noccurrence);
00438 }
00439 
00440 WordKey WordListOne::Key(const String& bufferin)
00441 {
00442   WordKey key(context);
00443   StringList fields(bufferin, "\t ");
00444   String* field = (String*)fields.Get_First();
00445   unsigned int wordid;
00446   Dict()->Serial(*field, wordid);
00447   field->trunc();
00448   (*field) << wordid;
00449   key.SetList(fields);
00450   return key;
00451 }
00452 
00453 WordReference WordListOne::Word(const String& bufferin, int exists /* = 0 */)
00454 {
00455   WordReference wordRef(context);
00456   StringList fields(bufferin, "\t ");
00457   String* field = (String*)fields.Get_First();
00458   if(context->GetType().Normalize(*field) & WORD_NORMALIZE_NOTOK) {
00459     //
00460     // If the goal is to build a WordReference object that may not be
00461     // in the index, canonicalization failure is not a problem.
00462     //
00463     if(!exists)
00464       fprintf(stderr, "WordListOne::Word: cannot normalize word %s\n", (char*)*field);
00465   }
00466   String word = *field;
00467   unsigned int wordid;
00468   if(exists)
00469     Dict()->SerialExists(word, wordid);
00470   else
00471     Dict()->Serial(word, wordid);
00472   field->trunc();
00473   (*field) << wordid;
00474   wordRef.SetList(fields);
00475   wordRef.SetWord(word);
00476   return wordRef;
00477 }
00478 
00479 void
00480 WordListOne::BatchEnd()
00481 {
00482   if(caches) {
00483     caches->Merge(*db);
00484     WordList::BatchEnd();
00485   }
00486 }

Generated on Sun Jun 8 10:56:40 2008 for GNUmifluz by  doxygen 1.5.5