GNU mifluz - Tutorial

Usage of GNU mifluz

The purpose of mifluz is to provide a C++ library to store a full-text inverted index. To put it briefly, it allows storage of occurrences of words in such a way that they can later be searched. The basic idea of an inverted index is to associate each unique word with the list of documents in which it appears. This list can then be searched to locate the documents containing a specific word.

The indexer

The first step is to write a text-indexing algorithm that counts the words of a text. To begin, we need to define a data model and the character-filtering rules.

The data structure

// One row of the word-count table: a word, how many times it was seen,
// and the file it was read from.
struct wordCounting{
 // Start the counter at zero so that a default-initialized instance
 // ("wordCounting w;") never carries an indeterminate count.
 wordCounting() : nb(0) {}

 string word;      // the filtered word
 int nb;           // number of occurrences counted so far
 string filename;  // name of the file the word was found in
};

The characters filter

// Normalizes a single byte of input text.
//
// - Letters are lower-cased with tolower().
// - Accented vowels, given as ISO-8859-1 (Latin-1) byte values, are folded
//   to their unaccented ASCII letter. Both cases are listed explicitly so
//   the result does not depend on the current locale.
// - Punctuation is replaced by a space so that it acts as a word separator.
// - Any other byte is returned lower-cased but otherwise unchanged.
//
// The byte values are written numerically rather than as accented character
// literals: in a UTF-8 source file such literals are multi-character
// constants and can never compare equal to a single char.
char charFilter (char pin)
{
    // tolower() requires a value representable as unsigned char (or EOF);
    // passing a negative char is undefined behaviour.
    const unsigned char lowered =
        static_cast<unsigned char>(tolower(static_cast<unsigned char>(pin)));

    switch (lowered)
    {
        // e with grave, acute, circumflex, diaeresis (upper and lower case)
        case 0xC8: case 0xC9: case 0xCA: case 0xCB:
        case 0xE8: case 0xE9: case 0xEA: case 0xEB:
            return 'e';

        // a with grave, circumflex, diaeresis
        case 0xC0: case 0xC2: case 0xC4:
        case 0xE0: case 0xE2: case 0xE4:
            return 'a';

        // i with circumflex, diaeresis
        case 0xCE: case 0xCF:
        case 0xEE: case 0xEF:
            return 'i';

        // o with circumflex, diaeresis
        case 0xD4: case 0xD6:
        case 0xF4: case 0xF6:
            return 'o';

        // u with grave, circumflex, diaeresis
        case 0xD9: case 0xDB: case 0xDC:
        case 0xF9: case 0xFB: case 0xFC:
            return 'u';

        // punctuation becomes a word separator
        case ',': case '"': case '\'': case ';':
        case '!': case '?': case '$':  case '&':
        case ')': case '(': case '.':  case ':':
        case '-': case '_': case ']':  case '[':
        case '/':
            return ' ';

        default:
            return static_cast<char>(lowered);
    }
}

The word filter

// Returns a copy of pin in which every character has been passed
// through charFilter; pin itself is left untouched.
string stringFilter (string &pin)
{
    string filtered;
    filtered.reserve(pin.length());
    for (string::const_iterator cur = pin.begin(); cur != pin.end(); ++cur)
        filtered.push_back(charFilter(*cur));
    return filtered;
}

The text parser

map<string,wordCounting *> parseWords (string &pFile)
{
    map<string,wordCounting *> list;
    char c;
    ifstream is;
    is.open (pFile.c_str());    // open file
    string wordstream;
    map<string,wordCounting *>::iterator it;
    while (is.good())     // loop while extraction from file is possible
    {    is.get(c);       // get character from file
        c=charFilter(c);
        if (c==' '||c=='\n' || c=='\t' || c=='\0')
        {    if (!wordstream.empty())
            {
                wordCounting *res;
                it=list.find(wordstream);
                if( it == list.end() ) {
                    res = new wordCounting();
                    res->word=wordstream;
                    res->nb=0;
                    res->filename=pFile;
                    list.insert( make_pair( wordstream, res ) );
                }
                else
                    res=it->second;
                res->nb++;
            }
            wordstream="";
        }
        else
        {
            wordstream+=c;
        }
    }
    is.close();
    return list;
}

The mifluz Usage

After filtering the text and building a word list, we can begin to use the mifluz engine.

Save in mifluz

mifluz needs a data model describing the data it stores. The first element of the data model is the word itself: it is saved in the dictionary and is the word that will be searched for. To this word we can attach additional parameters, such as a ranking and a file id.

// Configuration defaults for the example index.
static ConfigDefaults defaultsInd[] = {
   // Key layout: the word, then two numeric fields. In this tutorial
   // addList stores the occurrence count in Rank and the file id in Location.
   // NOTE(review): the numbers (30/15/15) are presumably field sizes --
   // confirm against the mifluz documentation.
   { "wordlist_wordkey_description","Word 30/Rank 15/Location 15"},
  // NOTE(review): presumably the end-of-table marker -- confirm.
  { 0 }
};

Converting the index data model to the mifluz model

// Maps a numeric file id to the name of the file it stands for.
static map<int,string> file_list;

// Returns the id under which filename is registered in file_list,
// registering it under the next free id (size + 1) if it is not there yet.
static int fileIdFor (const string &filename)
{
    for (map<int,string>::iterator known = file_list.begin();
         known != file_list.end(); ++known)
    {
        if (known->second == filename)
            return known->first;
    }
    const int freshId = file_list.size()+1;
    file_list.insert(make_pair(freshId, filename));
    return freshId;
}

// Stores every entry of list in words. Each entry is written as the text
// "<word> <count> <file id>", turned into a word reference with
// words->Word() and stored with words->Override().
void addList(map<string,wordCounting *> &list,WordList *words)
{
    map<string,wordCounting *>::iterator entry = list.begin();
    while (entry != list.end())
    {
        const wordCounting *counted = entry->second;
        const int idFile = fileIdFor(counted->filename);

        stringstream line;
        line << counted->word << " " << counted->nb << " " << idFile;
        words->Override(words->Word(line.str().c_str()));

        ++entry;
    }
}

The mifluz environment

// Opens the word database "words2.db" from the given context, parses
// "file1.test" into a word-count list, and closes the database again.
// The place where the searches go is marked below.
void action(WordContext* context)
{
  WordList *words = context->List();
  words->Open("words2.db", O_RDWR|O_TRUNC);
  string file = "file1.test";
  map<string,wordCounting *> list=parseWords (file);
  //..... the searches can be performed here
  words->Close();
  delete words;

  // parseWords allocates every entry with new and nothing else frees them,
  // so release them here to avoid leaking the whole list.
  for (map<string,wordCounting *>::iterator it = list.begin(); it != list.end(); ++it)
    delete it->second;
  list.clear();
}

The search

// Look up every stored occurrence of the word "which" and print, for each
// match, the file it belongs to and the stored word count.
// NOTE(review): ownership of the returned list is not visible here --
// confirm whether results must be deleted after use.
List *results = words->FindWord("which");
  WordReference *match;
  for(results->Start_Get(); (match = (WordReference*)results->Get_Next());) {
    // Key field 2 holds the file id and field 1 the count, in the order
    // addList wrote them.
    map<int,string>::iterator iteror = file_list.find(match->Key().Get(2));
    // Skip matches whose file id is not registered instead of
    // dereferencing the end iterator.
    if (iteror == file_list.end())
      continue;
    cout <<"on File ="<<iteror->second<<" with nb words="<<match->Key().Get(1) <<endl;
  }

The complete source code

http://cvs.savannah.gnu.org/viewvc/mifluz/examples/example1.cc?root=mifluz&view=markup.

Report a Bug

If you think you have found a bug in GNU mifluz, then please send as complete a report as possible to http://savannah.gnu.org/bugs/?group=mifluz.

License

GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007

Copyright (C) 2007 Free Software Foundation, Inc.

Maintainer

GNU mifluz is currently being maintained by mailto:sebastien.diaz@gmail.com.