WordType.h

Go to the documentation of this file.
00001 //
00002 // WordType.h
00003 //
00004 // NAME
00005 // defines a word in terms of allowed characters, length etc.
00006 //
00007 // SYNOPSIS
00008 //
00009 // Only called thru WordContext::Initialize()
00010 // 
00011 // DESCRIPTION
00012 // 
00013 // WordType defines an indexed word and operations to validate
00014 // a word to be indexed. All words inserted into the <i>mifluz</i> index
00015 // are <b>Normalize</b>d before insertion. The configuration options
00016 // give some control over the definition of a word.
00017 //
00018 // CONFIGURATION
00019 // 
00020 // wordlist_locale <locale> (default C)
00021 //   Set the locale of the program to <b>locale</b>. See setlocale(3)
00022 //   for more information.
00023 //
00024 // wordlist_allow_numbers {true|false} (default false)
00025 //   If <b>true</b> a word may contain digits. If <b>false</b> digits
00026 //   are not considered to be part of a word and an attempt to insert
00027 //   a word containing digits will result in an error. 
00028 //   See the <b>Normalize</b> method for more information.
00029 // 
00030 // wordlist_minimum_word_length <number> (default 3)
00031 //   The minimum length of a word.
00032 //   See the <b>Normalize</b> method for more information.
00033 //
00034 // wordlist_maximum_word_length <number> (default 25)
00035 //   The maximum length of a word.
00036 //   See the <b>Normalize</b> method for more information.
00037 //
00038 // wordlist_allow_numbers {true|false} (default false)
00039 //   A digit is considered a valid character within a word if
00040 //   this configuration parameter is set to <i>true</i> otherwise
00041 //   it is an error to insert a word containing digits.
00042 //   See the <b>Normalize</b> method for more information.
00043 //
00044 // wordlist_truncate {true|false} (default true)
00045 //   If a word is too long according to
00046 //   the <i>wordlist_maximum_word_length</i> it is truncated
00047 //   if this configuration parameter is <i>true</i> otherwise it
00048 //   is considered an invalid word.
00049 //
00050 // wordlist_lowercase {true|false} (default true)
00051 //   If a word contains upper case letters it is converted to lowercase
00052 //   if this configuration parameter is true, otherwise it is left
00053 //   untouched.
00054 //
00055 // wordlist_valid_punctuation [characters] (default none)
00056 //   A list of punctuation characters that may appear in a word. 
00057 //   These characters will be removed from the word before insertion
00058 //   in the index.
00059 //
00060 // wordlist_extra_word_characters [characters] (default none)
00061 //   A list of characters that may appear in a word. These characters
00062 //   are left untouched.
00063 //
00064 // END
00065 //
00066 // Part of the ht://Dig package   <http://www.htdig.org/>
00067 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group
00068 // For copyright details, see the file COPYING in your distribution
00069 // or the GNU General Public License version 2 or later 
00070 // <http://www.gnu.org/copyleft/gpl.html>
00071 //
00072 // $Id: WordType_8h-source.html,v 1.1 2008/06/08 10:13:25 sebdiaz Exp $
00073 //
00074 
00075 #ifndef _WordType_h
00076 #define _WordType_h
00077 
00078 #include "htString.h"
00079 #include "Configuration.h"
00080 
00081 //
00082 // Bit flags returned by Normalize; to get them in string form use NormalizeStatus
00083 //
00084 #define WORD_NORMALIZE_TOOLONG          0x0001  // longer than wordlist_maximum_word_length
00085 #define WORD_NORMALIZE_TOOSHORT         0x0002  // shorter than wordlist_minimum_word_length
00086 #define WORD_NORMALIZE_CAPITAL          0x0004  // had capital letters, converted to lowercase
00087 #define WORD_NORMALIZE_NUMBER           0x0008  // has digits while wordlist_allow_numbers is false
00088 #define WORD_NORMALIZE_CONTROL          0x0010  // contains control characters
00089 #define WORD_NORMALIZE_BAD              0x0020  // listed in the wordlist_bad_word_list file
00090 #define WORD_NORMALIZE_NULL             0x0040  // zero length string
00091 #define WORD_NORMALIZE_PUNCTUATION      0x0080  // a wordlist_valid_punctuation character was removed
00092 #define WORD_NORMALIZE_NOALPHA          0x0100  // no alphanumerical character at all
00093 #define WORD_NORMALIZE_OK               0x4000  // word accepted: it will be inserted in the index
00094 #define WORD_NORMALIZE_NOTOK            0x8000  // word rejected: it will not be inserted
00095 
00096 class WordType
00097 {
00098 public:
00099   // Word definition used by the index: classifies characters and
00100   // normalizes candidate words. Built from a Configuration; presumably
00101   // the constructor reads the wordlist_* options (defined in the .cc file).
00102   WordType(const Configuration& config);
00103 
00104   //
00105   // Predicates: each tests bits of chrtypes[(unsigned char)c] and
00106   // returns 1 when the character belongs to the class, 0 otherwise.
00107   int IsChar(int c) const;
00108   int IsStrictChar(int c) const;
00109   int IsDigit(int c) const;
00110   int IsControl(int c) const;
00111 
00112   //
00113   // Transformations: calls s.remove(valid_punctuation) and returns
00114   // whatever that call returns.
00115   int StripPunctuation(String &s) const;
00116 
00117   //-
00118   // Normalize a word according to configuration specifications and 
00119   // builtin transformations. <b>Every</b> word inserted in the inverted
00120   // index goes through this function. If
00121   // a word is rejected (return value has WORD_NORMALIZE_NOTOK bit set) it will not 
00122   // be inserted in the index. If a word is accepted (return value has 
00123   // WORD_NORMALIZE_OK bit set) it will be inserted in the index. In
00124   // addition to these two bits, informational values are stored that
00125   // give information on the processing done on the word.
00126   // The bit field values and their meanings are
00127   // as follows:
00128   //
00129   // <dl>
00130   // <dt>WORD_NORMALIZE_TOOLONG
00131   // <dd>the word length exceeds the value of 
00132   //     the <i>wordlist_maximum_word_length</i> configuration parameter.
00133   // <dt>WORD_NORMALIZE_TOOSHORT
00134   // <dd>the word length is smaller than the value of 
00135   //     the <i>wordlist_minimum_word_length</i> configuration parameter.
00136   // <dt>WORD_NORMALIZE_CAPITAL
00137   // <dd>the word contained capital letters and has been converted 
00138   //     to lowercase. This bit is only set
00139   //     if the <i>wordlist_lowercase</i> configuration parameter
00140   //     is true.
00141   // <dt>WORD_NORMALIZE_NUMBER
00142   // <dd>the word contains digits and the configuration 
00143   //     parameter <i>wordlist_allow_numbers</i> is set to false.
00144   // <dt>WORD_NORMALIZE_CONTROL
00145   // <dd>the word contains control characters.
00146   // <dt>WORD_NORMALIZE_BAD
00147   // <dd>the word is listed in the file pointed to by 
00148   //     the <i>wordlist_bad_word_list</i> configuration parameter.
00149   // <dt>WORD_NORMALIZE_NULL
00150   // <dd>the word is a zero length string.
00151   // <dt>WORD_NORMALIZE_PUNCTUATION
00152   // <dd>at least one character listed in 
00153   //     the <i>wordlist_valid_punctuation</i> attribute was removed
00154   //     from the word.
00155   // <dt>WORD_NORMALIZE_NOALPHA
00156   // <dd>the word does not contain any alphanumerical character.
00157   // </dl>
00158   // 
00159   int Normalize(String &s) const;
00160 
00161   //
00162   // Error handling
00163   //
00164   //-
00165   // Returns a string explaining the return flags of the Normalize
00166   // method.
00167   //
00168   static String NormalizeStatus(int flags);
00169 
00170 private:
00171 
00172   String                valid_punctuation;      // Same as the wordlist_valid_punctuation attribute.
00173   String                extra_word_characters;  // Same as the wordlist_extra_word_characters attribute.
00174   char                  chrtypes[256];          // WORD_TYPE_* bits per character, indexed by (unsigned char)
00175   int                   minimum_length;         // Minimum word length
00176   int                   maximum_length;         // Maximum word length
00177   int                   allow_numbers;          // True if a word may contain numbers
00178   int                   lowercase;              // True if words are converted to lowercase
00179   int                   truncate;               // True if words that are too long are truncated
00180   Dictionary            badwords;               // List of excluded words
00181 };
00182 
00183 // Bits to set in chrtypes[] (one entry per character code):
00184 #define WORD_TYPE_ALPHA 0x01  // tested by IsChar() and IsStrictChar()
00185 #define WORD_TYPE_DIGIT 0x02  // tested by IsChar() and IsDigit()
00186 #define WORD_TYPE_EXTRA 0x04  // tested by IsChar() and IsStrictChar()
00187 #define WORD_TYPE_VALIDPUNCT    0x08  // tested by IsChar() only
00188 #define WORD_TYPE_CONTROL       0x10  // tested by IsControl()
00189 
00190 // Returns 1 when c carries any of the ALPHA, DIGIT, EXTRA or VALIDPUNCT
00191 // bits in chrtypes[] (so punctuation counts as a word character), else 0.
00192 inline int
00193 WordType::IsChar(int c) const
00194 {
00195   return (chrtypes[static_cast<unsigned char>(c)] & (WORD_TYPE_VALIDPUNCT|WORD_TYPE_EXTRA|WORD_TYPE_DIGIT|WORD_TYPE_ALPHA)) ? 1 : 0;
00196 }
00197 
00198 // Like IsChar, but only the ALPHA and EXTRA bits are tested (no punctuation).
00199 inline int
00200 WordType::IsStrictChar(int c) const
00201 {
00202   return (chrtypes[static_cast<unsigned char>(c)] & (WORD_TYPE_EXTRA|WORD_TYPE_ALPHA)) ? 1 : 0;
00203 }
00204 
00205 // isdigit() replacement: returns 1 when chrtypes[] has the DIGIT bit for c.
00206 inline int
00207 WordType::IsDigit(int c) const
00208 {
00209   return (chrtypes[static_cast<unsigned char>(c)] & WORD_TYPE_DIGIT) ? 1 : 0;
00210 }
00211 
00212 // iscntrl() replacement: returns 1 when chrtypes[] has the CONTROL bit for c.
00213 inline int
00214 WordType::IsControl(int c) const
00215 {
00216   return (chrtypes[static_cast<unsigned char>(c)] & WORD_TYPE_CONTROL) ? 1 : 0;
00217 }
00218 
00219 // Spares callers from fetching and keeping the punctuation setting themselves.
00220 inline int
00221 WordType::StripPunctuation(String &s) const
00222 {
00223   return s.remove(this->valid_punctuation);
00224 }
00225 
00226 
00227 #endif /* _WordType_h */

Generated on Sun Jun 8 10:56:40 2008 for GNUmifluz by  doxygen 1.5.5