GNUmifluz: mifluz/WordType.cc Source File

00001 //
00002 // WordType.cc
00003 //
00004 // WordType:  Wrap some attributes to make is...() type
00005 //            functions and other common functions without having to manage
00006 //            the attributes or the exact attribute combination semantics.
00007 //            Configuration parameter used:
00008 //            valid_punctuation,extra_word_characters,minimum_word_length,
00009 //            maximum_word_length,allow_numbers,bad_word_list
00010 //
00011 // Part of the ht://Dig package   <http://www.htdig.org/>
00012 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group
00013 // For copyright details, see the file COPYING in your distribution
00014 // or the GNU General Public License version 2 or later 
00015 // <http://www.gnu.org/copyleft/gpl.html>
00016 //
00017 // $Id: WordType_8cc-source.html,v 1.1 2008/06/08 10:13:25 sebdiaz Exp $
00018 //
00019 
00020 #ifdef HAVE_CONFIG_H
00021 #include "config.h"
00022 #endif /* HAVE_CONFIG_H */
00023 
00024 #include <ctype.h>
00025 #include <stdio.h>
00026 #include <locale.h>
00027 
00028 #include "WordType.h"
00029 
00030 WordType::WordType(const Configuration &config)
00031 {
00032   const String valid_punct = config["wordlist_valid_punctuation"];
00033   const String extra_word_chars = config["wordlist_extra_word_characters"];
00034 
00035   String locale = config.Find("wordlist_locale");
00036   if(locale.empty())
00037     locale = "C";
00038   if(setlocale(LC_ALL, (char*)locale) == 0) {
00039     fprintf(stderr, "WordType::WordType: cannot set locale: ");
00040     perror("");
00041   }
00042 
00043   minimum_length = config.Value("wordlist_minimum_word_length", 3);
00044   maximum_length = config.Value("wordlist_maximum_word_length", 25);
00045   allow_numbers = config.Boolean("wordlist_allow_numbers", 0);
00046   lowercase = config.Boolean("wordlist_lowercase", 1);
00047   truncate = config.Boolean("wordlist_truncate", 1);
00048 
00049   extra_word_characters = extra_word_chars;
00050   valid_punctuation = valid_punct;
00051 
00052   chrtypes[0] = 0;
00053   for (int i = 1; i < 256; i++)
00054   {
00055     chrtypes[i] = 0;
00056     if (isalpha(i))
00057         chrtypes[i] |= WORD_TYPE_ALPHA;
00058     if (isdigit(i))
00059         chrtypes[i] |= WORD_TYPE_DIGIT;
00060     if (iscntrl(i))
00061         chrtypes[i] |= WORD_TYPE_CONTROL;
00062     if (strchr(extra_word_chars, i))
00063         chrtypes[i] |= WORD_TYPE_EXTRA;
00064     if (strchr(valid_punct, i))
00065         chrtypes[i] |= WORD_TYPE_VALIDPUNCT;
00066   }
00067 
00068   {
00069     const String filename = config["wordlist_bad_word_list"];
00070     FILE        *fl = fopen(filename, "r");
00071     char        buffer[1000];
00072     char        *word;
00073     String      new_word;
00074 
00075     // Read in the badwords file (it's just a text file)
00076     while (fl && fgets(buffer, sizeof(buffer), fl))
00077       {
00078         word = strtok(buffer, "\r\n \t");
00079         if (word && *word)
00080           {
00081             int flags;
00082             new_word = word;
00083             if((flags = Normalize(new_word)) & WORD_NORMALIZE_NOTOK) {
00084               fprintf(stderr, "WordType::WordType: reading bad words from %s found %s, ignored because %s\n", (const char*)filename, word, (char*)NormalizeStatus(flags & WORD_NORMALIZE_NOTOK));
00085             } else {
00086               badwords.Add(new_word, 0);
00087             }
00088           }
00089     }
00090 
00091     if (fl)
00092         fclose(fl);
00093   }
00094 }
00095 
00096 int
00097 WordType::Normalize(String& word) const
00098 {
00099   int status = 0;
00100 
00101   //
00102   // Reject empty strings, always
00103   //
00104   if(word.empty())
00105     return status | WORD_NORMALIZE_NULL | WORD_NORMALIZE_NOTOK;
00106 
00107   //
00108   // Always convert to lowercase
00109   //
00110   if(lowercase && word.lowercase())
00111     status |= WORD_NORMALIZE_CAPITAL;
00112 
00113   //
00114   // Remove punctuation characters according to configuration
00115   //
00116   if(StripPunctuation(word))
00117     status |= WORD_NORMALIZE_PUNCTUATION;
00118 
00119   //
00120   // Truncate words too long 
00121   //
00122   if(word.length() > maximum_length) {
00123     status |= WORD_NORMALIZE_TOOLONG;
00124     if(truncate)
00125       word.chop(word.length() - maximum_length);
00126     else
00127       return status | WORD_NORMALIZE_NOTOK;
00128   }
00129 
00130   //
00131   // Reject words too short according to configuration
00132   //
00133   if(word.length() < minimum_length)
00134     return status | WORD_NORMALIZE_TOOSHORT | WORD_NORMALIZE_NOTOK;
00135 
00136   //
00137   // Reject if contains control characters
00138   //
00139   int alpha = 0;
00140   for(const unsigned char *p = (const unsigned char*)(const char*)(char *)word; *p; p++) {
00141     if(IsStrictChar(*p) || (allow_numbers && IsDigit(*p))) {
00142       alpha = 1;
00143     } else if(IsControl(*p)) {
00144       return status | WORD_NORMALIZE_CONTROL | WORD_NORMALIZE_NOTOK;
00145     } else if(IsDigit(*p)) {
00146       return status | WORD_NORMALIZE_NUMBER | WORD_NORMALIZE_NOTOK;
00147     }
00148   }
00149 
00150   //
00151   // Reject if contains no alpha characters (according to configuration)
00152   //
00153   if(!alpha) return status | WORD_NORMALIZE_NOALPHA | WORD_NORMALIZE_NOTOK;
00154 
00155   //
00156   // Reject if listed in config[bad_word_list]
00157   //
00158   if(badwords.Exists(word))
00159     return status | WORD_NORMALIZE_BAD | WORD_NORMALIZE_NOTOK;
00160 
00161   //
00162   // Accept and report the transformations that occured
00163   //
00164   return status | WORD_NORMALIZE_OK;
00165 }
00166 
00167 //
00168 // Convert the integer status into a readable string
00169 //
00170 String
00171 WordType::NormalizeStatus(int flags)
00172 {
00173   String tmp;
00174 
00175   if(flags & WORD_NORMALIZE_TOOLONG) tmp << "TOOLONG ";
00176   if(flags & WORD_NORMALIZE_TOOSHORT) tmp << "TOOSHORT ";
00177   if(flags & WORD_NORMALIZE_CAPITAL) tmp << "CAPITAL ";
00178   if(flags & WORD_NORMALIZE_NUMBER) tmp << "NUMBER ";
00179   if(flags & WORD_NORMALIZE_CONTROL) tmp << "CONTROL ";
00180   if(flags & WORD_NORMALIZE_BAD) tmp << "BAD ";
00181   if(flags & WORD_NORMALIZE_NULL) tmp << "NULL ";
00182   if(flags & WORD_NORMALIZE_PUNCTUATION) tmp << "PUNCTUATION ";
00183   if(flags & WORD_NORMALIZE_NOALPHA) tmp << "NOALPHA ";
00184 
00185   if(tmp.empty()) tmp << "GOOD";
00186 
00187   return tmp;
00188 }