From grdetil@scrc.umanitoba.ca Tue Aug 24 12:41:15 1999 Date: Tue, 24 Aug 1999 12:51:21 -0500 (CDT) From: Gilles Detillieux To: htdig@htdig.org Cc: htdig@htdig.org Subject: Re: [htdig] patch for htsearch excerpt highlighting According to Patrick: > I've also noticed that if you searched for "finding my email", and > "my" is in the "badwords" file, it will also be highlighted in the > excerpt. May I ask if it is possible that you also append this > into the patch? > > Currently, htdig/htsearch truly ignore words in the "badwords" file, > but they still come up in the excerpt highlighted. Thanks for the tip. Here's a revised patch that will make htsearch ignore words in excerpts that are in the bad_word_list file, or that are shorter than minimum_word_length, in addition to skipping over punctuation: --- htdig-3.1.2.bak/htlib/StringMatch.h Wed Apr 21 21:47:58 1999 +++ htdig-3.1.2/htlib/StringMatch.h Mon Aug 23 15:38:31 1999 @@ -98,6 +98,12 @@ public: void IgnoreCase(); // + // Build a local translation table which ignores all given punctuation + // characters + // + void IgnorePunct(char *punct = NULL); + + // // Determine if there is a pattern associated with this Match object. // int hasPattern() {return table[0] != 0;} --- htdig-3.1.2.bak/htlib/StringMatch.cc Wed Apr 21 21:47:58 1999 +++ htdig-3.1.2/htlib/StringMatch.cc Mon Aug 23 16:40:14 1999 @@ -90,6 +90,8 @@ StringMatch::Pattern(char *pattern, char table[i] = new int[n]; memset((unsigned char *) table[i], 0, n * sizeof(int)); } + for (i = 0; i < n; i++) + table[0][i] = i; // "no-op" states for null char, to be ignored // // Set up a standard case translation table if needed. 
@@ -127,6 +129,11 @@ StringMatch::Pattern(char *pattern, char #endif chr = trans[(unsigned char)*pattern]; + if (chr == 0) + { + pattern++; + continue; + } if (chr == sep) { // @@ -504,12 +511,39 @@ void StringMatch::TranslationTable(char // void StringMatch::IgnoreCase() { - if (local_alloc) - delete [] trans; - trans = new unsigned char[256]; + if (!local_alloc || !trans) + { + trans = new unsigned char[256]; + for (int i = 0; i < 256; i++) + trans[i] = (unsigned char)i; + local_alloc = 1; + } for (int i = 0; i < 256; i++) - trans[i] = tolower((unsigned char)i); - local_alloc = 1; + if (isupper((unsigned char)i)) + trans[i] = tolower((unsigned char)i); +} + + +//***************************************************************************** +// void StringMatch::IgnorePunct(char *punct) +// Set up the character translation table to ignore punctuation +// +void StringMatch::IgnorePunct(char *punct) +{ + if (!local_alloc || !trans) + { + trans = new unsigned char[256]; + for (int i = 0; i < 256; i++) + trans[i] = (unsigned char)i; + local_alloc = 1; + } + if (punct) + for (int i = 0; punct[i]; i++) + trans[(unsigned char)punct[i]] = 0; + else + for (int i = 0; i < 256; i++) + if (HtIsWordChar(i) && !HtIsStrictWordChar(i)) + trans[i] = 0; } --- htdig-3.1.2.bak/htsearch/htsearch.cc Wed Aug 18 16:40:30 1999 +++ htdig-3.1.2/htsearch/htsearch.cc Tue Aug 24 12:34:23 1999 @@ -222,9 +222,11 @@ main(int ac, char **av) // origPattern += logicalPattern; searchWordsPattern.IgnoreCase(); - searchWordsPattern.Pattern(origPattern); - if (debug > 2) - cout << "Excerpt pattern: " << origPattern << "\n"; + searchWordsPattern.IgnorePunct(); + searchWordsPattern.Pattern(logicalPattern); // this should now be enough + //searchWordsPattern.Pattern(origPattern); + //if (debug > 2) + // cout << "Excerpt pattern: " << origPattern << "\n"; // // If required keywords were given in the search form, we will @@ -314,7 +316,8 @@ createLogicalWords(List &searchWords, St } else wasHidden = 1; - if 
(ww->weight > 0) // Ignore boolean syntax stuff + if (ww->weight > 0 // Ignore boolean syntax stuff + && !ww->isIgnore) // Ignore short or bad words { if (pattern.length()) pattern << '|'; -- Gilles R. Detillieux E-mail: <grdetil@scrc.umanitoba.ca> Spinal Cord Research Centre WWW: http://www.scrc.umanitoba.ca/~grdetil Dept. Physiology, U. of Manitoba Phone: (204)789-3766 Winnipeg, MB R3E 3J7 (Canada) Fax: (204)789-3930 ------------------------------------ To unsubscribe from the htdig mailing list, send a message to htdig@htdig.org containing the single word "unsubscribe" in the SUBJECT of the message.