Date: Fri, 15 Jun 2001 16:17:53 -0500 (CDT) From: Gilles Detillieux To: "ht://Dig mailing list" Subject: [htdig] PATCH: restrict and exclude config attributes for htsearch 3.1.5 This patch adds restrict and exclude config attributes to htsearch, which can be overridden by the restrict and exclude input parameters in the search form. This can be used to avoid the requirement of listing restrictions or exclusions in the search form, allowing you to put them in the config file instead. Note that if you apply both this patch and the build_select_lists.0 patch from the ftp.ccsf.org archive, you should remove the following three lines from htsearch/Display.cc, in Display::setVariables() around lines 512-514: if (strcmp(builds[b+1], "restrict") == 0 || strcmp(builds[b+1], "exclude") == 0) sepc = '|'; With this patch, the restrict and exclude input parameters are no longer modified in place, so the hook above will no longer work. This patch was made with the help of Gabriele Bartolini. He committed it to CVS, and I made a patch from it to support 3.1.5 until the next stable release is out. Apply in your main htdig-3.1.5 source directory using the command "patch -p0 < this-message". --- htcommon/defaults.cc.orig Tue Feb 15 16:05:44 2000 +++ htcommon/defaults.cc Thu May 31 07:44:43 2001 @@ -57,6 +57,7 @@ ConfigDefaults defaults[] = {"endings_word2root_db", "${common_dir}/word2root.db"}, {"excerpt_length", "300"}, {"excerpt_show_top", "false"}, + {"exclude", ""}, {"exclude_urls", "/cgi-bin/ .cgi"}, {"external_parsers", ""}, {"extra_word_characters", ""}, @@ -120,6 +121,7 @@ ConfigDefaults defaults[] = {"prev_page_text", "[prev]"}, {"remove_bad_urls", "true"}, {"remove_default_doc", "index.html"}, + {"restrict", ""}, {"robotstxt_name", "htdig"}, {"script_name", ""}, {"search_algorithm", "exact:1"}, --- htdoc/attrs.html.orig Fri Feb 25 10:18:47 2000 +++ htdoc/attrs.html Thu May 31 07:44:44 2001 @@ -1741,6 +1741,51 @@
+ exclude +
+
+
+
+ type: +
+
+ string list +
+
+ used by: +
+
+ htsearch +
+
+ default: +
+
+ <empty> +
+
+ description: +
+
+ If a URL contains any of the space separated patterns, + it will be discarded in the searching phase. This is + used to exclude certain URLs from search results. + The list can be specified from within the configuration + file, and can be overridden with the "exclude" input + parameter in the search form. +
+
+ example: +
+
+ exclude: cgi-bin +
+
+
+
+
+
+
exclude_urls
@@ -5037,6 +5082,57 @@
+ restrict +
+
+
+
+ type: +
+
+ string list +
+
+ used by: +
+
+ htsearch +
+
+ default: +
+
+ <empty> +
+
+ description: +
+
+ This specifies a set of patterns that all URLs have to + match against in order for them to be included in the + search results. Any number of strings can be specified, + separated by spaces. If multiple patterns are given, at + least one of the patterns has to match the URL. + The list can be specified from within the configuration + file, and can be overridden with the "restrict" input + parameter in the search form. Note that the restrict + list does not take precedence over the + exclude list - if a URL matches + patterns in both lists it is still excluded from the + search results. +
+
+ example: +
+
+ restrict: http://www.vh1.com/ +
+
+
+
+
+
+
robotstxt_name
--- htdoc/cf_byname.html.orig Tue Feb 15 15:59:53 2000 +++ htdoc/cf_byname.html Mon Jun 4 08:33:39 2001 @@ -56,6 +56,7 @@ * endings_word2root_db
* excerpt_length
* excerpt_show_top
+ * exclude
* exclude_urls
* external_parsers
* extra_word_characters
@@ -138,6 +139,7 @@ R
* remove_bad_urls
* remove_default_doc
+ * restrict
* robotstxt_name

S
--- htdoc/cf_byprog.html.orig Tue Feb 15 16:00:19 2000 +++ htdoc/cf_byprog.html Thu May 31 07:44:44 2001 @@ -132,6 +132,7 @@ * endings_word2root_db
* excerpt_length
* excerpt_show_top
+ * exclude
* extra_word_characters
* iso_8601
* logging
@@ -158,6 +159,7 @@ * page_number_text
* prefix_match_character
* prev_page_text
+ * restrict
* script_name
* search_algorithm
* search_results_footer
--- htdoc/hts_form.html.orig Thu Feb 17 16:02:22 2000 +++ htdoc/hts_form.html Thu May 31 07:44:44 2001 @@ -49,9 +49,13 @@ exclude
- This value is a pattern that all URLs of the search results - cannot match.
- The default is blank. + This value is a pattern that specifies which URLs are to be + excluded from the search results. If a URL matches one of + these patterns it is discarded. Multiple patterns can be + given, separated by a bar ("|"), or multiple definitions + of the exclude input parameter can be given.
+ The default is specified by the exclude + attribute in the configuration file.
format @@ -118,11 +122,18 @@ restrict
- This value is a pattern that all URLs of the search results - will have to match. This can be used to restrict the search - to a particular subtree or subsection of a bigger - database.
- The default is blank. + This value is a pattern that all URLs of the search results + will have to match. This can be used to restrict the search + to a particular subtree or subsection of a bigger database. + Multiple patterns can be given, separated by a bar ("|"), or + multiple definitions of the restrict input parameter can be + given. Any URL in the search results will have to match at + least one of these patterns.
+ Note that the restrict list does not take precedence over the + exclude list - if a URL matches patterns in both lists it is + still excluded from the search results.
+ The default is specified by the restrict + attribute in the configuration file.
sort --- htsearch/htsearch.cc.orig Tue Feb 15 16:17:13 2000 +++ htsearch/htsearch.cc Thu May 31 07:44:44 2001 @@ -104,24 +104,6 @@ main(int ac, char **av) cgi input(optind < ac ? av[optind] : none); // - // Compile the URL limit pattern. - // - if (input.exists("restrict")) - { - char *sep = input["restrict"]; - while ((sep = strchr(sep, '\001')) != NULL) - *sep++ = '|'; - limit_to.Pattern(input["restrict"]); - } - if (input.exists("exclude")) - { - char *sep = input["exclude"]; - while ((sep = strchr(sep, '\001')) != NULL) - *sep++ = '|'; - exclude_these.Pattern(input["exclude"]); - } - - // // Setup the configuration database. First we read the compiled defaults. // Then we override those with defaults read in from the configuration // file, and finally we override some attributes with information we @@ -189,6 +171,35 @@ main(int ac, char **av) config.Add(form_vars[i], input[form_vars[i]]); } + // + // Compile the URL limit pattern. + // + + StringList urllist; + String urlpat; + + if (strlen(config["restrict"])) + { + // Create a temporary list from either the configuration + // file or the input parameter + urllist.Create(config["restrict"], "| \t\r\n\001"); + urlpat = urllist.Join('|'); + urllist.Release(); // release the temporary list of URLs + config.Add("restrict", urlpat); // re-create the config attribute + limit_to.Pattern(urlpat); // Set the new limit pattern + } + + if (strlen(config["exclude"])) + { + // Create a temporary list from either the configuration + // file or the input parameter + urllist.Create(config["exclude"], "| \t\r\n\001"); + urlpat = urllist.Join('|'); + urllist.Release(); // release the temporary list of URLs + config.Add("exclude", urlpat); // re-create the config attribute + exclude_these.Pattern(urlpat); + } + // Ctype-like functions for what constitutes a word. HtWordType::Initialize(config); -- Gilles R. Detillieux E-mail: Spinal Cord Research Centre WWW: http://www.scrc.umanitoba.ca/~grdetil Dept. Physiology, U. 
of Manitoba Phone: (204)789-3766 Winnipeg, MB R3E 3J7 (Canada) Fax: (204)789-3930 _______________________________________________ htdig-general mailing list To unsubscribe, send a message to the list's administrative address (omitted from this archived copy) with a subject of "unsubscribe". FAQ: http://htdig.sourceforge.net/FAQ.html