From webmaster@javawoman.com Mon Jan 11 17:34:09 1999 Date: Mon, 11 Jan 1999 19:06:50 +0100 From: Marjolein Katsma To: htdig@sdsu.edu Subject: htdig: (Not) translating entities Hi all. I'm setting up ht://Dig for two sites; one of them contains a lot of pages with code examples of a tag-based language. Looking at the excerpts produced by ht://Dig, I found that the code samples were hard to read: opening '<' characters had disappeared. Some digging revealed that the entities for both '<' and '>' are translated, and then '<' is converted to a space... not what I needed. For pages with code samples of such languages (HTML and other tag-based languages) the automatic translation of such entities actually gets in the way - so I made it configurable. Also useful for pages/sites with mathematical formulae which should be recognizable in the excerpts. The new configuration parameters are: translate_amp translate_lt_gt translate_quot The default for all three is 'true' so the program behaves as before; set to 'false' to change behavior. Both the character entities and the numerical entities for these characters are filtered out if the corresponding parameter is set to 'false'. From my configuration file: -------------------- # # This determines whether or not entities for '<' and '>' will be translated. # The default is true; setting it to false is useful for sites with code examples # of tag-based languages, or with mathematical formulae. This will ensure that # any excerpts shown in results pages will be recognizable. # translate_lt_gt: false # # This determines whether or not entities for '&' will be translated. # The default is true; setting it to false will ensure that any excerpts shown in # results pages will not trip up any browsers. # translate_amp: false # # This determines whether or not entities for '"' (a quote) will be translated. 
# The default is true; since quotes are quite legal in normal text, you should # generally leave the default but not translating it may occasionally be useful. # translate_quot: true -------------------- Two files were adapted: defaults.cc and SGMLEntities.cc (note that defaults.cc contains a few other changes used elsewhere, commented in the header; I'll report these later). Patches in comparison with release 3.1.0b4: diff -c3p defaults.cc defaultsMK.cc *** defaults.cc Tue Dec 22 18:53:12 1998 --- defaultsMK.cc Mon Jan 11 10:41:35 1999 *************** *** 3,8 **** --- 3,22 ---- // // default values for the ht programs // + // Revision 1999-01-11 mkatsma + // Added options translate_amp, translate_lt_gt and translate_quot to enable + // configuration of whether or not entities for '&', '<', '>' and '"' will + // be translated. The default is true, leaving the normal operation of htdig + // unchanged. + // + // Revision 1999-01-10 mkatsma + // Implemented configurable 'no title' text (found on mail list archive) + // + // Revision 1999-01-06 mkatsma + // Added options noindex_start and noindex_end to enable NOT indexing + // some sections of code; useful to exclude such things as local page menus + // and server-generated code that changes faster than an indexing cycle. + // // $Log: defaults.cc,v $ // Revision 1.24 1998/12/11 02:49:54 ghutchis // Added option for server_max_docs as a limit on the number of docs returned *************** ConfigDefaults defaults[] = *** 168,173 **** --- 182,190 ---- {"no_excerpt_show_top", "false"}, {"no_next_page_text", "[next]"}, {"no_prev_page_text", "[prev]"}, + {"no_title_text", "[No title]"}, //mk19990110 + {"noindex_start", ""}, //mk19990106 + {"noindex_end", ""}, //mk19990106 {"nothing_found_file", "${common_dir}/nomatch.html"}, {"page_list_header", "
Pages:
"}, {"prefix_match_character", "*"}, *************** ConfigDefaults defaults[] = *** 195,200 **** --- 212,220 ---- {"text_factor", "1"}, {"timeout", "30"}, {"title_factor", "100"}, + {"translate_amp", "true"}, //mk19990111 + {"translate_lt_gt", "true"}, //mk19990111 + {"translate_quot", "true"}, //mk19990111 {"url_list", "${database_base}.urls"}, {"use_star_image", "true"}, {"use_meta_description", "false"}, diff -c3p SGMLEntities.cc SGMLEntitiesMK.cc *** SGMLEntities.cc Tue Dec 22 18:53:12 1998 --- SGMLEntitiesMK.cc Mon Jan 11 10:55:25 1999 *************** *** 3,8 **** --- 3,12 ---- // // Implementation of SGMLEntities // + // Revision 1999-01-11 mkatsma + // Implemented options translate_amp, translate_lt_gt and translate_quot to + // prevent translation of entities for '&', '<', '>' and '"' if so configured. + // // $Log: SGMLEntities.cc,v $ // Revision 1.5 1998/08/03 16:50:35 ghutchis // *************** static char RCSid[] = "$Id: SGMLEntities *** 30,35 **** --- 34,40 ---- #include #include #include + #include //mk19990111 static SGMLEntities junk; *************** static struct *** 39,44 **** --- 44,54 ---- unsigned char equiv; } entities[] = { + // + // (MK19990111) Note: some entities for languages not written in iso_8859_1 + // are missing here; they would therefore not be translated or converted + // into a space, making searching in those languages difficult... + // { "lt", '<' } , { "gt", '>' } , { "amp", '&' } , *************** SGMLEntities::translateAndUpdate(unsigne *** 210,216 **** entity.length() < 10) { entity << *entityStart++; ! } if (entity.length() >= 10) { // --- 220,270 ---- entity.length() < 10) { entity << *entityStart++; ! } ! ! if ( !config.Boolean("translate_quot") ) //mk19990111 ! { //mk19990111 ! // //mk19990111 ! // Do NOT translate entities for '"' (quote). //mk19990111 ! // //mk19990111 ! if (entity.compare(new String("quot")) == 0 || //mk19990111 ! entity.compare(new String("#34")) == 0 ) //mk19990111 ! { //mk19990111 ! 
entityStart = orig + 1; //mk19990111 ! return '&'; //mk19990111 ! } //mk19990111 ! } //mk19990111 ! ! if ( !config.Boolean("translate_amp") ) //mk19990111 ! { //mk19990111 ! // //mk19990111 ! // Do NOT translate entities for '&' since they can //mk19990111 ! // occur in code samples that might end up in an excerpt. //mk19990111 ! // //mk19990111 ! if (entity.compare(new String("amp")) == 0 || //mk19990111 ! entity.compare(new String("#38")) == 0 ) //mk19990111 ! { //mk19990111 ! entityStart = orig + 1; //mk19990111 ! return '&'; //mk19990111 ! } //mk19990111 ! } //mk19990111 ! ! if ( !config.Boolean("translate_lt_gt") ) //mk19990111 ! { //mk19990111 ! // //mk19990111 ! // Do NOT translate entities for '<' and '>' since they can //mk19990111 ! // occur in code samples that might end up in an excerpt. //mk19990111 ! // //mk19990111 ! if (entity.compare(new String("lt")) == 0 || //mk19990111 ! entity.compare(new String("#60")) == 0 || //mk19990111 ! entity.compare(new String("gt")) == 0 || //mk19990111 ! entity.compare(new String("#62")) == 0 ) //mk19990111 ! { //mk19990111 ! entityStart = orig + 1; //mk19990111 ! return '&'; //mk19990111 ! } //mk19990111 ! } //mk19990111 ! if (entity.length() >= 10) { // To see this in operation, try the following (still experimental!) URL: http://hshelp.com/search.html enter 'editorlayout' as a search term and set to 'detailed' output. You can see how it shows up in some code samples. Trying 'CAT' as a search term will show that an actual tag *name* still shows up properly as well. Cheers, Marjolein Katsma webmaster@javawoman.com Java Woman - http://javawoman.com/ ---------------------------------------------------------------------- To unsubscribe from the htdig mailing list, send a message to htdig-request@sdsu.edu containing the single word "unsubscribe" in the body of the message.