From Geoffrey.R.Hutchison@williams.edu Mon Oct 5 17:17:49 1998 Date: Mon, 05 Oct 1998 15:00:08 -0400 From: Geoff Hutchison To: Jerry Preeper Cc: htdig@sdsu.edu Subject: Re: htdig: single quotes in URL At 6:00 PM -0400 10/3/98, Jerry Preeper wrote: >I was wondering if anyone has a patch working satisfactorily yet to allow >htdig to find URL's that are referenced with single quotes, as well as >double quotes. The following patch brings HTML.cc up from the revision in 3.1.0b1 to the current revision (which includes several bug fixes in addition to the single quote problem). If someone wants *just* the single-quote patch, use CVSWeb to grab the diff between revision 1.14 and 1.13 of HTML.cc. The patch seems to work for me, though I haven't given it a complete shakedown. -Geoff Hutchison Williams Students Online http://wso.williams.edu/ =================================================================== RCS file: /opt/htdig/cvs/htdig3/htdig/HTML.cc,v retrieving revision 1.10 retrieving revision 1.14 diff -u -r1.10 -r1.14 --- htdig3/htdig/HTML.cc 1998/09/10 04:16:25 1.10 +++ htdig3/htdig/HTML.cc 1998/09/30 17:31:50 1.14 @@ -4,7 +4,23 @@ // Implementation of HTML // // $Log: HTML.cc,v $ +// Revision 1.14 1998/09/30 17:31:50 ghutchis +// Changes for 3.1.0b2 +// +// Revision 1.13 1998/09/23 14:58:21 ghutchis +// +// Many, many bug fixes +// +// Revision 1.12 1998/09/18 18:45:55 ghutchis +// +// YABF (Yet another bug fix) +// +// Revision 1.11 1998/09/18 02:38:08 ghutchis +// +// Bug fixes for 3.1.0b2 +// // Revision 1.10 1998/09/10 04:16:25 ghutchis +// // More bug fixes. 
// // Revision 1.9 1998/09/08 03:29:09 ghutchis @@ -39,7 +55,7 @@ // // #if RELEASE -static char RCSid[] = "$Id: HTML.cc,v 1.10 1998/09/10 04:16:25 ghutchis Exp $"; +static char RCSid[] = "$Id: HTML.cc,v 1.14 1998/09/30 17:31:50 ghutchis Exp $"; #endif #include "htdig.h" @@ -360,7 +376,7 @@ HTML::do_tag(Retriever &retriever, String &tag) { char *position = tag.get() + 1; // Skip the '<' - char *q; + char *q, *t; int which, length; while (isspace(*position)) @@ -409,12 +425,34 @@ position++; while (isspace(*position)) position++; - if (*position == '"') + // + // Allow either single quotes or double quotes + // around the URL itself + // + if (*position == '"'||*position == '\'') { position++; - q = strchr(position, '"'); + q = strchr(position, position[-1]); if (!q) break; + // + // We seem to have matched the opening quote char + // Mark the end of the quotes as our endpoint, so + // that we can continue parsing after the current + // text + // + *q = '\0'; + // + // If a '?' or '#' is present in a quoted URL, + // treat that as the end of the URL, but we skip + // past the quote to parse the rest of the anchor. + // + // Is there a better way of looking for these? + // + if ((t = strchr(position, '#')) != NULL) + *t = '\0'; + if ((t = strchr(position, '?')) != NULL) + *t = '\0'; } else { @@ -425,8 +463,8 @@ *q != '?' && *q != '#') q++; + *q = '\0'; } - *q = '\0'; delete href; href = new URL(position, *base); in_ref = 1; @@ -447,20 +485,42 @@ position++; while (isspace(*position)) position++; - if (*position == '"') + // + // Allow either single quotes or double quotes + // around the URL itself + // + if (*position == '"'||*position == '\'') { position++; - q = strchr(position, '"'); + q = strchr(position, position[-1]); if (!q) break; + // + // We seem to have matched the opening quote char + // Mark the end of the quotes as our endpoint, so + // that we can continue parsing after the current + // text + // + *q = '\0'; + // + // If a '?' 
or '#' is present in a quoted URL, + // treat that as the end of the URL, but we skip + // past the quote to parse the rest of the anchor. + // + // Is there a better way of looking for these? + // + if ((t = strchr(position, '#')) != NULL) + *t = '\0'; + if ((t = strchr(position, '?')) != NULL) + *t = '\0'; } else { q = position; while (*q && *q != '>' && !isspace(*q)) q++; - } *q = '\0'; + } retriever.got_anchor(position); position = q + 1; break; @@ -537,20 +597,42 @@ position++; while (isspace(*position)) position++; - if (*position == '"') + // + // Allow either single quotes or double quotes + // around the URL itself + // + if (*position == '"'||*position == '\'') { position++; - q = strchr(position, '"'); + q = strchr(position, position[-1]); if (!q) break; + // + // We seem to have matched the opening quote char + // Mark the end of the quotes as our endpoint, so + // that we can continue parsing after the current + // text + // + *q = '\0'; + // + // If a '?' or '#' is present in a quoted URL, + // treat that as the end of the URL, but we skip + // past the quote to parse the rest of the anchor. + // + // Is there a better way of looking for these? 
+ // + if ((t = strchr(position, '#')) != NULL) + *t = '\0'; + if ((t = strchr(position, '?')) != NULL) + *t = '\0'; } else { q = position; while (*q && *q != '>' && !isspace(*q)) q++; - } *q = '\0'; + } retriever.got_image(position); break; } @@ -609,8 +691,9 @@ { if (strlen(w) >= minimumWordLength) retriever.got_word(w, 1, 10); - w = strtok(0, " \t\r\n"); + w = strtok(0, " ,\t\r\n"); } + w = '\0'; } // @@ -629,8 +712,9 @@ { if (strlen(w) >= minimumWordLength) retriever.got_word(w, 1, 10); - w = strtok(0, " \t\r\n"); + w = strtok(0, " ,\t\r\n"); } + w = '\0'; } else if (mystrcasecmp(cache, "htdig-email") == 0) { @@ -655,14 +739,14 @@ { String content_cache = conf["content"]; - if (content_cache.indexOf("noindex") != 0) + if (content_cache.indexOf("noindex") != -1) { doindex = 0; retriever.got_noindex(); } - else if (content_cache.indexOf("nofollow") != 0) + else if (content_cache.indexOf("nofollow") != -1) dofollow = 0; - else if (content_cache.indexOf("none") != 0) + else if (content_cache.indexOf("none") != -1) { doindex = 0; dofollow = 0; @@ -677,7 +761,11 @@ // meta_dsc = conf["content"]; if (meta_dsc.length() > max_meta_description_length) - meta_dsc = meta_dsc.sub(0, max_meta_description_length); + { + String temp = meta_dsc.sub(0, max_meta_description_length); + meta_dsc = temp; + temp = 0; + } if (debug > 1) cout << "META Description: " << conf["content"] << endl; retriever.got_meta_dsc(meta_dsc); @@ -693,6 +781,7 @@ retriever.got_word(w, 1, 11); w = strtok(0, " \t\r\n"); } + w = '\0'; } } else if (conf["name"] && @@ -724,12 +813,34 @@ position++; while (isspace(*position)) position++; - if (*position == '"') + // + // Allow either single quotes or double quotes + // around the URL itself + // + if (*position == '"'||*position == '\'') { position++; - q = strchr(position, '"'); + q = strchr(position, position[-1]); if (!q) break; + // + // We seem to have matched the opening quote char + // Mark the end of the quotes as our endpoint, so + // that we can 
continue parsing after the current + // text + // + *q = '\0'; + // + // If a '?' or '#' is present in a quoted URL, + // treat that as the end of the URL, but we skip + // past the quote to parse the rest of the anchor. + // + // Is there a better way of looking for these? + // + if ((t = strchr(position, '#')) != NULL) + *t = '\0'; + if ((t = strchr(position, '?')) != NULL) + *t = '\0'; } else { @@ -740,8 +851,8 @@ *q != '?' && *q != '#') q++; + *q = '\0'; } - *q = '\0'; delete href; href = new URL(position, *base); if (dofollow) @@ -776,12 +887,34 @@ position++; while (isspace(*position)) position++; - if (*position == '"') + // + // Allow either single quotes or double quotes + // around the URL itself + // + if (*position == '"'||*position == '\'') { position++; - q = strchr(position, '"'); + q = strchr(position, position[-1]); if (!q) break; + // + // We seem to have matched the opening quote char + // Mark the end of the quotes as our endpoint, so + // that we can continue parsing after the current + // text + // + *q = '\0'; + // + // If a '?' or '#' is present in a quoted URL, + // treat that as the end of the URL, but we skip + // past the quote to parse the rest of the anchor. + // + // Is there a better way of looking for these? + // + if ((t = strchr(position, '#')) != NULL) + *t = '\0'; + if ((t = strchr(position, '?')) != NULL) + *t = '\0'; } else { @@ -792,8 +925,8 @@ *q != '?' 
&& *q != '#') q++; + *q = '\0'; } - *q = '\0'; delete href; href = new URL(position, *base); if (dofollow) @@ -827,12 +960,34 @@ position++; while (isspace(*position)) position++; - if (*position == '"') + // + // Allow either single quotes or double quotes + // around the URL itself + // + if (*position == '"'||*position == '\'') { position++; - q = strchr(position, '"'); + q = strchr(position, position[-1]); if (!q) break; + // + // We seem to have matched the opening quote char + // Mark the end of the quotes as our endpoint, so + // that we can continue parsing after the current + // text + // + *q = '\0'; + // + // If a '?' or '#' is present in a quoted URL, + // treat that as the end of the URL, but we skip + // past the quote to parse the rest of the anchor. + // + // Is there a better way of looking for these? + // + if ((t = strchr(position, '#')) != NULL) + *t = '\0'; + if ((t = strchr(position, '?')) != NULL) + *t = '\0'; } else { @@ -843,8 +998,8 @@ *q != '?' && *q != '#') q++; - } *q = '\0'; + } URL tempBase(position, *base); *base = tempBase; } ---------------------------------------------------------------------- To unsubscribe from the htdig mailing list, send a message to htdig-request@sdsu.edu containing the single word "unsubscribe" in the body of the message.