diff -rcN tmp/htdig-3.1.5/README.plp.txt htdig-3.1.5/README.plp.txt *** tmp/htdig-3.1.5/README.plp.txt Thu Jan 1 02:00:00 1970 --- htdig-3.1.5/README.plp.txt Sun May 10 20:37:42 1998 *************** *** 0 **** --- 1,121 ---- + + About the patch to allow htdig to index soft-linked directories without + indexing the parent directories. + + Applies to: htdig stable 3.1.5 + Related files: the.patch htdig-3.1.5+prune_parent_dir_href-0.1.patch + Related URLs: http://www.htdig.org htdig mailing list archive + + 1. Description: + + 1.1 The problem: + + When htdig indexes a page under a DocumentRoot noted as http://a/b/c and + the page contains an href pointing to http://a/d/e where the latter page + is an Apache index, htdig will try to index http://a/d and all its + descendants if http://a/d is also an Apache index. + + 1.2 The solution: + + To avoid this, a mechanism is implemented in htdig that prevents it from + reaping and indexing any URLs that are the direct parents of the currently + indexed document. For example: + + If the document http://here/a/b/c is being indexed, then the following + URLs, if they are reaped from it, will not be added to the list of + URLs to be indexed: + + http:// + http://here + http://here/a + http://here/a/b + + In particular the last one would appear as a 'previous directory' entry in + an Apache-generated directory index for http://here/a/b/c. + + 2. Patch + + 2.1 Patch description: + + The patch modifies the htdig/Retriever class to add the required + functionality, and adds a new configuration option that turns the new + feature ON or OFF. 
+ + The feature is turned OFF by default, and it needs to be turned ON by an + entry in the config file used with htdig using a line like: + + prune_parent_dir_href: true + + 2.2 Patch application: + + copy the patch source to the htdig-3.1.5 directory and then apply the patch + using the command: + + patch -p1 #include "HtWordType.h" static WordList words; static int noSignal; - //***************************************************************************** // Retriever::Retriever() // --- 20,31 ---- #include #include "HtWordType.h" + // plp + #include + static WordList words; static int noSignal; //***************************************************************************** // Retriever::Retriever() // *************** *** 34,39 **** --- 36,44 ---- currenthopcount = 0; max_hop_count = config.Value("max_hop_count", 999999); + // plp + gus.hop_count = 0; + // // Initialize the weight factors for words in the different // HTML headers *************** *** 276,295 **** // There may be no more documents, or the server // has passed the server_max_docs limit ! // ! // We have a URL to index, now. We need to register the ! // fact that we are not done yet by setting the 'more' ! // variable. ! // ! more = 1; ! ! // ! // Deal with the actual URL. ! // We'll check with the server to see if we need to sleep() ! // before parsing it. ! // ! server->delay(); // This will pause if needed and reset the time ! parse_url(*ref); delete ref; } } --- 281,306 ---- // There may be no more documents, or the server // has passed the server_max_docs limit ! // plp: store and preprocess new url for parent dir stripping ! if (config.Boolean("prune_parent_dir_href", 0)) ! store_url(ref->URL()); ! else ! gus.hop_count = 0; // avoid chk config w every href ! ! // ! // We have a URL to index, now. We need to register the ! // fact that we are not done yet by setting the 'more' ! // variable. ! // ! more = 1; ! ! // ! // Deal with the actual URL. ! 
// We'll check with the server to see if we need to sleep() ! // before parsing it. ! // ! server->delay(); // This will pause if needed and reset the time ! parse_url(*ref); delete ref; } } *************** *** 1164,1169 **** --- 1175,1188 ---- url.normalize(); + // plp: check whether it is a substring of the base URL + if((gus.hop_count > 0) && (url_is_parent_dir(url.get()) != 0)) { + // cout << "got_href: pruning (is substr of base url) " << url.get() << "\n"; // debug + if(debug > 0) + cout << "!"; // bang ! in the progress indicator characters + return; + } + // If it is a backlink from the current document, // just update that field. Writing to the database // is meaningless, as it will be overwritten. *************** *** 1521,1523 **** --- 1540,1607 ---- } } + // plp + // private helper: copies c_url into cus.url_store, cuts the copy in place at every '/' with strtok and records the pieces in cus.url_store_chopped; cus.hop_count ends up as the number of pieces, or 0 on any failure + void + Retriever::chop_url(ChoppedUrlStore &cus,char *c_url) + { + int l; + + cus.url_store[0] = '\0'; + cus.hop_count = 0; + l = strlen(c_url); + if((l == 0) || (l >= MAX_CAN_URL_LEN)) { + if(debug > 0) + cout << "chop_url: failed on len==0\n"; // also printed when the url is too long for url_store + return; + } + strcpy(cus.url_store,c_url); + l = 0; + if((cus.url_store_chopped[l++] = strtok(cus.url_store,"/")) == NULL) { + cus.url_store[0] = '\0'; + if(debug > 0) + cout << "chop_url: failed on NULL with " << c_url << "\n"; + return; + } + while((cus.url_store_chopped[l++] = strtok(NULL,"/")) != NULL) { + if(l >= MAX_CAN_URL_HOPS) { + cus.url_store[0] = '\0'; + return; // too many pieces: fail silently, hop_count stays 0 so url_is_parent_dir() will not prune on this url + } + } + cus.hop_count = l - 1; // l was also incremented for the slot holding the terminating NULL + return; // success + } + + // call this function to store the base URL of a document being indexed, + // when starting to index it (this patch calls it right before parse_url(*ref), when prune_parent_dir_href is set) + void + Retriever::store_url(char *c_url) + { + chop_url(gus,c_url); + return; + } + + // call this function to decide if a reaped URL is a direct parent of + // the URL being indexed (returns 1 if so, and also for the base URL itself, else 0). 
call in Retriever::got_href() + int + Retriever::url_is_parent_dir(char *c_url) + { + int j,k; + ChoppedUrlStore cus; + + if(gus.hop_count == 0) + return 0; + chop_url(cus,c_url); + if(cus.hop_count == 0) + return 0; + // compare the pieces pairwise from the front (a full match means cus is a leading part of gus) + j = k = 0; + while(strcmp(gus.url_store_chopped[j++],cus.url_store_chopped[k++]) == 0) { + if(k == cus.hop_count) + return 1; // every piece of cus matched: parent dir (or the same url) + if(j == gus.hop_count) + break; // base url exhausted first: cus is longer, so not a parent + } + return 0; // a piece differed, or cus is longer than gus + } diff -rcN tmp/htdig-3.1.5/htdig/Retriever.h htdig-3.1.5/htdig/Retriever.h *** tmp/htdig-3.1.5/htdig/Retriever.h Fri Feb 25 04:29:10 2000 --- htdig-3.1.5/htdig/Retriever.h Sun May 10 20:06:31 1998 *************** *** 24,29 **** --- 24,35 ---- Retriever_Restart }; + // plp 000503 - for prune_parent_dir_href feature + // size of the url copy buffer, terminating NUL included; urls of this length or more fail silently (message only if debug > 0) + #define MAX_CAN_URL_LEN 256 + // size of the piece table: the '/'-separated pieces plus the terminating NULL must fit, fail silently if exceeded + #define MAX_CAN_URL_HOPS 32 + class Retriever { public: *************** *** 64,79 **** // Allow for the indexing of protected sites by using a // username/password // ! void setUsernamePassword(char *credentials); // // Routines for dealing with local filesystem access // StringList * GetLocal(char *url); StringList * GetLocalUser(char *url, StringList *defaultdocs); ! int IsLocalURL(char *url); ! private: // // A hash to keep track of what we've seen // --- 70,102 ---- // Allow for the indexing of protected sites by using a // username/password // ! void setUsernamePassword(char *credentials); // // Routines for dealing with local filesystem access // StringList * GetLocal(char *url); StringList * GetLocalUser(char *url, StringList *defaultdocs); ! int IsLocalURL(char *url); ! ! // plp 000503 - for prune_parent_dir_href feature ! void store_url(char *c_url); ! int url_is_parent_dir(char *c_url); ! 
private: + + // plp 000503 - for prune_parent_dir_href feature + typedef struct { + char url_store[MAX_CAN_URL_LEN]; // private copy of the url, cut in place by chop_url + char *url_store_chopped[MAX_CAN_URL_HOPS]; // pointers into url_store, one per '/'-separated piece + int hop_count; // number of pieces held in url_store_chopped, or zero if chopping failed + } ChoppedUrlStore; + + ChoppedUrlStore gus; // chopped url of the document being indexed (a Retriever member, filled by store_url) + + void chop_url(ChoppedUrlStore &cus,char *c_url); + // /plp + // // A hash to keep track of what we've seen //