: patch to allow addition of URLs for htdig-3.1.5 : conversion: 2000/07/21 uitm@blackflag.ru (original message follows) : : from http://www.htdig.org/mail/1998/04/0046.html : : patch to allow addition of URLs : ------------------------------------------------------------------------ : Edmond Abrahamian (edmond@greencedars.com.lb) : Wed, 15 Apr 1998 21:53:15 +0200 (EET) : * Messages sorted by: [ date ] [ thread ] [ subject ] [ author ] : * Next message: Olivier PRENANT: "htdig: htdig & apache 1.3xx" : * Previous message: Pierre Garriga: "htdig-3.0.8b2" : ------------------------------------------------------------------------ : Hi Andrew, : I am submitting this small patch for the file htdig/main.cc. It allows : the option of digging new URLs without having to re-dig all the URLs that : are already in the database. The patch is included as an attachment, and is : to be fed into Larry Wall's patch program's stdin. It will prompt you for : the file to patch (main.cc) which it will *replace*. I should mention that : I am talking about the htdig-3.0.8b2 version in this context. : Of course, once the new URL(s) are dug, we still have to htmerge : afterwards but we save an awful amount of time by incrementally adding : URLs rather than digging everything from scratch. : I hope I have not grossly overlooked anything. I looked into the whole : htdig program, after which I concluded that this simple fix should do. I : hope I am right... : I have tested it on one small database and one rather large database, : with great success. If you agree, I would hope to give the htdig : community the chance to beat on the patch while the stuff is still in beta. : regards, : Edmond Abrahamian (edmond@greencedars.com.lb) : 202,204c : if (!new_urls_only) { : List *list = docs.URLs(); : retriever.Initial(*list); : delete list; : } : . : 200c : // URLs to the initial list of the retriever. However do this only : // if we are not adding new URLs only (i.e. -n option) : . 
: 196c : // seed the retriever object with the list of start URLs, unless : // we're requesting to add new URLs only without scanning the ones : // already in the database (-n option), in which case we will seed : // the retriever with those new URLs only : if (!new_urls_only) : retriever.Initial(config["start_url"]); : else : retriever.Initial(config["new_url"]); : . : 151a : . : 143a : . : 142c : String l; : if (!new_urls_only) : l = config["limit_urls_to"]; : else : l = config["limit_new_urls_to"]; : . : 65a : case 'n': : new_urls_only=1; : break; : . : 38c : while ((c = getopt(ac, av, "sc:vith:u:an")) != -1) : . : 33a : int new_urls_only = 0; : . : ------------------------------------------------------------------------ : * Next message: Olivier PRENANT: "htdig: htdig & apache 1.3xx" : * Previous message: Pierre Garriga: "htdig-3.0.8b2" : ------------------------------------------------------------------------ :This archive was generated by hypermail 2.0b3 on Sat Jan 02 1999 - 16:26:01 PST --- htdig.cc.orig Fri Feb 25 05:29:10 2000 +++ htdig.cc Fri Jul 21 19:00:47 2000 @@ -33,6 +33,7 @@ FILE *urls_seen = NULL; FILE *images_seen = NULL; String configFile = DEFAULT_CONFIG_FILE; +int new_urls_only = 0; void usage(); void reportError(char *msg); @@ -55,7 +56,7 @@ // // Parse command line arguments // - while ((c = getopt(ac, av, "lsc:vith:u:a")) != -1) + while ((c = getopt(ac, av, "lsc:vith:u:an")) != -1) { int pos; switch (c) @@ -89,6 +90,9 @@ case 'l': flag = Retriever_logUrl; break; + case 'n': + new_urls_only = 1; + break; case '?': usage(); } @@ -184,7 +188,11 @@ // // Set up the limits list // - StringList l(config["limit_urls_to"], " \t"); + StringList l; + if (new_urls_only == 0) + l.Create(config["limit_urls_to"], " \t"); + else + l.Create(config["limit_new_urls_to"], " \t"); limits.IgnoreCase(); limits.Pattern(l.Join('|')); l.Release(); @@ -234,19 +242,32 @@ // // Create the Retriever object which we will use to parse all the // HTML files. 
- // In case this is just an update dig, we will add all existing - // URLs? - // Retriever retriever(flag); - List *list = docs.URLs(); - retriever.Initial(*list); - delete list; - - // Add start_url to the initial list of the retriever. - // Don't check a URL twice! - // Beware order is important, if this bugs you could change - // previous line retriever.Initial(*list, 0) to Initial(*list,1) - retriever.Initial(config["start_url"], 1); + + // In case this is just an update dig, we will add all existing + // URLs to the initial list of the retriever. However do this only + // if we are not adding new URLs only (i.e. -n option) + + if (new_urls_only == 0) + { + List *list = docs.URLs(); + retriever.Initial(*list); + delete list; + } + else + if (debug) + { + cout << "adding new URLs only (-n option given)" << endl; + } + + // Seed the retriever object with the list of start URLs, unless + // we're requesting to add new URLs only without scanning the ones + // already in the database (-n option), in which case we will seed + // the retriever with those new URLs only + if (new_urls_only == 0) + retriever.Initial(config["start_url"], 1); + else + retriever.Initial(config["new_url"], 1); // // Go do it!