From grdetil@scrc.umanitoba.ca Tue Oct 19 15:05:36 1999 Date: Tue, 19 Oct 1999 16:53:42 -0500 (CDT) From: Gilles Detillieux To: htdig3-dev@htdig.org Subject: [htdig3-dev] external converters support for htdig Hi, folks. I finally bit the bullet last Friday and developed some code for the external converters idea that had been tossed around before, and has been rattling around in my brain off and on since. This was precipitated by a couple e-mail messages I received in the past couple weeks: first a request for an external parser for Excel spreadsheets, then an announcement for an Excel to HTML converter, xlHtml. Rather than coming up with a whole new C++ class, and another config attribute, and figuring how to tie it all into the Retriever and Document classes, I opted for a simpler approach, which I hope you won't think is too much of a kludge. I designed it as an extension to the ExternalParser class, which kept the changes localized to one source file. A lot of the work was already done for me as well, so it's a fairly simple change. The only thing I'm not wild about is the fact I needed to duplicate some of the Parsable selection code from the Document class, because the parser isn't given the Document object (and because I can't delete the current ExternalParser object from within the method that's working on it). Not a big deal, though. Please have a look, give it a try, and let me know what you think. The way it works is instead of just specifying a single content-type as the first string of a pair in the external_parsers attribute, you specify two types as type1->type2, as one string with no spaces, then the second string will define an external converter rather than an external parser, to convert the first type to the second. 
E.g.: external_parsers: application/pdf /usr/local/bin/parse_doc.pl \ application/msword->text/html /usr/local/bin/wordtohtml If the second type is "user-defined", then it's up to the converter script to put out a "Content-Type: text/foo" header followed by a blank line, to indicate to htdig what type it should expect for the output, sort of like what a CGI script would do. E.g.: external_parsers: application/msword->user-defined /usr/local/bin/mswordconv \ application/x-gunzip->user-defined /usr/local/bin/ungzipper where mswordconv would output a "Content-Type: text/html" header, a blank line, then the HTML output of mswordview, and ungzipper would gunzip its input, determine the type, and output the appropriate header before outputting the unzipped output. I'd also eventually like to add a second type of "magic", which would make htdig determine the content-type by looking at the start of the script's output. If anyone wants to develop a function to do that (or "borrow" it from Apache's mod_magic), it'd be a big help. Here's my patch, too late for the feature freeze of course, unless you vote it in. (It's a patch for 3.1.3, but should apply to 3.2, if you change the unlink() call in the last hunk to match the 3.2 source.) 
--- htdig-3.1.3/htdig/ExternalParser.cc.noconv Wed Sep 22 11:18:40 1999 +++ htdig-3.1.3/htdig/ExternalParser.cc Tue Oct 19 16:40:09 1999 @@ -11,6 +11,9 @@ static char RCSid[] = "$Id: ExternalPars #endif #include "ExternalParser.h" +#include "HTML.h" +#include "Plaintext.h" +#include "PDF.h" #include "htdig.h" #include "htString.h" #include "QuotedStringList.h" @@ -21,6 +24,7 @@ static char RCSid[] = "$Id: ExternalPars #include "good_strtok.h" static Dictionary *parsers = 0; +static Dictionary *toTypes = 0; extern String configFile; //***************************************************************************** @@ -88,13 +92,25 @@ ExternalParser::canParse(char *contentTy if (!parsers) { parsers = new Dictionary(); + toTypes = new Dictionary(); QuotedStringList qsl(config["external_parsers"], " \t"); + String from, to; int i; + int sep; for (i = 0; qsl[i]; i += 2) { - parsers->Add(qsl[i], new String(qsl[i + 1])); + from = qsl[i]; + to = ""; + sep = from.indexOf("->"); + if (sep != -1) + { + to = from.sub(sep+2).get(); + from = from.sub(0, sep).get(); + } + parsers->Add(from, new String(qsl[i + 1])); + toTypes->Add(from, new String(to)); } } return parsers->Exists(contentType); @@ -150,8 +166,45 @@ ExternalParser::parse(Retriever &retriev char *token1, *token2, *token3; int loc, hd; URL url; + String convertToType = ((String *)toTypes->Find(contentType))->get(); + int get_hdr = (mystrcasecmp(convertToType, "user-defined") == 0); + int get_file = (convertToType.length() != 0); + String newcontent; while (readLine(input, line)) { + if (get_hdr) + { + line.chop('\r'); + if (line.length() == 0) + get_hdr = FALSE; + else if (mystrncasecmp(line, "content-type:", 13) == 0) + { + token1 = line.get() + 13; + while (*token1 && isspace(*token1)) + token1++; + token1 = strtok(token1, "\n\t"); + convertToType = token1; + } + continue; + } + if (get_file) + { + if (newcontent.length() == 0 && + !canParse(convertToType) && + mystrncasecmp(convertToType, "text/", 5) != 0 && + 
mystrncasecmp(convertToType, "application/pdf", 15) != 0) + { + if (mystrcasecmp(convertToType, "user-defined") == 0) + cerr << "External parser error: no Content-Type given\n"; + else + cerr << "External parser error: can't parse Content-Type \"" + << convertToType << "\"\n"; + cerr << " URL: " << base.get() << "\n"; + break; + } + newcontent << line << '\n'; + continue; + } token1 = strtok(line, "\t"); if (token1 == NULL) token1 = ""; @@ -340,6 +393,50 @@ ExternalParser::parse(Retriever &retriev } pclose(input); unlink(path); + + if (newcontent.length() > 0) + { + static HTML *html = 0; + static Plaintext *plaintext = 0; + static PDF *pdf = 0; + Parsable *parsable = 0; + + contentType = convertToType; + if (canParse(contentType)) + { + currentParser = ((String *)parsers->Find(contentType))->get(); + parsable = this; + } + else if (mystrncasecmp(contentType, "text/html", 9) == 0) + { + if (!html) + html = new HTML(); + parsable = html; + } + else if (mystrncasecmp(contentType, "text/plain", 10) == 0) + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + } + else if (mystrncasecmp(contentType, "application/pdf", 15) == 0) + { + if (!pdf) + pdf = new PDF(); + parsable = pdf; + } + else + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + if (debug) + cout << "External parser error: \"" << contentType << + "\" not a recognized type. Assuming text\n"; + } + parsable->setContents(newcontent.get(), newcontent.length()); + parsable->parse(retriever, base); + } } -- Gilles R. Detillieux E-mail: Spinal Cord Research Centre WWW: http://www.scrc.umanitoba.ca/~grdetil Dept. Physiology, U. of Manitoba Phone: (204)789-3766 Winnipeg, MB R3E 3J7 (Canada) Fax: (204)789-3930 ------------------------------------ To unsubscribe from the htdig3-dev mailing list, send a message to htdig3-dev@htdig.org containing the single word "unsubscribe" in the SUBJECT of the message.