This patch extends the external_parsers attribute to allow definition of external converters, which convert one content-type to another. --- htdig-3.1.3/htdig/ExternalParser.cc.noconv Wed Sep 22 11:18:40 1999 +++ htdig-3.1.3/htdig/ExternalParser.cc Tue Oct 19 16:40:09 1999 @@ -11,6 +11,9 @@ static char RCSid[] = "$Id: ExternalPars #endif #include "ExternalParser.h" +#include "HTML.h" +#include "Plaintext.h" +#include "PDF.h" #include "htdig.h" #include "htString.h" #include "QuotedStringList.h" @@ -21,6 +24,7 @@ static char RCSid[] = "$Id: ExternalPars #include "good_strtok.h" static Dictionary *parsers = 0; +static Dictionary *toTypes = 0; extern String configFile; //***************************************************************************** @@ -88,13 +92,25 @@ ExternalParser::canParse(char *contentTy if (!parsers) { parsers = new Dictionary(); + toTypes = new Dictionary(); QuotedStringList qsl(config["external_parsers"], " \t"); + String from, to; int i; + int sep; for (i = 0; qsl[i]; i += 2) { - parsers->Add(qsl[i], new String(qsl[i + 1])); + from = qsl[i]; + to = ""; + sep = from.indexOf("->"); + if (sep != -1) + { + to = from.sub(sep+2).get(); + from = from.sub(0, sep).get(); + } + parsers->Add(from, new String(qsl[i + 1])); + toTypes->Add(from, new String(to)); } } return parsers->Exists(contentType); @@ -150,8 +166,45 @@ ExternalParser::parse(Retriever &retriev char *token1, *token2, *token3; int loc, hd; URL url; + String convertToType = ((String *)toTypes->Find(contentType))->get(); + int get_hdr = (mystrcasecmp(convertToType, "user-defined") == 0); + int get_file = (convertToType.length() != 0); + String newcontent; while (readLine(input, line)) { + if (get_hdr) + { + line.chop('\r'); + if (line.length() == 0) + get_hdr = FALSE; + else if (mystrncasecmp(line, "content-type:", 13) == 0) + { + token1 = line.get() + 13; + while (*token1 && isspace(*token1)) + token1++; + token1 = strtok(token1, "\n\t"); + convertToType = token1; + } + continue; + } + if 
(get_file) + { + if (newcontent.length() == 0 && + !canParse(convertToType) && + mystrncasecmp(convertToType, "text/", 5) != 0 && + mystrncasecmp(convertToType, "application/pdf", 15) != 0) + { + if (mystrcasecmp(convertToType, "user-defined") == 0) + cerr << "External parser error: no Content-Type given\n"; + else + cerr << "External parser error: can't parse Content-Type \"" + << convertToType << "\"\n"; + cerr << " URL: " << base.get() << "\n"; + break; + } + newcontent << line << '\n'; + continue; + } token1 = strtok(line, "\t"); if (token1 == NULL) token1 = ""; @@ -340,6 +393,50 @@ ExternalParser::parse(Retriever &retriev } pclose(input); unlink(path); + + if (newcontent.length() > 0) + { + static HTML *html = 0; + static Plaintext *plaintext = 0; + static PDF *pdf = 0; + Parsable *parsable = 0; + + contentType = convertToType; + if (canParse(contentType)) + { + currentParser = ((String *)parsers->Find(contentType))->get(); + parsable = this; + } + else if (mystrncasecmp(contentType, "text/html", 9) == 0) + { + if (!html) + html = new HTML(); + parsable = html; + } + else if (mystrncasecmp(contentType, "text/plain", 10) == 0) + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + } + else if (mystrncasecmp(contentType, "application/pdf", 15) == 0) + { + if (!pdf) + pdf = new PDF(); + parsable = pdf; + } + else + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + if (debug) + cout << "External parser error: \"" << contentType << + "\" not a recognized type. Assuming text\n"; + } + parsable->setContents(newcontent.get(), newcontent.length()); + parsable->parse(retriever, base); + } } --- htdig-3.1.3/htdoc/attrs.html.noconv Wed Sep 22 11:18:41 1999 +++ htdig-3.1.3/htdoc/attrs.html Wed Oct 20 11:37:52 1999 @@ -1625,9 +1625,29 @@ content-type that the parser can handle while the second string of each pair is the path to the external parsing program. If quoted, it may contain parameters, - separated by spaces.
+ separated by spaces.
+ External parsing can also be done with external
+ converters, which convert one content-type to
+ another. To do this, instead of just specifying
+ a single content-type as the first string
+ of a pair, you specify two types, in the form
+ type1->type2,
+ as a single string with no spaces. The second
+ string will define an external converter
+ rather than an external parser, to convert
+ the first type to the second. If the second
+ type is user-defined, then
+ it's up to the converter script to put out a
+ "Content-Type: type" header followed
+ by a blank line, to indicate to htdig what type it
+ should expect for the output, much like what a CGI
+ script would do. The resulting content-type must
+ be one that htdig can parse, either internally,
+ or with another external parser or converter.
+ Only one external parser or converter can be
+ specified for any given content-type.
The parser program takes four command-line
- parameters, not counting parameters and parameters
+ parameters, not counting any parameters already
given in the command string:
infile content-type URL configuration-file
The external parser is to write information for
- htdig on its standard output.
+ htdig on its standard output. Unless it is an
+ external converter, which will output a document
+ of a different content-type, its output must
+ follow the format described here.
The output consists of records, each record terminated
with a newline. Each record is a series of (unless
expressively allowed to be empty) non-empty tab-separated
@@ -1927,7 +1950,9 @@