From jamie.anstice@sli-systems.com Mon Oct 15 23:40:13 2001 Date: Tue, 16 Oct 2001 16:08:10 +1300 From: Jamie Anstice To: htdig-general@lists.sourceforge.net Subject: [htdig] PATCH: untranslated entity presentation (3.2.x) Here's a quickie that someone else might like to verify if they've run into the same problem. When htdig encounters an entity that it doesn't know about (say &#146; - which should really be &#8217; but that's another issue) it copies it verbatim to the extract - so far so good. When the extract is sent out in Display::hilight, the extract is decoded with HtSGMLCodec to transform the unsigned char characters to entities, and as well as the characters above 160 it translates & to &amp;, which is fine except when & is the start of an entity. This is what leaves things like &amp;#146; in extracts. Here's a patch to HtSGMLCodec::decode to make sure that it doesn't break real entities. ====================================== diff -rup htdig/htcommon/HtSGMLCodec.cc htdig-patch2/htcommon/HtSGMLCodec.cc --- htdig/htcommon/HtSGMLCodec.cc Fri Oct 20 16:40:55 2000 +++ htdig-patch2/htcommon/HtSGMLCodec.cc Tue Oct 16 15:37:05 2001 @@ -19,6 +19,8 @@ #include "HtSGMLCodec.h" +#include <ctype.h> + // Constructor: parses the appropriate parameters using the // encapsulated HtWordCodec class. // Only used in privacy. @@ -106,5 +108,92 @@ HtSGMLCodec::instance() return _instance; } + + +// *********************************************** +int +HtSGMLCodec::IsEntity( const String &entity ) const +{ + // entity if starts with &, finishes with ;, has no spaces, is at least 3 chars long + // if second char is # and the third is not x, the others are decimal digits, min len 4 + // if the entity starts with &#x then the remaining digits must be hexidecimal, min len 5 + // I'm not supporting entities that don't end with a semi-colon. 
+ + int is_decimal = 0; + int is_hex = 0; + int len = entity.length(); + int start = 1; + + if (len < 3 && entity[0] != '&' && entity[len-1] != ';' ) + return 0; + + if ( entity[1] == '#' ) + { + if ( len > 3 && ( entity[2] == 'x' || entity[2] == 'X' ) ) { + is_hex = 1; + start = 3; + if ( len < 5 ) + return 0; + } else { + is_decimal = 1; + start = 2; + if ( len < 4 ) + return 0; + } + } + + for (int i = start; i < len-start-1; i++ ) + { + if ( !isalnum( entity[i] ) ) + return 0; + + if ( is_decimal && !isdigit( entity[i] ) ) + return 0; + if ( is_hex && !isxdigit( entity[i] ) ) + return 0; + } + return 1; +} + + +// *********************************************** +String HtSGMLCodec::decode(const String &coded) const +{ + String out; + int semi_pos = -1; + int amp_pos = coded.indexOf( '&' ); + int last_pos = 0; + + while( last_pos <= coded.length() ) + { + amp_pos = coded.indexOf( '&', last_pos ); + + if ( amp_pos != -1 ) + semi_pos = coded.indexOf( ';', amp_pos+1 ); + else + semi_pos = -1; + + if ( amp_pos == -1 || semi_pos == -1 ) // no more possible entities + { + out << myTextWordCodec->decode( coded.sub( last_pos) ); + break; + } + semi_pos++; // jump over the semi-colon + + if ( IsEntity( coded.sub(amp_pos, semi_pos - amp_pos ) ) ) + { + out << myTextWordCodec->decode( coded.sub(last_pos, amp_pos - last_pos ) ); + out << coded.sub(amp_pos, semi_pos - amp_pos ); + } + else + { + out << myTextWordCodec->decode( coded.sub( last_pos, semi_pos - amp_pos ) ); + } + last_pos = semi_pos; + } + + return out; +} + // End of HtSGMLCodec.cc diff -rup htdig/htcommon/HtSGMLCodec.h htdig-patch2/htcommon/HtSGMLCodec.h --- htdig/htcommon/HtSGMLCodec.h Fri Oct 20 16:40:55 2000 +++ htdig-patch2/htcommon/HtSGMLCodec.h Tue Oct 16 15:33:08 2001 @@ -33,8 +33,8 @@ public: { return myTextWordCodec->encode(myNumWordCodec->encode(uncoded)); } // But we only want to decode into one form i.e. 
&foo; NOT &#nnn; - String decode(const String &coded) const - { return myTextWordCodec->decode(coded); } + // but we don't want to decode & if it's part of an entity. + String decode(const String &coded) const; // If an error was discovered during the parsing of // entities, this returns an error message @@ -54,6 +54,9 @@ private: HtSGMLCodec(); HtSGMLCodec(const HtSGMLCodec &); void operator= (const HtSGMLCodec &); + + //! returns true if the parameter is an entity. + int IsEntity( const String &entity ) const; HtWordCodec *myTextWordCodec; // For &foo; HtWordCodec *myNumWordCodec; // For &#foo; ====================================== Jamie Anstice Search Engineer S.L.I. Systems jamie.anstice@sli-systems.com ph: 64 961 3262 mobile: 64 21 264 9347 _______________________________________________ htdig-general mailing list To unsubscribe, send a message to <htdig-general-request@lists.sourceforge.net> with a subject of unsubscribe FAQ: http://htdig.sourceforge.net/FAQ.html