From toivo@eleusis.ucs.uwa.edu.au Thu Sep 20 12:01:25 2001 Date: Thu, 20 Sep 2001 11:11:45 +0800 From: Toivo Pedaste To: htdig-dev@lists.sourceforge.net Subject: [htdig-dev] Fix for phrase searching [ The following text is in the "iso-8859-1" character set. ] [ Your display is set for the "US-ASCII" character set. ] [ Some characters may be displayed incorrectly. ] Phrase searching can get extremely slow due to n**2 behaviour when there are large numbers of matches for each word. This patch to parser.cc makes it linear; could someone apply it? --- parser.cc.old Thu Sep 20 11:05:30 2001 +++ parser.cc Thu Sep 20 11:07:03 2001 @@ -20,6 +20,7 @@ #include "parser.h" #include "HtPack.h" #include "Collection.h" +#include "Dictionary.h" #define WORD 1000 #define DONE 1001 @@ -350,24 +351,50 @@ // OK, now we have a previous list in wordList and a new list List *results = new List; + Dictionary newDict(5000); + + String nid; + newWords->Start_Get(); + while ((newWord = (HtWordReference *) newWords->Get_Next())) + { + nid = ""; + int did = newWord->DocID(); + nid << did; + nid << "-"; + int loc = newWord->Location(); + nid << loc; + if (! 
newDict.Exists(nid)) { + newDict.Add(nid, (Object *)newWord); + } else { +// cerr << "perform_phrase: NewWords Duplicate: " << nid << "\n"; +// Double addition is a problem if you don't want your original objects deleted + } + } + + String oid; oldWords->Start_Get(); while ((oldWord = (HtWordReference *) oldWords->Get_Next())) { - newWords->Start_Get(); - while ((newWord = (HtWordReference *) newWords->Get_Next())) - { - if (oldWord->DocID() == newWord->DocID()) - if ((oldWord->Location() + 1) == newWord->Location()) - { - HtWordReference *result = new HtWordReference(*oldWord); + oid = ""; + int did = oldWord->DocID(); + oid << did; + oid << "-"; + int loc = oldWord->Location(); + oid << loc+1; + if (newDict.Exists(oid)) + { + newWord = (HtWordReference *)newDict.Find(oid); + + HtWordReference *result = new HtWordReference(*oldWord); - result->Flags(oldWord->Flags() & newWord->Flags()); - result->Location(newWord->Location()); + result->Flags(oldWord->Flags() & newWord->Flags()); + result->Location(newWord->Location()); - results->Add(result); - } + results->Add(result); } } + + newDict.Release(); if(debug) cerr << "old words count: " << oldWords->Count() << endl; if(debug) cerr << "results count: " << results->Count() << endl; -- Toivo Pedaste Email: toivo@ucs.uwa.edu.au University Communications Services, Phone: +61 8 9 380 2605 University of Western Australia Fax: +61 8 9 380 1109 "The time has come", the Walrus said, "to talk of many things"... _______________________________________________ htdig-dev mailing list htdig-dev@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/htdig-dev