[Xapian-discuss] Xapian::Queryparser / Encoding Problem (Utf8)

Sat Aug 20 18:08:31 BST 2005

On Tue, Aug 16, 2005 at 12:49:51AM +0200, R. Mattes wrote:
> Could you send me the patch?

Attached.  I got this off the gmane machine, which lost a disk recently,
so I'm not totally certain it's the latest version.  I can't get to my
dev box at present as I'm away from home and failed to open a suitable
hole in the firewall before I left...

Cheers,
    Olly
-------------- next part --------------
diff -ru orig/xapian-core-0.9.1/queryparser/Makefile.am xapian-core-0.9.1/queryparser/Makefile.am

--- orig/xapian-core-0.9.1/queryparser/Makefile.am	2005-06-07 17:59:07.000000000 +0200
+++ xapian-core-0.9.1/queryparser/Makefile.am	2005-06-23 01:39:35.000000000 +0200
@@ -1,6 +1,6 @@
 ## Process this file with automake to produce Makefile.in
 
-INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api
+INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include
 
 noinst_HEADERS = accentnormalisingitor.h symboltab.h queryparser_internal.h \
 	queryparser_token.h
@@ -22,3 +22,4 @@
 endif
 
 libqueryparser_la_SOURCES = queryparser.cc queryparser_internal.cc
+libqueryparser_la_LIBADD = /usr/lib/libglib-2.0.la
diff -ru orig/xapian-core-0.9.1/queryparser/accentnormalisingitor.h xapian-core-0.9.1/queryparser/accentnormalisingitor.h
--- orig/xapian-core-0.9.1/queryparser/accentnormalisingitor.h	2005-06-07 17:59:07.000000000 +0200
+++ xapian-core-0.9.1/queryparser/accentnormalisingitor.h	2005-06-23 03:22:19.000000000 +0200
@@ -19,64 +19,74 @@
  */
 
 #include "symboltab.h"
+#include <glib/gunicode.h>
 
 #include <string>
 
 using std::string;
 
-/** A wrapper class for a char which returns the char if dereferenced 
+typedef gunichar char_type;
+
+/** A wrapper class for a char_type which returns the char_type if dereferenced 
  *  with *.  We need this to implement input_iterator semantics.
  */
 class CharWrapper {
     private:
-	char ch;
+	char_type ch;
     public:
-	CharWrapper(char ch_) : ch(ch_) { }
-	char operator*() const { return ch; }
+	CharWrapper(char_type ch_) : ch(ch_) { }
+	char_type operator*() const { return ch; }
 };
 
 class AccentNormalisingItor {
   private:
-    string::const_iterator itor;
-    char queued;
+    /*string::const_iterator*/const gchar * itor;
+    const gchar * end;
+
+    char_type queued;
     size_t trans;
 
   public:
     AccentNormalisingItor()
-	: itor(), queued(0), trans(0) {}
-    AccentNormalisingItor(string::const_iterator itor_)
-	: itor(itor_), queued(0), trans(0) {}
+	: itor(NULL), queued(0), trans(0) {}
+    explicit AccentNormalisingItor(const char * itor_)
+	: itor(itor_), end(itor_), queued(0), trans(0) {}
+    AccentNormalisingItor(const char * itor_, const char *end_)
+	: itor(itor_), end(end_), queued(0), trans(0) {}
+#if 0
     void operator=(string::const_iterator itor_)
     {
 	itor = itor_;
+	end = end_;
 	queued = 0;
 	trans = 0;
     }
+#endif
     bool operator==(const AccentNormalisingItor &o) const {
 	return queued == o.queued && itor == o.itor;
     }
     bool operator!=(const AccentNormalisingItor &o) const {
 	return !(*this == o);
     }
-    char operator*() const {
+    char_type operator*() const {
 	if (queued) return queued;
-	unsigned char ch = (unsigned char)*itor;
+	char_type ch = g_utf8_get_char_validated(itor, end - itor);
 	if (ch >= 160
-#if CHAR_BIT > 8 // Avoid compiler warning.
-		      && ch < 256
-#endif
-				 ) return TRANSLIT1[ch - 160];
-	return (char)ch;
+//#if CHAR_BIT > 8 // Avoid compiler warning.
+		      && ch < 0x240
+//#endif
+				 ) ch = /*return*/ char_type(TRANSLIT1[ch - 160]);
+	return /*(char)*/ch;
     }
     AccentNormalisingItor & operator++() {
 	if (queued) {
 	    queued = 0;
 	} else {
-	    unsigned char ch = (unsigned char)*itor;
+	    char_type ch = g_utf8_get_char_validated(itor, end - itor);
 	    if (ch >= 160
-#if CHAR_BIT > 8 // Avoid compiler warning.
-			  && ch < 256
-#endif
+//#if CHAR_BIT > 8 // Avoid compiler warning.
+		      && ch < 0x240
+//#endif
 				     ) {
 		++trans;
 		ch = TRANSLIT2[ch - 160];
@@ -86,16 +96,22 @@
 		}
 	    }
 	}
-	++itor;
+	// ++itor;  becomes:
+	size_t skip = g_utf8_skip[*reinterpret_cast<const guchar *>(itor)];
+	if (size_t(end - itor) < skip) {
+	    itor = end;
+	} else {
+	    itor += skip;
+	}
 	return *this;
     }
     CharWrapper operator++(int) {
-	char tmp = **this;
+	char_type tmp = **this;
 	operator++();
 	return CharWrapper(tmp);
     }
     size_t transliterations() const { return trans; }
-    string::const_iterator raw() const { return itor; }
+    //string::const_iterator raw() const { return itor; }
 
     /// We implement the semantics of an STL input_iterator.
     //@{
diff -ru orig/xapian-core-0.9.1/queryparser/queryparser.lemony xapian-core-0.9.1/queryparser/queryparser.lemony
--- orig/xapian-core-0.9.1/queryparser/queryparser.lemony	2005-06-07 17:59:07.000000000 +0200
+++ xapian-core-0.9.1/queryparser/queryparser.lemony	2005-06-23 03:22:45.000000000 +0200
@@ -36,6 +36,41 @@
 
 using namespace Xapian;
 
+static inline bool
+U_isupper(gunichar ch) {
+    return (ch < 128 && C_isupper(ch));
+}
+
+static inline bool
+U_isspace(gunichar ch) {
+    return (ch < 128 && C_isspace(ch));
+}
+
+static inline bool
+U_isnotspace(gunichar ch) {
+    return !U_isspace(ch);
+}
+
+static inline bool
+U_isalnum(gunichar ch) {
+    return (ch < 128 && C_isalnum(ch));
+}
+
+static inline bool
+U_isnotalnum(gunichar ch) {
+    return !U_isalnum(ch);
+}
+
+static inline bool
+U_issign(gunichar ch) {
+    return (ch < 128 && C_issign(ch));
+}
+
+static inline bool
+G_unichar_isnotalnum(gunichar ch) {
+    return !g_unichar_isalnum(ch);
+}
+
 // Disable debug code lemon adds.
 #define NDEBUG
 
@@ -93,27 +128,27 @@
 static inline string
 downcase_term(const string &term)
 {
-    string t;
-    t.reserve(term.size());
-    AccentNormalisingItor i(term.begin());
-    const AccentNormalisingItor end(term.end());
-    while (i != end) t += C_tolower(*i++);
+    gchar * r;  // receives the newly allocated, lower-cased UTF-8 copy of term
+    r = g_utf8_strdown(static_cast<const gchar*>(term.data()),
+		       term.length());
+    string t(static_cast<char *>(r));  // copy into a std::string before releasing r
+    g_free(r);  // GLib-allocated memory must be released with g_free(), not free()
     return t;
 }
 
 static inline bool
-is_phrase_generator(unsigned char ch)
+is_phrase_generator(gunichar ch)
 {
     // These characters generate a phrase search.
     // Ordered mostly by frequency of calls to this function done when
     // running queryparsertest.
-    return (ch && strchr(".-/':\\_@", ch) != NULL);
+    return (ch && ch < 128 && strchr(".-/':\\_@", ch) != NULL);
 }
 
 static inline bool
-prefix_needs_colon(const string & prefix, unsigned char ch)
+prefix_needs_colon(const string & prefix, gunichar ch)
 {
-    if (!C_isupper(ch)) return false;
+    if (!U_isupper(ch)) return false;
     string::size_type len = prefix.length();
     return (len > 1 && prefix[len - 1] != ':');
 }
@@ -126,6 +161,7 @@
 Query
 QueryParser::Internal::parse_query(const string &qs, unsigned int flags)
 {
+    gchar ubuf[6];
 #ifndef NDEBUG
     // Set the prefix added to Lemon's debug output, if it's enabled.
     // FIXME: arrange to send this to the Xapian debug log, and turn
@@ -136,29 +172,29 @@
     void * pParser = ParseAlloc(malloc);
 
     termpos term_pos = 1;
-    AccentNormalisingItor it(qs.begin()), end(qs.end());
+    AccentNormalisingItor it(qs.data(), qs.data() + qs.size()), end(qs.data() + qs.size());
 
     State state(this);
 
     enum { DEFAULT, IN_QUOTES, IN_PHRASED_TERM } mode = DEFAULT;
-    unsigned char newprev = ' ';
+    gunichar newprev = ' ';
     while (it != end) {
 	if (mode == IN_PHRASED_TERM) mode = DEFAULT;
-	if (C_isspace(*it)) {
+	if (U_isspace(*it)) {
 	    newprev = ' ';
 	    ++it;
-	    it = find_if(it, end, C_isnotspace);
+	    it = find_if(it, end, U_isnotspace);
 	    if (it == end) break;
 	}
 
-	if (!C_isalnum(*it)) {
-	    unsigned char prev = newprev;
-	    unsigned char ch = *it++;
+	if (!g_unichar_isalnum(*it)) {
+	    gunichar prev = newprev;
+	    gunichar ch = *it++;
 	    if (it != end) newprev = *it;
 	    switch (ch) {
 	      case '"':
 		// Skip whitespace.
-		it = find_if(it, end, C_isnotspace);
+		it = find_if(it, end, U_isnotspace);
 		if (mode != IN_QUOTES) {
 		    if (it == end) {
 			// Ignore an unmatched " at the end of the query to
@@ -191,7 +227,7 @@
 		    // Or if not after whitespace or an open bracket.
 		    continue;
 		}
-		if (C_isspace(*it) || *it == '+' || *it == '-') {
+		if (U_isspace(*it) || *it == '+' || *it == '-') {
 		    // Ignore + or - followed by a space, or further + or -.
 		    // Postfix + (such as in C++ and H+) is handled as part of
 		    // the term lexing code below.
@@ -204,7 +240,7 @@
 
 	      case '(':
 		// Skip whitespace.
-		it = find_if(it, end, C_isnotspace);
+		it = find_if(it, end, U_isnotspace);
 		// Ignore ( at end of query.
 		if (it == end) goto done;
 		if (prev > ' ' && prev != '(' && prev != ')') {
@@ -239,24 +275,25 @@
 	string prefix;
 	if (mode == DEFAULT && !prefixes.empty()) {
 	    // Check for fieldname prefixes (e.g. title:historical).
-	    AccentNormalisingItor p = find_if(it, end, C_isnotalnum);
+	    AccentNormalisingItor p = find_if(it, end, G_unichar_isnotalnum);
 	    if (p != end && *p == ':' && ++p != end) {
-		unsigned char ch = *p;
-		if (C_isalnum(ch) ||
+		gunichar ch = *p;
+		if (g_unichar_isalnum(ch) ||
 		    ((flags & FLAG_PHRASE) && ch == '"') || 
 		    ((flags & FLAG_BOOLEAN) && ch == '(')) {
 		    string field;
 		    p = it;
-		    while (*p != ':') field += *p++;
+		    while (*p != ':')
+			field += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
 		    map<string, pair<bool, string> >::const_iterator f;
 		    f = prefixes.find(field);
 		    if (f != prefixes.end()) {
 			// Can't boolean prefix a subexpression or phrase.
 			bool boolean_filter = f->second.first;
-			if (!boolean_filter || C_isalnum(ch)) {
+			if (!boolean_filter || g_unichar_isalnum(ch)) {
 			    it = p;
 			    ++it;
-			    if (!C_isalnum(ch)) {
+			    if (!g_unichar_isalnum(ch)) {
 				newprev = ch;
 				++it;
 				state.push_prefix(f->second.second);
@@ -275,7 +312,7 @@
 				if (prefix_needs_colon(prefix, *it))
 				    prefix += ':';
 				while (it != end && *it > ' ' && *it != ')')
-				    prefix += *it++;
+				    prefix += string(ubuf, g_unichar_to_utf8(*it++, ubuf));
 				Parse(pParser, BOOLEAN_FILTER,
 				      new Term(prefix, 0), &state);
 				continue;
@@ -291,18 +328,18 @@
 	size_t transliterations = it.transliterations();
 	// Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
 	// Don't worry if there's a trailing '.' or not.
-	if (C_isupper(*it)) {
+	if (U_isupper(*it)) {
 	    string t;
 	    AccentNormalisingItor p = it;
 	    do {
-		t += *p++;
-	    } while (p != end && *p == '.' && ++p != end && C_isupper(*p));
+		t += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+	    } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
 	    // One letter does not make an acronym!  If we handled a single
 	    // uppercase letter here, we wouldn't catch M&S below.
 	    if (t.length() > 1) {
 		// Check there's not a (lower case) letter or digit
 		// immediately after it.
-		if (p == end || !C_isalnum(*p)) {
+		if (p == end || !g_unichar_isalnum(*p)) {
 		    it = p;
 		    swap(term, t);
 		}
@@ -312,17 +349,17 @@
 
 	if (term.empty()) {
 	    while (it != end) {
-		if (!C_isalnum(*it)) {
+		if (!g_unichar_isalnum(*it)) {
 		    // Treat a single embedded '&' as a word character
 		    // (e.g. AT&T).
 		    if (*it != '&') break;
 		    AccentNormalisingItor p = it;
 		    ++p;
-		    if (p == end || !C_isalnum(*p)) break;
+		    if (p == end || !g_unichar_isalnum(*p)) break;
 		}
-		term += *it++;
+		term.append(ubuf, g_unichar_to_utf8(*it++, ubuf));
 	    }
-	    if (it != end && (*it == '#' || C_issign(*it))) {
+	    if (it != end && (*it == '#' || U_issign(*it))) {
 		string suff_term = term;
 		AccentNormalisingItor p = it;
 		if (*p == '#') {
@@ -331,17 +368,17 @@
 		    while (++p != end && *p == '#') { }
 		} else {
 		    // Keep trailing +, and - (e.g. C++, Na+, Cl-).
-		    // FIXME: keeping trailing "-" is of dubious utilpy and
+		    // FIXME: keeping trailing "-" is of dubious utility and
 		    // there's the risk of hyphens getting stuck onto the end of
 		    // terms...
 		    // FIXME: generating a term like foo+---+++ doesn't make
 		    // much sense - we should probably be more conservative as
 		    // to what combinations are allowed.
 		    do {
-			suff_term += *p++;
-		    } while (p != end && C_issign(*p));
+			suff_term += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+		    } while (p != end && U_issign(*p));
 		}
-		if (p == end || !C_isalnum(*p)) {
+		if (p == end || !g_unichar_isalnum(*p)) {
 		    // If the suffixed term doesn't exist, check that the
 		    // non-suffixed term does.  This also takes care of
 		    // the case when QueryParser::set_database() hasn't
@@ -358,7 +395,7 @@
 	if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
 	    // Don't want to interpret A.N.D. or ?ND as an AND operator.
 	    if (!was_acronym && transliterations == it.transliterations()) {
-		if (prefix.empty() && !term.empty() && C_isupper(term[0])) {
+		if (prefix.empty() && !term.empty() && U_isupper(term[0])) {
 		    if (term == "AND") {
 			Parse(pParser, AND, NULL, &state);
 			continue;
@@ -388,13 +425,13 @@
 	    // stem terms in a phrased term with '.' phrase generators -
 	    // e.g. "example.com" should give a phrase search for "exampl"
 	    // and "com", not "example" and "com".
-	    if (p == end || C_isspace(*p)) {
+	    if (p == end || U_isspace(*p)) {
 		it = p;
 		// If topterms added a term with a trailing '.', it will be
 		// lower case.  So if it has an initial capital it must be an
 		// initial in someone's name, a full stop in pasted text or
 		// something like that.
-		if (!C_isupper(term[0])) {
+		if (!U_isupper(term[0])) {
 		    unstemmed_term = term + '.';
 		    need_to_stem = false;
 		}
@@ -404,7 +441,7 @@
 	if (unstemmed_term.empty()) unstemmed_term = term;
 	term = downcase_term(term);
 	if (need_to_stem) {
-	    if (stem_action == STEM_SOME && C_isupper(unstemmed_term[0]))
+	    if (stem_action == STEM_SOME && U_isupper(unstemmed_term[0]))
 		term = 'R' + term;
 	    else 
 		term = stemmer.stem_word(term);
@@ -436,7 +473,7 @@
 	    } while (it != end && is_phrase_generator(*it));
 	    // Don't generate a phrase unless the phrase generators are
 	    // immediately followed by another term.
-	    if (it != end && C_isalnum(*it)) {
+	    if (it != end && g_unichar_isalnum(*it)) {
 		mode = IN_PHRASED_TERM;
 		goto phrased_term;
 	    }
-------------- next part --------------
A non-text attachment was scrubbed...
Name: symboltab.h
Type: text/x-chdr
Size: 1063 bytes
Desc: not available
Url : http://lists.tartarus.org/pipermail/xapian-discuss/attachments/20050820/5a08d009/symboltab.bin