Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 16 Apr 2017 19:12:08 +0000 (UTC)
From:      Baptiste Daroussin <bapt@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r317034 - in head/lib/libc: string tests/string
Message-ID:  <201704161912.v3GJC868073706@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: bapt
Date: Sun Apr 16 19:12:07 2017
New Revision: 317034
URL: https://svnweb.freebsd.org/changeset/base/317034

Log:
  Fix strcoll_l disagreeing with strxfrm by reworking the forward order case in
  wcscoll_l().
  
  Illumos fixed this while grabbing back our patches:
  https://www.illumos.org/rb/r/402/
  
  This does not 100% fix what postgresql folks reported as there is still a
  remaining issue: https://www.illumos.org/issues/7962, it improves the situation
  
  The initial issue was reported in postgresql mailing lists:
  https://www.postgresql.org/message-id/flat/111D0E27-A8F3-4A84-A4E0-B0FB703863DF@s24.com#111D0E27-A8F3-4A84-A4E0-B0FB703863DF@s24.com
  
  Submitted by:	Yuri Pankov <yuri.pankov@nexenta.com>
  Obtained from:	Illumos
  MFC after:	2 weeks

Modified:
  head/lib/libc/string/wcscoll.c
  head/lib/libc/tests/string/wcscoll_test.c

Modified: head/lib/libc/string/wcscoll.c
==============================================================================
--- head/lib/libc/string/wcscoll.c	Sun Apr 16 17:53:44 2017	(r317033)
+++ head/lib/libc/string/wcscoll.c	Sun Apr 16 19:12:07 2017	(r317034)
@@ -1,5 +1,5 @@
 /*-
- * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
  * Copyright (c) 2002 Tim J. Robbins
  * All rights reserved.
  *
@@ -42,22 +42,22 @@ __FBSDID("$FreeBSD$");
 int
 wcscoll_l(const wchar_t *ws1, const wchar_t *ws2, locale_t locale)
 {
-	int len1, len2, pri1, pri2, ret;
+	int len1, len2, pri1, pri2;
 	wchar_t *tr1 = NULL, *tr2 = NULL;
 	int direc, pass;
+	int ret = wcscmp(ws1, ws2);
 
 	FIX_LOCALE(locale);
 	struct xlocale_collate *table =
 		(struct xlocale_collate*)locale->components[XLC_COLLATE];
 
-	if (table->__collate_load_error)
-		/*
-		 * Locale has no special collating order or could not be
-		 * loaded, do a fast binary comparison.
-		 */
-		return (wcscmp(ws1, ws2));
+	if (table->__collate_load_error || ret == 0)
+		return (ret);
 
-	ret = 0;
+	if (*ws1 == 0 && *ws2 != 0)
+		return (-1);
+	if (*ws1 != 0 && *ws2 == 0)
+		return (1);
 
 	/*
 	 * Once upon a time we had code to try to optimize this, but
@@ -77,19 +77,19 @@ wcscoll_l(const wchar_t *ws1, const wcha
 		const int32_t *st2 = NULL;
 		const wchar_t	*w1 = ws1;
 		const wchar_t	*w2 = ws2;
-		int check1, check2;
 
 		/* special pass for UNDEFINED */
 		if (pass == table->info->directive_count) {
-			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
+			direc = DIRECTIVE_FORWARD;
 		} else {
 			direc = table->info->directive[pass];
 		}
 
 		if (direc & DIRECTIVE_BACKWARD) {
 			wchar_t *bp, *fp, c;
+			free(tr1);
 			if ((tr1 = wcsdup(w1)) == NULL)
-				goto fail;
+				goto end;
 			bp = tr1;
 			fp = tr1 + wcslen(tr1) - 1;
 			while (bp < fp) {
@@ -97,8 +97,9 @@ wcscoll_l(const wchar_t *ws1, const wcha
 				*bp++ = *fp;
 				*fp-- = c;
 			}
+			free(tr2);
 			if ((tr2 = wcsdup(w2)) == NULL)
-				goto fail;
+				goto end;
 			bp = tr2;
 			fp = tr2 + wcslen(tr2) - 1;
 			while (bp < fp) {
@@ -111,6 +112,7 @@ wcscoll_l(const wchar_t *ws1, const wcha
 		}
 
 		if (direc & DIRECTIVE_POSITION) {
+			int check1, check2;
 			while (*w1 && *w2) {
 				pri1 = pri2 = 0;
 				check1 = check2 = 1;
@@ -120,7 +122,7 @@ wcscoll_l(const wchar_t *ws1, const wcha
 						    &pri1, pass, &st1);
 						if (pri1 < 0) {
 							errno = EINVAL;
-							goto fail;
+							goto end;
 						}
 						if (!pri1) {
 							pri1 = COLLATE_MAX_PRIORITY;
@@ -133,7 +135,7 @@ wcscoll_l(const wchar_t *ws1, const wcha
 						    &pri2, pass, &st2);
 						if (pri2 < 0) {
 							errno = EINVAL;
-							goto fail;
+							goto end;
 						}
 						if (!pri2) {
 							pri2 = COLLATE_MAX_PRIORITY;
@@ -149,58 +151,64 @@ wcscoll_l(const wchar_t *ws1, const wcha
 				w1 += len1;
 				w2 += len2;
 			}
+			if (!*w1) {
+				if (*w2) {
+					ret = -(int)*w2;
+					goto end;
+				}
+			} else {
+				ret = *w1;
+				goto end;
+			}
 		} else {
-			while (*w1 && *w2) {
-				pri1 = pri2 = 0;
-				check1 = check2 = 1;
-				while ((pri1 == pri2) && (check1 || check2)) {
-					while (check1 && *w1) {
-						_collate_lookup(table, w1,
-						    &len1, &pri1, pass, &st1);
-						if (pri1 > 0)
-							break;
-						if (pri1 < 0) {
-							errno = EINVAL;
-							goto fail;
-						}
-						st1 = NULL;
-						w1 += 1;
+			int vpri1 = 0, vpri2 = 0;
+			while (*w1 || *w2 || st1 || st2) {
+				pri1 = 1;
+				while (*w1 || st1) {
+					_collate_lookup(table, w1, &len1, &pri1,
+					    pass, &st1);
+					w1 += len1;
+					if (pri1 > 0) {
+						vpri1++;
+						break;
 					}
-					check1 = (st1 != NULL);
-					while (check2 && *w2) {
-						_collate_lookup(table, w2,
-						    &len2, &pri2, pass, &st2);
-						if (pri2 > 0)
-							break;
-						if (pri2 < 0) {
-							errno = EINVAL;
-							goto fail;
-						}
-						st2 = NULL;
-						w2 += 1;
+
+					if (pri1 < 0) {
+						errno = EINVAL;
+						goto end;
 					}
-					check2 = (st2 != NULL);
-					if (!pri1 || !pri2)
+					st1 = NULL;
+				}
+				pri2 = 1;
+				while (*w2 || st2) {
+					_collate_lookup(table, w2, &len2, &pri2,
+					    pass, &st2);
+					w2 += len2;
+					if (pri2 > 0) {
+						vpri2++;
 						break;
+					}
+					if (pri2 < 0) {
+						errno = EINVAL;
+						goto end;
+					}
+					st2 = NULL;
 				}
-				if (!pri1 || !pri2)
+				if ((!pri1 || !pri2) && (vpri1 == vpri2))
 					break;
 				if (pri1 != pri2) {
 					ret = pri1 - pri2;
 					goto end;
 				}
-				w1 += len1;
-				w2 += len2;
 			}
-		}
-		if (!*w1) {
-			if (*w2) {
-				ret = -(int)*w2;
+			if (vpri1 && !vpri2) {
+				ret = 1;
+				goto end;
+			}
+			if (!vpri1 && vpri2) {
+				ret = -1;
 				goto end;
 			}
-		} else {
-			ret = *w1;
-			goto end;
 		}
 	}
 	ret = 0;
@@ -210,10 +218,6 @@ end:
 	free(tr2);
 
 	return (ret);
-
-fail:
-	ret = wcscmp(ws1, ws2);
-	goto end;
 }
 
 int

Modified: head/lib/libc/tests/string/wcscoll_test.c
==============================================================================
--- head/lib/libc/tests/string/wcscoll_test.c	Sun Apr 16 17:53:44 2017	(r317033)
+++ head/lib/libc/tests/string/wcscoll_test.c	Sun Apr 16 19:12:07 2017	(r317034)
@@ -1,5 +1,7 @@
 /*-
  * Copyright (c) 2016 Baptiste Daroussin <bapt@FreeBSD.org>
+ * Copyright 2016 Tom Lane <tgl@sss.pgh.pa.us>
+ * Copyright 2017 Nexenta Systems, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -30,6 +32,8 @@ __FBSDID("$FreeBSD$");
 #include <wchar.h>
 #include <locale.h>
 #include <stdlib.h>
+#include <time.h>
+#include <errno.h>
 
 #include <atf-c.h>
 
@@ -55,9 +59,98 @@ ATF_TC_BODY(russian_collation, tc)
 	    "Bad collation, expected: '%ls' got '%ls'", res, c);
 }
 
+#define	NSTRINGS 2000
+#define	MAXSTRLEN 20
+#define	MAXXFRMLEN (MAXSTRLEN * 20)
+
+typedef struct {
+	char	sval[MAXSTRLEN];
+	char	xval[MAXXFRMLEN];
+} cstr;
+
+ATF_TC_WITHOUT_HEAD(strcoll_vs_strxfrm);
+ATF_TC_BODY(strcoll_vs_strxfrm, tc)
+{
+	cstr	data[NSTRINGS];
+	char	*curloc;
+	int	i, j;
+
+	curloc = setlocale(LC_ALL, "en_US.UTF-8");
+	ATF_CHECK_MSG(curloc != NULL, "Fail to set locale");
+
+	/* Ensure new random() values on every run */
+	srandom((unsigned int) time(NULL));
+
+	/* Generate random UTF8 strings of length less than MAXSTRLEN bytes */
+	for (i = 0; i < NSTRINGS; i++) {
+		char	*p;
+		int	len;
+
+again:
+		p = data[i].sval;
+		len = 1 + (random() % (MAXSTRLEN - 1));
+		while (len > 0) {
+			int c;
+			/*
+			 * Generate random printable char in ISO8859-1 range.
+			 * Bias towards producing a lot of spaces.
+			 */
+
+			if ((random() % 16) < 3) {
+				c = ' ';
+			} else {
+				do {
+					c = random() & 0xFF;
+				} while (!((c >= ' ' && c <= 127) ||
+				    (c >= 0xA0 && c <= 0xFF)));
+			}
+
+			if (c <= 127) {
+				*p++ = c;
+				len--;
+			} else {
+				if (len < 2)
+					break;
+				/* Poor man's utf8-ification */
+				*p++ = 0xC0 + (c >> 6);
+				len--;
+				*p++ = 0x80 + (c & 0x3F);
+				len--;
+			}
+		}
+		*p = '\0';
+		/* strxfrm() each string as we produce it */
+		errno = 0;
+		ATF_CHECK_MSG(strxfrm(data[i].xval, data[i].sval,
+		    MAXXFRMLEN) < MAXXFRMLEN, "strxfrm() result for %d-length "
+		    " string exceeded %d bytes", (int)strlen(data[i].sval),
+		    MAXXFRMLEN);
+
+		/*
+		 * Amend strxfrm() failing on certain characters to be fixed and
+		 * test later
+		 */
+		if (errno != 0)
+			goto again;
+	}
+
+	for (i = 0; i < NSTRINGS; i++) {
+		for (j = 0; j < NSTRINGS; j++) {
+			int sr = strcoll(data[i].sval, data[j].sval);
+			int sx = strcmp(data[i].xval, data[j].xval);
+
+			ATF_CHECK_MSG(!((sr * sx < 0) ||
+			    (sr * sx == 0 && sr + sx != 0)),
+			    "%s: diff for \"%s\" and \"%s\"",
+			    curloc, data[i].sval, data[j].sval);
+		}
+	}
+}
+
 ATF_TP_ADD_TCS(tp)
 {
 	ATF_TP_ADD_TC(tp, russian_collation);
+	ATF_TP_ADD_TC(tp, strcoll_vs_strxfrm);
 
 	return (atf_no_error());
 }



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201704161912.v3GJC868073706>