Index: source/test/intltest/dicttest.cpp
===================================================================
--- source/test/intltest/dicttest.cpp (revision 0)
+++ source/test/intltest/dicttest.cpp (revision 0)
@@ -0,0 +1,140 @@
+/*
+**********************************************************************
+* Copyright (C) 2011-2011, International Business Machines Corporation
+* and others. All Rights Reserved.
+**********************************************************************
+************************************************************************
+* Date Name Description
+* 05/14/2011 grhoten Creation.
+************************************************************************/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "dicttest.h"
+#include "textfile.h"
+#include "uvector.h"
+#include "unicode/rbbi.h"
+
+// Checks the Thai ("th") word and line break iterators against the hard-coded offsets below.
+void DictionaryWordTest::TestThaiBreaks() {
+    UErrorCode status = U_ZERO_ERROR;
+    Locale locale = Locale("th");
+    int32_t p, index;
+    UChar c[] = {
+        0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
+        0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
+        0x0E16, 0x0E49, 0x0E33, 0x0000
+    };
+    // Expected boundaries strictly inside the text; the loops below stop before the end-of-text boundary.
+    int32_t expectedWordResult[] = { 2, 3, 6, 10, 11, 15, 17, 20, 22 };
+    int32_t expectedLineResult[] = { 3, 6, 11, 15, 17, 20, 22 };
+    const int32_t wordCount = (int32_t)(sizeof(expectedWordResult) / sizeof(expectedWordResult[0]));
+    const int32_t lineCount = (int32_t)(sizeof(expectedLineResult) / sizeof(expectedLineResult[0]));
+    int32_t size = u_strlen(c);
+    UnicodeString text = UnicodeString(c);
+    BreakIterator* b = BreakIterator::createWordInstance(locale, status);
+    if (U_FAILURE(status)) {
+        errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
+        return;
+    }
+    b->setText(text);
+    for (index = 0; (p = b->next()) != BreakIterator::DONE && p < size; index++) {
+        if (index >= wordCount) {  // never index past the end of the expected table
+            errln("Unexpected extra break given by thai word break iterator. Got: %d", p);
+        } else if (p != expectedWordResult[index]) {
+            errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index], p);
+        }
+    }
+    if (index < wordCount) { errln("Thai word break iterator gave only %d breaks. Expected: %d", index, wordCount); }
+    delete b;
+
+    b = BreakIterator::createLineInstance(locale, status);
+    if (U_FAILURE(status)) {  // report through the test framework so the failure is counted
+        errcheckln(status, "Unable to create thai line break iterator. - %s", u_errorName(status));
+        return;
+    }
+    b->setText(text);
+    for (index = 0; (p = b->next()) != BreakIterator::DONE && p < size; index++) {
+        if (index >= lineCount) {  // never index past the end of the expected table
+            errln("Unexpected extra break given by thai line break iterator. Got: %d", p);
+        } else if (p != expectedLineResult[index]) {
+            errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index], p);
+        }
+    }
+    if (index < lineCount) { errln("Thai line break iterator gave only %d breaks. Expected: %d", index, lineCount); }
+    delete b;
+}
+
+#define DICTIONARY_TEST_FILE "wordsegments.txt"
+
+// Runs the word break iterator over each phrase in DICTIONARY_TEST_FILE; '|' marks every expected boundary.
+void DictionaryWordTest::TestWordBoundaries() {
+    UErrorCode status = U_ZERO_ERROR;
+    TextFile phrases(DICTIONARY_TEST_FILE, "UTF8", status);
+    if (U_FAILURE(status)) {
+        // Spaces around the macro keep it from being lexed as a C++11 user-defined-literal suffix.
+        dataerrln("Can't open " DICTIONARY_TEST_FILE ": %s; skipping test", u_errorName(status));
+        return;
+    }
+
+    // Due to how the word break iterator works,
+    // scripts for languages that use no spaces should use the correct dictionary by default.
+    BreakIterator *wb = BreakIterator::createWordInstance("en", status);
+    if (U_FAILURE(status)) {
+        dataerrln("Word break iterator can not be opened: %s; skipping test", u_errorName(status));
+        return;
+    }
+
+    int32_t pos, pIdx;
+    int32_t testLines = 0;
+    UnicodeString phrase;
+    while (phrases.readLineSkippingComments(phrase, status, FALSE) && U_SUCCESS(status)) {
+        UVector breaks(status);
+        // Strip each '|' and record its offset. Do not advance after a removal, so the
+        // character that slides into pIdx (possibly another '|') is examined as well.
+        for (pIdx = 0; pIdx < phrase.length(); ) {
+            if (phrase.charAt(pIdx) == 0x007C /* | */) {
+                breaks.addElement(pIdx, status);
+                phrase.remove(pIdx, 1);
+            } else {
+                pIdx++;
+            }
+        }
+        breaks.addElement(pIdx, status);  // the end of the phrase is always an expected boundary
+        wb->setText(phrase);
+        int32_t brkArrPos = 0;
+        while ((pos = wb->next()) != BreakIterator::DONE) {
+            int32_t expectedPos = breaks.elementAti(brkArrPos++);
+            if (expectedPos != pos) {
+                // Print the int32_t value: UVector::elementAt() returns a void*, which does not match %d.
+                errln("Incorrect forward word break on line %d. Expected: %d Got: %d",
+                      phrases.getLineNumber(), expectedPos, pos);
+            }
+        }
+        brkArrPos = breaks.size() - 1;
+        while ((pos = wb->previous()) != BreakIterator::DONE) {
+            int32_t expectedPos = (--brkArrPos >= 0) ? breaks.elementAti(brkArrPos) : 0;  // offset 0 is not stored
+            if (expectedPos != pos) {
+                errln("Incorrect backward word break on line %d. Expected: %d Got: %d",
+                      phrases.getLineNumber(), expectedPos, pos);
+            }
+        }
+        testLines++;
+    }
+    delete wb;
+    logln("%d tests were run.", testLines);
+}
+
+void DictionaryWordTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */) // par is unnamed and unused
+{ // Test dispatcher for this suite: registers TestThaiBreaks and TestWordBoundaries.
+ if (exec) logln("TestSuite DictionaryWordTest: "); // suite banner, logged only when exec is true
+ TESTCASE_AUTO_BEGIN; // NOTE(review): TESTCASE_AUTO_* are defined outside this file; presumably they map index to a test, set name, and run it when exec is true - confirm in intltest.h
+ TESTCASE_AUTO(TestThaiBreaks);
+ TESTCASE_AUTO(TestWordBoundaries);
+ TESTCASE_AUTO_END;
+}
+
+
+#endif
Property changes on: source/test/intltest/dicttest.cpp
___________________________________________________________________
Added: svn:eol-style
+ native
Index: source/test/intltest/intltest.vcxproj.filters
===================================================================
--- source/test/intltest/intltest.vcxproj.filters (revision 30114)
+++ source/test/intltest/intltest.vcxproj.filters (working copy)
@@ -444,6 +444,9 @@
collation
+
+ break iteration
+
@@ -812,5 +815,8 @@
collation
+
+ break iteration
+
\ No newline at end of file
Index: source/test/intltest/rbbitst.h
===================================================================
--- source/test/intltest/rbbitst.h (revision 30114)
+++ source/test/intltest/rbbitst.h (working copy)
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 1999-2010, International Business Machines
+ * Copyright (c) 1999-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*************************************************************************
* Date Name Description
@@ -68,7 +68,6 @@
void TestTrieDict();
void TestUnicodeFiles();
void TestBug5775();
- void TestThaiBreaks();
void TestTailoredBreaks();
void TestDictRules();
void TestBug5532();
Index: source/test/intltest/intltest.vcxproj
===================================================================
--- source/test/intltest/intltest.vcxproj (revision 30114)
+++ source/test/intltest/intltest.vcxproj (working copy)
@@ -224,6 +224,7 @@
+
@@ -389,6 +390,7 @@
+
@@ -533,4 +535,4 @@
-
+
\ No newline at end of file
Index: source/test/intltest/itrbbi.cpp
===================================================================
--- source/test/intltest/itrbbi.cpp (revision 30114)
+++ source/test/intltest/itrbbi.cpp (working copy)
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 1998-2001, International Business Machines Corporation
+* Copyright (C) 1998-2011, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
*/
@@ -19,28 +19,27 @@
#include "itrbbi.h"
#include "rbbiapts.h"
#include "rbbitst.h"
+#include "dicttest.h"
+#define TESTCLASS(n,classname) /* one switch case: index n names test class classname and optionally runs it */ \
+ case n: \
+ name = #classname; /* the stringized class name is reported through name */ \
+ if (exec) { /* name, exec and par must be in scope where the macro is expanded */ \
+ logln(#classname "---"); \
+ logln(""); \
+ classname t; /* a fresh instance of the test class for this run */ \
+ callTest(t, par); \
+ } \
+ break
+
+
void IntlTestRBBI::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
{
if (exec) logln("TestSuite RuleBasedBreakIterator: ");
switch (index) {
- case 0:
- name = "RBBIAPITest";
- if (exec) {
- logln("RBBIAPITest--"); logln("");
- RBBIAPITest test;
- callTest( test, par );
- }
- break;
-
- case 1:
- name = "RBBITest";
- if (exec) {
- logln("RBBITest---"); logln("");
- RBBITest test;
- callTest( test, par );
- }
- break;
+ TESTCLASS(0, RBBIAPITest);
+ TESTCLASS(1, RBBITest);
+ TESTCLASS(2, DictionaryWordTest);
default: name=""; break;
}
}
Index: source/test/intltest/Makefile.in
===================================================================
--- source/test/intltest/Makefile.in (revision 30114)
+++ source/test/intltest/Makefile.in (working copy)
@@ -51,7 +51,7 @@
tsmthred.o tsnmfmt.o tsputil.o tstnrapi.o tstnorm.o tzbdtest.o \
tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o strcase.o transtst.o strtest.o thcoll.o \
bytestrietest.o ucharstrietest.o \
-itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
+itrbbi.o rbbiapts.o dicttest.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \
jamotest.o srchtest.o reptest.o regextst.o \
itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \
Index: source/test/intltest/rbbitst.cpp
===================================================================
--- source/test/intltest/rbbitst.cpp (revision 30114)
+++ source/test/intltest/rbbitst.cpp (working copy)
@@ -134,17 +134,15 @@
#if !UCONFIG_NO_FILE_IO
case 21: name = "TestBug5775";
if (exec) TestBug5775(); break;
- case 22: name = "TestThaiBreaks";
- if (exec) TestThaiBreaks(); break;
- case 23: name = "TestTailoredBreaks";
+ case 22: name = "TestTailoredBreaks";
if (exec) TestTailoredBreaks(); break;
#else
- case 21: case 22: case 23: name = "skip";
+ case 21: case 22: name = "skip";
break;
#endif
- case 24: name = "TestDictRules";
+ case 23: name = "TestDictRules";
if (exec) TestDictRules(); break;
- case 25: name = "TestBug5532";
+ case 24: name = "TestBug5532";
if (exec) TestBug5532(); break;
default: name = ""; break; //needed to end loop
}
@@ -1810,56 +1808,6 @@
#endif
}
-void RBBITest::TestThaiBreaks() {
- UErrorCode status=U_ZERO_ERROR;
- BreakIterator* b;
- Locale locale = Locale("th");
- int32_t p, index;
- UChar c[]= {
- 0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
- 0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
- 0x0E16, 0x0E49, 0x0E33, 0x0000
- };
- int32_t expectedWordResult[] = {
- 2, 3, 6, 10, 11, 15, 17, 20, 22
- };
- int32_t expectedLineResult[] = {
- 3, 6, 11, 15, 17, 20, 22
- };
-
- int32_t size = u_strlen(c);
- UnicodeString text=UnicodeString(c);
-
- b = BreakIterator::createWordInstance(locale, status);
- if (U_FAILURE(status)) {
- errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
- return;
- }
- b->setText(text);
- p = index = 0;
- while ((p=b->next())!=BreakIterator::DONE && p < size) {
- if (p != expectedWordResult[index++]) {
- errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p);
- }
- }
- delete b;
-
- b = BreakIterator::createLineInstance(locale, status);
- if (U_FAILURE(status)) {
- printf("Unable to create thai line break iterator.\n");
- return;
- }
- b->setText(text);
- p = index = 0;
- while ((p=b->next())!=BreakIterator::DONE && p < size) {
- if (p != expectedLineResult[index++]) {
- errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p);
- }
- }
-
- delete b;
-}
-
// UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
// Words don't include colon or period (cldrbug #1969).
static const char posxWordText[] = "Can't have breaks in xx:yy or struct.field for CS-types.";
Index: source/test/intltest/dicttest.h
===================================================================
--- source/test/intltest/dicttest.h (revision 0)
+++ source/test/intltest/dicttest.h (revision 0)
@@ -0,0 +1,30 @@
+/*
+**********************************************************************
+* Copyright (C) 2011-2011, International Business Machines Corporation
+* and others. All Rights Reserved.
+**********************************************************************
+************************************************************************
+* Date Name Description
+* 05/14/2011 grhoten Creation.
+************************************************************************/
+
+#ifndef DICTTEST_H
+#define DICTTEST_H
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "intltest.h"
+
+
+class DictionaryWordTest: public IntlTest { // break-iterator tests for text that is segmented without spaces
+public:
+ void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ); // dispatcher listing the tests below
+ void TestWordBoundaries(); // checks word breaks against the '|'-marked phrases in wordsegments.txt
+ void TestThaiBreaks(); // checks Thai word and line breaks against hard-coded offsets
+};
+
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
+
+#endif
Property changes on: source/test/intltest/dicttest.h
___________________________________________________________________
Added: svn:eol-style
+ native
Index: source/test/testdata/wordsegments.txt
===================================================================
--- source/test/testdata/wordsegments.txt (revision 0)
+++ source/test/testdata/wordsegments.txt (revision 0)
@@ -0,0 +1,23 @@
+# Copyright (C) 2011-2011, International Business Machines Corporation
+# and others. All Rights Reserved.
+#
+# file name: wordsegments.txt
+# encoding: UTF-8
+#
+# created on: 2011may14
+# created by: George Rhoten
+# created by: Nathan Wells
+#
+# Word boundary test data for languages that contain no spaces.
+# Boundaries are delimited with the | character so that it's easier to debug.
+#
+# If you have test data that uses zero-width spaces to delimit the words, use the following example command.
+# Be sure to copy the zero width space in the sed command.
+# echo 'សូមចំណាយពេលបន្តិចដើម្បីអធិស្ឋានអរព្រះគុណដល់ព្រះអង្គ' | sed 's//\|/g'
+#
+
+# Thai
+กู| |กิน|กุ้ง| |ปิ้่|งอ|ยู่|ใน|ถ้ำ
+
+# Khmer
+សូម|ចំណាយពេល|បន្តិច|ដើម្បី|អធិស្ឋាន|អរ|ព្រះគុណ|ដល់|ព្រះអង្គ
Property changes on: source/test/testdata/wordsegments.txt
___________________________________________________________________
Added: svn:mime-type
+ text/plain;charset=utf-8
Added: svn:eol-style
+ native