2 /* $Id: ustring.cc 749 2008-12-10 14:23:33Z jjongsma $ */
4 /* Copyright (C) 2002 The gtkmm Development Team
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
16 * You should have received a copy of the GNU Library General Public
17 * License along with this library; if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 #include <glibmm/ustring.h>
22 #include <glibmm/convert.h>
23 #include <glibmm/error.h>
24 #include <glibmm/utility.h>
34 #include <glibmmconfig.h>
35 #ifdef GLIBMM_EXCEPTIONS_ENABLED
38 GLIBMM_USING_STD(find)
45 // Little helper to make the conversion from gunichar to UTF-8 a one-liner.
// NOTE(review): the struct header and the declaration of buf are not part of
// this extract; only the len member and the constructor are visible.
// len stores the value returned by g_unichar_to_utf8(uc, buf); callers in this
// file use (conv.buf, conv.len) as the encoded byte sequence.
50 ustring::size_type len;
52 explicit UnicharToUtf8(gunichar uc)
53 : len (g_unichar_to_utf8(uc, buf)) {}
57 // All utf8_*_offset() functions return npos if offset is out of range.
58 // The caller should decide whether npos is a valid argument that just marks
59 // the whole string, or whether it is not allowed (e.g. for start positions).
60 // In the latter case std::out_of_range should be thrown, but usually
61 // std::string will do that for us.
63 // First overload: stop on '\0' character.
// Takes a UTF-8 character offset into the C string str and yields a
// ustring::size_type byte offset. offset == ustring::npos is tested up front.
// NOTE(review): braces, return statements and most of the loop body are
// missing from this extract, so the exact results cannot be confirmed here.
65 ustring::size_type utf8_byte_offset(const char* str, ustring::size_type offset)
67 if(offset == ustring::npos)
70 const char *const utf8_skip = g_utf8_skip;
// One iteration per character that still has to be skipped.
73 for(; offset != 0; --offset)
// Byte at the current position, read as an unsigned value.
75 const unsigned int c = static_cast<unsigned char>(*p);
86 // Second overload: stop when reaching maxlen.
// Same character-offset to byte-offset conversion, for a buffer of maxlen
// bytes; pend is str + maxlen, i.e. one past the last byte considered.
// NOTE(review): braces, the declaration of p and the final return are not
// part of this extract.
88 ustring::size_type utf8_byte_offset(const char* str, ustring::size_type offset,
89 ustring::size_type maxlen)
91 if(offset == ustring::npos)
94 const char *const utf8_skip = g_utf8_skip;
95 const char *const pend = str + maxlen;
// One iteration per character that still has to be skipped.
98 for(; offset != 0; --offset)
// NOTE(review): the condition guarding this return is not in this extract;
// presumably it fires when p has reached pend.
101 return ustring::npos;
// Step over one character: g_utf8_skip is indexed by the byte at p.
103 p += utf8_skip[static_cast<unsigned char>(*p)];
109 // Third overload: stop when reaching str.size().
// Thin wrapper: forwards str.data() and str.size() to the maxlen overload.
112 ustring::size_type utf8_byte_offset(const std::string& str, ustring::size_type offset)
114 return utf8_byte_offset(str.data(), offset, str.size());
117 // Takes UTF-8 character offset and count in ci and cn.
118 // Returns the byte offset and count in i and n.
// Used throughout this file to translate (character index, character count)
// pairs into the byte-based arguments that std::string members expect.
// NOTE(review): the initializer of n and the braces are not in this extract.
120 struct Utf8SubstrBounds
// Byte offset of character ci.
122 ustring::size_type i;
// Byte length covering cn characters starting at i.
123 ustring::size_type n;
125 Utf8SubstrBounds(const std::string& str, ustring::size_type ci, ustring::size_type cn)
127 i (utf8_byte_offset(str, ci)),
// Only compute n when the start offset is valid; the count is measured
// from byte i over the remaining str.size() - i bytes.
130 if(i != ustring::npos)
131 n = utf8_byte_offset(str.data() + i, cn, str.size() - i);
135 // Converts byte offset to UTF-8 character offset.
// npos is passed through unchanged, so results of std::string::find() and
// friends can be fed in directly.
137 ustring::size_type utf8_char_offset(const std::string& str, ustring::size_type offset)
139 if(offset == ustring::npos)
140 return ustring::npos;
// Character offset of the byte at pdata + offset, as computed by
// g_utf8_pointer_to_offset().
142 const char *const pdata = str.data();
143 return g_utf8_pointer_to_offset(pdata, pdata + offset);
147 // Helper to implement ustring::find_first_of() and find_first_not_of().
148 // Returns the UTF-8 character offset, or ustring::npos if not found.
// str is searched starting at character offset; utf8_match/utf8_match_size
// describe the set of characters to look for (callers in this file pass -1
// as the size for NUL-terminated input).
// NOTE(review): the find_not_of parameter declaration, the loop condition and
// the return inside the loop are not part of this extract.
150 ustring::size_type utf8_find_first_of(const std::string& str, ustring::size_type offset,
151 const char* utf8_match, long utf8_match_size,
// An out-of-range start position means nothing can be found.
154 const ustring::size_type byte_offset = utf8_byte_offset(str, offset);
155 if(byte_offset == ustring::npos)
156 return ustring::npos;
// Convert the match set to UCS-4 once so each candidate is a single
// gunichar comparison. The buffer is held by Glib::ScopedPtr (presumably
// released when it goes out of scope; confirm in glibmm/utility.h).
158 long ucs4_match_size = 0;
159 const Glib::ScopedPtr<gunichar> ucs4_match
160 (g_utf8_to_ucs4_fast(utf8_match, utf8_match_size, &ucs4_match_size));
162 const gunichar *const match_begin = ucs4_match.get();
163 const gunichar *const match_end = match_begin + ucs4_match_size;
165 const char *const str_begin = str.data();
166 const char *const str_end = str_begin + str.size();
// Walk forward one UTF-8 character at a time from the start byte.
168 for(const char* pstr = str_begin + byte_offset;
170 pstr = g_utf8_next_char(pstr))
172 const gunichar *const pfound = std::find(match_begin, match_end, g_utf8_get_char(pstr));
// True when the character is in the set and find_not_of is false, or when
// it is not in the set and find_not_of is true.
174 if((pfound != match_end) != find_not_of)
180 return ustring::npos;
183 // Helper to implement ustring::find_last_of() and find_last_not_of().
184 // Returns the UTF-8 character offset, or ustring::npos if not found.
// Mirrors utf8_find_first_of() but scans backwards from character offset
// towards the start of str.
// NOTE(review): the find_not_of parameter declaration and the start of the
// do/while that steps pstr backwards are not part of this extract.
186 ustring::size_type utf8_find_last_of(const std::string& str, ustring::size_type offset,
187 const char* utf8_match, long utf8_match_size,
// Convert the match set to UCS-4 once, as in utf8_find_first_of().
190 long ucs4_match_size = 0;
191 const Glib::ScopedPtr<gunichar> ucs4_match
192 (g_utf8_to_ucs4_fast(utf8_match, utf8_match_size, &ucs4_match_size));
194 const gunichar *const match_begin = ucs4_match.get();
195 const gunichar *const match_end = match_begin + ucs4_match_size;
197 const char *const str_begin = str.data();
198 const char* pstr = str_begin;
200 // Set pstr one byte beyond the actual start position.
// If byte_offset is not below str.size() (this includes npos), start from
// the end of the string instead.
201 const ustring::size_type byte_offset = utf8_byte_offset(str, offset);
202 pstr += (byte_offset < str.size()) ? byte_offset + 1 : str.size();
204 while(pstr > str_begin)
206 // Move to previous character.
// Bytes of the form 10xxxxxx are UTF-8 continuation bytes; keep stepping
// while pstr points at one.
209 while((static_cast<unsigned char>(*pstr) & 0xC0u) == 0x80);
211 const gunichar *const pfound = std::find(match_begin, match_end, g_utf8_get_char(pstr));
// Same found/not-found test as in utf8_find_first_of(); on success the
// byte position is converted back to a character offset.
213 if((pfound != match_end) != find_not_of)
214 return g_utf8_pointer_to_offset(str_begin, pstr);
217 return ustring::npos;
220 } // anonymous namespace
// Out-of-class definition of ustring::npos, compiled only when
// GLIBMM_HAVE_ALLOWS_STATIC_INLINE_NPOS is not defined.
// NOTE(review): the matching #endif is not part of this extract.
226 #ifndef GLIBMM_HAVE_ALLOWS_STATIC_INLINE_NPOS
227 // Initialize static member here,
228 // because the compiler did not allow us to do it inline.
229 const ustring::size_type ustring::npos = std::string::npos;
233 * We need our own version of g_utf8_get_char(), because the std::string
234 * iterator is not necessarily a plain pointer (it's in fact not in GCC's
235 * libstdc++-v3). Copying the UTF-8 data into a temporary buffer isn't an
236 * option since this operation is quite time critical. The implementation
237 * is quite different from g_utf8_get_char() -- both more generic and likely faster.
240 * By looking at the first byte of a UTF-8 character one can determine the
241 * number of bytes used. GLib offers the g_utf8_skip[] array for this purpose,
242 * but accessing this global variable would, on IA32 at least, introduce
243 * a function call to fetch the Global Offset Table, plus two levels of
244 * indirection in order to read the value. Even worse, fetching the GOT is
245 * always done right at the start of the function instead of the branch that
246 * actually uses the variable.
248 * Fortunately, there's a better way to get the byte count. As this table
249 * shows, there's a nice regular pattern in the UTF-8 encoding scheme:
251 * 0x00000000 - 0x0000007F: 0xxxxxxx
252 * 0x00000080 - 0x000007FF: 110xxxxx 10xxxxxx
253 * 0x00000800 - 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
254 * 0x00010000 - 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
255 * 0x00200000 - 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
256 * 0x04000000 - 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
258 * Except for the single byte case, the number of leading 1-bits equals the
259 * byte count. All that is needed is to shift the first byte to the left
260 * until bit 7 becomes 0. Naturally, doing so requires a loop -- but since
261 * we already have one, no additional cost is introduced. This shifting can
262 * further be combined with the computation of the bitmask needed to eliminate
263 * the leading length bits, thus saving yet another register.
265 * Note: If you change this code, it is advisable to also review what the
266 * compiler makes of it in the assembler output. Except for some pointless
267 * register moves, the generated code is sufficiently close to the optimum
268 * with GCC 4.1.2 on x86_64.
// Decodes the UTF-8 character at pos by reading through the std::string
// iterator instead of a raw pointer.
// NOTE(review): braces, the statements that fold continuation bytes into
// result and the final return are not part of this extract; the shifting
// scheme is described in the long comment that precedes this function.
270 gunichar get_unichar_from_std_iterator(std::string::const_iterator pos)
272 unsigned int result = static_cast<unsigned char>(*pos);
// Bit 7 set means the first byte is not plain ASCII.
274 if((result & 0x80) != 0)
// mask starts at bit 6 and is tested against result in the loop condition.
276 unsigned int mask = 0x40;
// Read the next byte of the sequence.
281 const unsigned int c = static_cast<unsigned char>(*++pos);
285 while((result & mask) != 0);
294 /**** Glib::ustring ********************************************************/
// Constructors. All of them initialize the wrapped std::string string_.
// NOTE(review): braces and several initializer/body lines are not part of
// this extract.

// Copy constructor: copies the underlying std::string.
301 ustring::ustring(const ustring& other)
303 string_ (other.string_)
// Substring constructor: i and n are a UTF-8 character offset and count in
// src; Utf8SubstrBounds converts them to byte bounds for the assign.
306 ustring::ustring(const ustring& src, ustring::size_type i, ustring::size_type n)
310 const Utf8SubstrBounds bounds (src.string_, i, n);
311 string_.assign(src.string_, bounds.i, bounds.n);
// Takes the first n characters of src; utf8_byte_offset() supplies the
// corresponding byte length.
314 ustring::ustring(const char* src, ustring::size_type n)
316 string_ (src, utf8_byte_offset(src, n))
// NOTE(review): initializer/body not included in this extract.
319 ustring::ustring(const char* src)
// Builds a string from n copies of the character uc.
324 ustring::ustring(ustring::size_type n, gunichar uc)
330 // Optimize the probably most common case.
// uc is narrowed to a single char here; the condition selecting this
// branch is not visible in this extract.
331 string_.assign(n, static_cast<char>(uc));
// General case: encode uc once and reserve n * conv.len bytes up front
// (the loop around the append is not visible in this extract).
335 const UnicharToUtf8 conv (uc);
336 string_.reserve(n * conv.len);
339 string_.append(conv.buf, conv.len);
// NOTE(review): the bodies of the two constructors below are not included
// in this extract.
343 ustring::ustring(ustring::size_type n, char c)
348 ustring::ustring(const std::string& src)
// Exchanges contents with other by swapping the underlying std::string objects.
356 void ustring::swap(ustring& other)
358 string_.swap(other.string_);
362 /**** Glib::ustring::operator=() *******************************************/
// Assignment operators.
// NOTE(review): braces, return statements and the bodies of the std::string,
// const char* and char overloads are not part of this extract.

// Copies the underlying std::string of other.
364 ustring& ustring::operator=(const ustring& other)
366 string_ = other.string_;
370 ustring& ustring::operator=(const std::string& src)
376 ustring& ustring::operator=(const char* src)
// Replaces the contents with the UTF-8 encoding of the single character uc.
382 ustring& ustring::operator=(gunichar uc)
384 const UnicharToUtf8 conv (uc);
385 string_.assign(conv.buf, conv.len);
389 ustring& ustring::operator=(char c)
396 /**** Glib::ustring::assign() **********************************************/
// assign() overloads. Index and count arguments are in UTF-8 characters and
// are converted to bytes before touching string_.
// NOTE(review): braces and return statements are not part of this extract.

// Whole-string assign.
398 ustring& ustring::assign(const ustring& src)
400 string_ = src.string_;
// Assign n characters of src starting at character i.
404 ustring& ustring::assign(const ustring& src, ustring::size_type i, ustring::size_type n)
406 const Utf8SubstrBounds bounds (src.string_, i, n);
407 string_.assign(src.string_, bounds.i, bounds.n);
// Assign the first n characters of the C string src.
411 ustring& ustring::assign(const char* src, ustring::size_type n)
413 string_.assign(src, utf8_byte_offset(src, n));
// NOTE(review): body not included in this extract.
417 ustring& ustring::assign(const char* src)
// Builds the repeated-character string in a temporary and swaps it in.
423 ustring& ustring::assign(ustring::size_type n, gunichar uc)
425 ustring temp (n, uc);
426 string_.swap(temp.string_);
// Single-byte character: forwarded straight to std::string::assign().
430 ustring& ustring::assign(ustring::size_type n, char c)
432 string_.assign(n, c);
437 /**** Glib::ustring::operator+=() ******************************************/
// Append operators.
// NOTE(review): braces, return statements and the bodies of the const char*
// and char overloads are not part of this extract.

// Appends the bytes of src.
439 ustring& ustring::operator+=(const ustring& src)
441 string_ += src.string_;
445 ustring& ustring::operator+=(const char* src)
// Appends the UTF-8 encoding of uc.
451 ustring& ustring::operator+=(gunichar uc)
453 const UnicharToUtf8 conv (uc);
454 string_.append(conv.buf, conv.len);
458 ustring& ustring::operator+=(char c)
465 /**** Glib::ustring::push_back() *******************************************/
// Appends one character. The gunichar overload encodes uc to UTF-8 first.
// NOTE(review): the body of the char overload is not part of this extract.
467 void ustring::push_back(gunichar uc)
469 const UnicharToUtf8 conv (uc);
470 string_.append(conv.buf, conv.len);
473 void ustring::push_back(char c)
479 /**** Glib::ustring::append() **********************************************/
// append() overloads. Index and count arguments are in UTF-8 characters and
// are converted to bytes before touching string_.
// NOTE(review): braces and return statements are not part of this extract.

// Appends all of src.
481 ustring& ustring::append(const ustring& src)
483 string_ += src.string_;
// Appends n characters of src starting at character i.
487 ustring& ustring::append(const ustring& src, ustring::size_type i, ustring::size_type n)
489 const Utf8SubstrBounds bounds (src.string_, i, n);
490 string_.append(src.string_, bounds.i, bounds.n);
// Appends the first n characters of the C string src.
494 ustring& ustring::append(const char* src, ustring::size_type n)
496 string_.append(src, utf8_byte_offset(src, n));
// NOTE(review): body not included in this extract.
500 ustring& ustring::append(const char* src)
// Appends n copies of uc via a temporary ustring(n, uc).
506 ustring& ustring::append(ustring::size_type n, gunichar uc)
508 string_.append(ustring(n, uc).string_);
// Single-byte character: forwarded straight to std::string::append().
512 ustring& ustring::append(ustring::size_type n, char c)
514 string_.append(n, c);
519 /**** Glib::ustring::insert() **********************************************/
// insert() overloads taking a character index i. In every overload i is
// converted to a byte position with utf8_byte_offset(string_, i) before the
// std::string::insert() call.
// NOTE(review): braces and return statements are not part of this extract.

// Inserts all of src at character i.
521 ustring& ustring::insert(ustring::size_type i, const ustring& src)
523 string_.insert(utf8_byte_offset(string_, i), src.string_);
// Inserts n characters of src, starting at its character i2.
527 ustring& ustring::insert(ustring::size_type i, const ustring& src,
528 ustring::size_type i2, ustring::size_type n)
530 const Utf8SubstrBounds bounds2 (src.string_, i2, n);
531 string_.insert(utf8_byte_offset(string_, i), src.string_, bounds2.i, bounds2.n);
// Inserts the first n characters of the C string src.
535 ustring& ustring::insert(ustring::size_type i, const char* src, ustring::size_type n)
537 string_.insert(utf8_byte_offset(string_, i), src, utf8_byte_offset(src, n));
// Inserts the whole C string src.
541 ustring& ustring::insert(ustring::size_type i, const char* src)
543 string_.insert(utf8_byte_offset(string_, i), src);
// Inserts n copies of uc via a temporary ustring(n, uc).
547 ustring& ustring::insert(ustring::size_type i, ustring::size_type n, gunichar uc)
549 string_.insert(utf8_byte_offset(string_, i), ustring(n, uc).string_);
// Inserts n copies of the single-byte character c.
553 ustring& ustring::insert(ustring::size_type i, ustring::size_type n, char c)
555 string_.insert(utf8_byte_offset(string_, i), n, c);
// insert() overloads taking an iterator position p. p.base() is the
// underlying std::string iterator.
// NOTE(review): braces are not part of this extract.

// Inserts the encoding of uc. The byte offset is computed before the insert
// and the returned iterator is rebuilt from string_.begin() + offset
// afterwards rather than reusing p.
559 ustring::iterator ustring::insert(ustring::iterator p, gunichar uc)
561 const size_type offset = p.base() - string_.begin();
562 const UnicharToUtf8 conv (uc);
563 string_.insert(offset, conv.buf, conv.len);
564 return iterator(string_.begin() + offset);
// Single-byte character: wraps the iterator returned by std::string::insert().
567 ustring::iterator ustring::insert(ustring::iterator p, char c)
569 return iterator(string_.insert(p.base(), c));
// Inserts n copies of uc at the byte offset of p via a temporary ustring(n, uc).
572 void ustring::insert(ustring::iterator p, ustring::size_type n, gunichar uc)
574 string_.insert(p.base() - string_.begin(), ustring(n, uc).string_);
// Inserts n copies of the single-byte character c at p.
577 void ustring::insert(ustring::iterator p, ustring::size_type n, char c)
579 string_.insert(p.base(), n, c);
583 /**** Glib::ustring::replace() *********************************************/
// replace() overloads taking a character index i and count n. Each overload
// converts (i, n) to byte bounds with Utf8SubstrBounds and then calls the
// matching std::string::replace().
// NOTE(review): braces and return statements are not part of this extract.

// Replacement text: all of src.
585 ustring& ustring::replace(ustring::size_type i, ustring::size_type n, const ustring& src)
587 const Utf8SubstrBounds bounds (string_, i, n);
588 string_.replace(bounds.i, bounds.n, src.string_);
// Replacement text: n2 characters of src starting at its character i2.
592 ustring& ustring::replace(ustring::size_type i, ustring::size_type n,
593 const ustring& src, ustring::size_type i2, ustring::size_type n2)
595 const Utf8SubstrBounds bounds (string_, i, n);
596 const Utf8SubstrBounds bounds2 (src.string_, i2, n2);
597 string_.replace(bounds.i, bounds.n, src.string_, bounds2.i, bounds2.n);
// Replacement text: the first n2 characters of the C string src.
601 ustring& ustring::replace(ustring::size_type i, ustring::size_type n,
602 const char* src, ustring::size_type n2)
604 const Utf8SubstrBounds bounds (string_, i, n);
605 string_.replace(bounds.i, bounds.n, src, utf8_byte_offset(src, n2));
// Replacement text: the whole C string src.
609 ustring& ustring::replace(ustring::size_type i, ustring::size_type n, const char* src)
611 const Utf8SubstrBounds bounds (string_, i, n);
612 string_.replace(bounds.i, bounds.n, src);
// Replacement text: n2 copies of uc, built as a temporary ustring(n2, uc).
616 ustring& ustring::replace(ustring::size_type i, ustring::size_type n,
617 ustring::size_type n2, gunichar uc)
619 const Utf8SubstrBounds bounds (string_, i, n);
620 string_.replace(bounds.i, bounds.n, ustring(n2, uc).string_);
// Replacement text: n2 copies of the single-byte character c.
624 ustring& ustring::replace(ustring::size_type i, ustring::size_type n,
625 ustring::size_type n2, char c)
627 const Utf8SubstrBounds bounds (string_, i, n);
628 string_.replace(bounds.i, bounds.n, n2, c);
// replace() overloads taking the iterator range [pbegin, pend). The
// underlying std::string iterators from base() are passed straight through.
// NOTE(review): braces and return statements are not part of this extract.

// Replacement text: all of src.
632 ustring& ustring::replace(ustring::iterator pbegin, ustring::iterator pend, const ustring& src)
634 string_.replace(pbegin.base(), pend.base(), src.string_);
// Replacement text: the first n characters of the C string src.
638 ustring& ustring::replace(ustring::iterator pbegin, ustring::iterator pend,
639 const char* src, ustring::size_type n)
641 string_.replace(pbegin.base(), pend.base(), src, utf8_byte_offset(src, n));
// Replacement text: the whole C string src.
645 ustring& ustring::replace(ustring::iterator pbegin, ustring::iterator pend, const char* src)
647 string_.replace(pbegin.base(), pend.base(), src);
// Replacement text: n copies of uc, built as a temporary ustring(n, uc).
651 ustring& ustring::replace(ustring::iterator pbegin, ustring::iterator pend,
652 ustring::size_type n, gunichar uc)
654 string_.replace(pbegin.base(), pend.base(), ustring(n, uc).string_);
// Replacement text: n copies of the single-byte character c.
658 ustring& ustring::replace(ustring::iterator pbegin, ustring::iterator pend,
659 ustring::size_type n, char c)
661 string_.replace(pbegin.base(), pend.base(), n, c);
666 /**** Glib::ustring::erase() ***********************************************/
// clear() and erase() overloads.
// NOTE(review): braces, return statements and the bodies of clear() and the
// zero-argument erase() are not part of this extract.
668 void ustring::clear()
// Erases n characters starting at character i, after conversion to byte bounds.
673 ustring& ustring::erase(ustring::size_type i, ustring::size_type n)
675 const Utf8SubstrBounds bounds (string_, i, n);
676 string_.erase(bounds.i, bounds.n);
680 ustring& ustring::erase()
// Erases the byte range [p, iter_end).
// NOTE(review): the statement that moves iter_end past p is not visible in
// this extract; presumably it advances iter_end by one character.
686 ustring::iterator ustring::erase(ustring::iterator p)
688 ustring::iterator iter_end = p;
691 return iterator(string_.erase(p.base(), iter_end.base()));
// Erases the range [pbegin, pend) and wraps the iterator std::string returns.
694 ustring::iterator ustring::erase(ustring::iterator pbegin, ustring::iterator pend)
696 return iterator(string_.erase(pbegin.base(), pend.base()));
700 /**** Glib::ustring::compare() *********************************************/
// compare() overloads. The two base overloads return the result of
// g_utf8_collate() on the NUL-terminated strings; the substring overloads
// build temporary ustring objects and delegate to those.
// NOTE(review): braces are not part of this extract.
702 int ustring::compare(const ustring& rhs) const
704 return g_utf8_collate(string_.c_str(), rhs.string_.c_str());
707 int ustring::compare(const char* rhs) const
709 return g_utf8_collate(string_.c_str(), rhs);
// Compares n characters of *this starting at character i against rhs.
712 int ustring::compare(ustring::size_type i, ustring::size_type n, const ustring& rhs) const
714 return ustring(*this, i, n).compare(rhs);
// Both sides are substrings: (i, n) of *this against (i2, n2) of rhs.
717 int ustring::compare(ustring::size_type i, ustring::size_type n,
718 const ustring& rhs, ustring::size_type i2, ustring::size_type n2) const
720 return ustring(*this, i, n).compare(ustring(rhs, i2, n2));
// Right-hand side is the first n2 characters of the C string rhs.
723 int ustring::compare(ustring::size_type i, ustring::size_type n,
724 const char* rhs, ustring::size_type n2) const
726 return ustring(*this, i, n).compare(ustring(rhs, n2));
// Right-hand side is the whole C string rhs.
729 int ustring::compare(ustring::size_type i, ustring::size_type n, const char* rhs) const
731 return ustring(*this, i, n).compare(rhs);
735 /**** Glib::ustring -- index access ****************************************/
// Character access by UTF-8 character index i.
// NOTE(review): braces are not part of this extract.

// Unchecked access: i is handed directly to g_utf8_offset_to_pointer()
// without any range test in this function.
737 ustring::value_type ustring::operator[](ustring::size_type i) const
739 return g_utf8_get_char(g_utf8_offset_to_pointer(string_.data(), i));
// Checked access: the byte offset is passed to std::string::at(), which
// performs the range check.
742 ustring::value_type ustring::at(ustring::size_type i) const
744 const size_type byte_offset = utf8_byte_offset(string_, i);
746 // Throws std::out_of_range if the index is invalid.
747 return g_utf8_get_char(&string_.at(byte_offset));
751 /**** Glib::ustring -- iterator access *************************************/
// Iterator accessors. Each one wraps the corresponding std::string iterator
// of string_ in the ustring iterator type; the reverse variants start from
// end() for rbegin() and from begin() for rend().
// NOTE(review): braces are not part of this extract.
753 ustring::iterator ustring::begin()
755 return iterator(string_.begin());
758 ustring::iterator ustring::end()
760 return iterator(string_.end());
763 ustring::const_iterator ustring::begin() const
765 return const_iterator(string_.begin());
768 ustring::const_iterator ustring::end() const
770 return const_iterator(string_.end());
773 ustring::reverse_iterator ustring::rbegin()
775 return reverse_iterator(iterator(string_.end()));
778 ustring::reverse_iterator ustring::rend()
780 return reverse_iterator(iterator(string_.begin()));
783 ustring::const_reverse_iterator ustring::rbegin() const
785 return const_reverse_iterator(const_iterator(string_.end()));
788 ustring::const_reverse_iterator ustring::rend() const
790 return const_reverse_iterator(const_iterator(string_.begin()));
794 /**** Glib::ustring::find() ************************************************/
// find() overloads. Common pattern: convert the character start index i to a
// byte offset, run std::string::find() on the bytes, then convert the
// resulting byte position back to a character offset. utf8_char_offset()
// passes npos through, so "not found" is preserved.
// NOTE(review): braces are not part of this extract.
796 ustring::size_type ustring::find(const ustring& str, ustring::size_type i) const
798 return utf8_char_offset(string_, string_.find(str.string_, utf8_byte_offset(string_, i)));
// Searches for the first n characters of the C string str.
801 ustring::size_type ustring::find(const char* str, ustring::size_type i, ustring::size_type n) const
803 return utf8_char_offset(string_, string_.find(str, utf8_byte_offset(string_, i),
804 utf8_byte_offset(str, n)));
807 ustring::size_type ustring::find(const char* str, ustring::size_type i) const
809 return utf8_char_offset(string_, string_.find(str, utf8_byte_offset(string_, i)));
// Searches for the UTF-8 encoding of uc as a byte sequence of conv.len bytes.
812 ustring::size_type ustring::find(gunichar uc, ustring::size_type i) const
814 const UnicharToUtf8 conv (uc);
815 return utf8_char_offset(string_, string_.find(conv.buf, utf8_byte_offset(string_, i), conv.len));
818 ustring::size_type ustring::find(char c, ustring::size_type i) const
820 return utf8_char_offset(string_, string_.find(c, utf8_byte_offset(string_, i)));
824 /**** Glib::ustring::rfind() ***********************************************/
// rfind() overloads. Same scheme as find(): the character index i becomes a
// byte offset, std::string::rfind() runs on the bytes, and the result is
// converted back to a character offset (npos is passed through).
// NOTE(review): braces are not part of this extract.
826 ustring::size_type ustring::rfind(const ustring& str, ustring::size_type i) const
828 return utf8_char_offset(string_, string_.rfind(str.string_, utf8_byte_offset(string_, i)));
// Searches for the first n characters of the C string str.
831 ustring::size_type ustring::rfind(const char* str, ustring::size_type i,
832 ustring::size_type n) const
834 return utf8_char_offset(string_, string_.rfind(str, utf8_byte_offset(string_, i),
835 utf8_byte_offset(str, n)));
838 ustring::size_type ustring::rfind(const char* str, ustring::size_type i) const
840 return utf8_char_offset(string_, string_.rfind(str, utf8_byte_offset(string_, i)));
// Searches for the UTF-8 encoding of uc as a byte sequence of conv.len bytes.
843 ustring::size_type ustring::rfind(gunichar uc, ustring::size_type i) const
845 const UnicharToUtf8 conv (uc);
846 return utf8_char_offset(string_, string_.rfind(conv.buf, utf8_byte_offset(string_, i), conv.len));
849 ustring::size_type ustring::rfind(char c, ustring::size_type i) const
851 return utf8_char_offset(string_, string_.rfind(c, utf8_byte_offset(string_, i)));
855 /**** Glib::ustring::find_first_of() ***************************************/
// find_first_of() overloads. The string-set overloads delegate to
// utf8_find_first_of() with find_not_of == false.
// NOTE(review): braces and the bodies of the gunichar and char overloads are
// not part of this extract.
857 ustring::size_type ustring::find_first_of(const ustring& match, ustring::size_type i) const
859 return utf8_find_first_of(string_, i, match.string_.data(), match.string_.size(), false);
// NOTE(review): n is forwarded unchanged as utf8_match_size, which the helper
// hands to g_utf8_to_ucs4_fast(); confirm whether callers mean characters or
// bytes here.
862 ustring::size_type ustring::find_first_of(const char* match,
863 ustring::size_type i, ustring::size_type n) const
865 return utf8_find_first_of(string_, i, match, n, false);
// A size of -1 is passed for the NUL-terminated match string.
868 ustring::size_type ustring::find_first_of(const char* match, ustring::size_type i) const
870 return utf8_find_first_of(string_, i, match, -1, false);
873 ustring::size_type ustring::find_first_of(gunichar uc, ustring::size_type i) const
878 ustring::size_type ustring::find_first_of(char c, ustring::size_type i) const
884 /**** Glib::ustring::find_last_of() ****************************************/
// find_last_of() overloads. The string-set overloads delegate to
// utf8_find_last_of() with find_not_of == false.
// NOTE(review): braces and the bodies of the gunichar and char overloads are
// not part of this extract.
886 ustring::size_type ustring::find_last_of(const ustring& match, ustring::size_type i) const
888 return utf8_find_last_of(string_, i, match.string_.data(), match.string_.size(), false);
// NOTE(review): n is forwarded unchanged as utf8_match_size; confirm whether
// callers mean characters or bytes here.
891 ustring::size_type ustring::find_last_of(const char* match,
892 ustring::size_type i, ustring::size_type n) const
894 return utf8_find_last_of(string_, i, match, n, false);
// A size of -1 is passed for the NUL-terminated match string.
897 ustring::size_type ustring::find_last_of(const char* match, ustring::size_type i) const
899 return utf8_find_last_of(string_, i, match, -1, false);
902 ustring::size_type ustring::find_last_of(gunichar uc, ustring::size_type i) const
907 ustring::size_type ustring::find_last_of(char c, ustring::size_type i) const
913 /**** Glib::ustring::find_first_not_of() ***********************************/
// find_first_not_of() overloads. The string-set overloads delegate to
// utf8_find_first_of() with find_not_of == true; the single-character
// overloads scan the string themselves.
// NOTE(review): braces, loop conditions, comparisons in the char overload and
// all return statements of the single-character overloads are not part of
// this extract.
915 ustring::size_type ustring::find_first_not_of(const ustring& match, ustring::size_type i) const
917 return utf8_find_first_of(string_, i, match.string_.data(), match.string_.size(), true);
920 ustring::size_type ustring::find_first_not_of(const char* match,
921 ustring::size_type i, ustring::size_type n) const
923 return utf8_find_first_of(string_, i, match, n, true);
// A size of -1 is passed for the NUL-terminated match string.
926 ustring::size_type ustring::find_first_not_of(const char* match, ustring::size_type i) const
928 return utf8_find_first_of(string_, i, match, -1, true);
931 // Unfortunately, all of the find_*_not_of() methods for single
932 // characters need their own special implementation.
934 ustring::size_type ustring::find_first_not_of(gunichar uc, ustring::size_type i) const
// bi is the byte position of character i.
936 const size_type bi = utf8_byte_offset(string_, i);
939 const char *const pbegin = string_.data();
940 const char *const pend = pbegin + string_.size();
// Step one character at a time; i is advanced in lockstep with p so it
// always holds the character index of p.
942 for(const char* p = pbegin + bi;
944 p = g_utf8_next_char(p), ++i)
946 if(g_utf8_get_char(p) != uc)
953 ustring::size_type ustring::find_first_not_of(char c, ustring::size_type i) const
955 const size_type bi = utf8_byte_offset(string_, i);
958 const char *const pbegin = string_.data();
959 const char *const pend = pbegin + string_.size();
// Same lockstep walk as in the gunichar overload.
961 for(const char* p = pbegin + bi;
963 p = g_utf8_next_char(p), ++i)
973 /**** Glib::ustring::find_last_not_of() ************************************/
// find_last_not_of() overloads. The string-set overloads delegate to
// utf8_find_last_of() with find_not_of == true; the single-character
// overloads scan forward from the start of the string up to character i.
// NOTE(review): braces, the declaration of i_cur in the gunichar overload,
// the loop bodies that update i_found and the return statements are not part
// of this extract.
975 ustring::size_type ustring::find_last_not_of(const ustring& match, ustring::size_type i) const
977 return utf8_find_last_of(string_, i, match.string_.data(), match.string_.size(), true);
980 ustring::size_type ustring::find_last_not_of(const char* match,
981 ustring::size_type i, ustring::size_type n) const
983 return utf8_find_last_of(string_, i, match, n, true);
// A size of -1 is passed for the NUL-terminated match string.
986 ustring::size_type ustring::find_last_not_of(const char* match, ustring::size_type i) const
988 return utf8_find_last_of(string_, i, match, -1, true);
991 // Unfortunately, all of the find_*_not_of() methods for single
992 // characters need their own special implementation.
994 ustring::size_type ustring::find_last_not_of(gunichar uc, ustring::size_type i) const
996 const char *const pbegin = string_.data();
997 const char *const pend = pbegin + string_.size();
// Starts as npos, i.e. "nothing found yet".
999 size_type i_found = npos;
// Forward walk over characters 0..i (inclusive), bounded by the end of
// the string; i_cur is the character index of p.
1001 for(const char* p = pbegin;
1002 p < pend && i_cur <= i;
1003 p = g_utf8_next_char(p), ++i_cur)
1005 if(g_utf8_get_char(p) != uc)
1011 ustring::size_type ustring::find_last_not_of(char c, ustring::size_type i) const
1013 const char *const pbegin = string_.data();
1014 const char *const pend = pbegin + string_.size();
1015 size_type i_cur = 0;
1016 size_type i_found = npos;
// Same forward walk as in the gunichar overload.
1018 for(const char* p = pbegin;
1019 p < pend && i_cur <= i;
1020 p = g_utf8_next_char(p), ++i_cur)
1029 /**** Glib::ustring -- get size and resize *********************************/
// Size queries.
// NOTE(review): braces are not part of this extract.
1031 bool ustring::empty() const
1033 return string_.empty();
// size() and length() return the number of UTF-8 characters, computed on
// every call by g_utf8_pointer_to_offset() over the whole byte range.
1036 ustring::size_type ustring::size() const
1038 const char *const pdata = string_.data();
1039 return g_utf8_pointer_to_offset(pdata, pdata + string_.size());
1042 ustring::size_type ustring::length() const
1044 const char *const pdata = string_.data();
1045 return g_utf8_pointer_to_offset(pdata, pdata + string_.size());
// bytes(), capacity() and max_size() report the values of the underlying
// std::string, i.e. they are in bytes rather than characters.
1048 ustring::size_type ustring::bytes() const
1050 return string_.size();
1053 ustring::size_type ustring::capacity() const
1055 return string_.capacity();
1058 ustring::size_type ustring::max_size() const
1060 return string_.max_size();
// resize() and reserve(). n in resize() is compared against size(), which is
// a character count.
// NOTE(review): braces, the shrinking branch of both resize() overloads and
// the body of reserve() are not part of this extract.
1063 void ustring::resize(ustring::size_type n, gunichar uc)
1065 const size_type size_now = size();
// Growing: pad with n - size_now copies of uc.
1068 else if(n > size_now)
1069 append(n - size_now, uc);
1072 void ustring::resize(ustring::size_type n, char c)
1074 const size_type size_now = size();
// Growing: pad the underlying std::string with n - size_now copies of c.
1077 else if(n > size_now)
1078 string_.append(n - size_now, c);
1081 void ustring::reserve(ustring::size_type n)
1087 /**** Glib::ustring -- C string access *************************************/
// Raw byte access; data() and c_str() forward to the underlying std::string.
// NOTE(review): braces are not part of this extract.
1089 const char* ustring::data() const
1091 return string_.data();
1094 const char* ustring::c_str() const
1096 return string_.c_str();
1099 // Note that copy() requests UTF-8 character offsets as
1100 // parameters, but returns the number of copied bytes.
// (i, n) are converted to byte bounds, then std::string::copy() does the work.
1102 ustring::size_type ustring::copy(char* dest, ustring::size_type n, ustring::size_type i) const
1104 const Utf8SubstrBounds bounds (string_, i, n);
1105 return string_.copy(dest, bounds.n, bounds.i);
1109 /**** Glib::ustring -- UTF-8 utilities *************************************/
// validate() overloads: check the bytes of string_ with g_utf8_validate().
// NOTE(review): braces are not part of this extract.

// Plain yes/no check; no end pointer is requested.
1111 bool ustring::validate() const
1113 return (g_utf8_validate(string_.data(), string_.size(), 0) != 0);
// Also reports a position through first_invalid: the end pointer written by
// g_utf8_validate() is converted into an iterator at the same byte distance
// from the start of string_.
1116 bool ustring::validate(ustring::iterator& first_invalid)
1118 const char *const pdata = string_.data();
1119 const char* valid_end = pdata;
1120 const int is_valid = g_utf8_validate(pdata, string_.size(), &valid_end);
1122 first_invalid = iterator(string_.begin() + (valid_end - pdata));
1123 return (is_valid != 0);
// Const variant of the overload above.
1126 bool ustring::validate(ustring::const_iterator& first_invalid) const
1128 const char *const pdata = string_.data();
1129 const char* valid_end = pdata;
1130 const int is_valid = g_utf8_validate(pdata, string_.size(), &valid_end);
1132 first_invalid = const_iterator(string_.begin() + (valid_end - pdata));
1133 return (is_valid != 0);
// Scans every byte of string_ and tests its high bit (0x80).
// NOTE(review): braces and both return statements are not part of this
// extract; presumably a set high bit yields false and a clean scan yields true.
1136 bool ustring::is_ascii() const
1138 const char* p = string_.data();
1139 const char *const pend = p + string_.size();
1141 for(; p != pend; ++p)
1143 if((static_cast<unsigned char>(*p) & 0x80u) != 0)
// Unicode transformations. Each function calls the GLib routine of the same
// purpose on the bytes of string_, keeps the returned buffer in a
// ScopedPtr<char>, and constructs the result ustring from it.
// NOTE(review): braces are not part of this extract.

// mode is converted to GNormalizeMode through int.
1150 ustring ustring::normalize(NormalizeMode mode) const
1152 const ScopedPtr<char> buf (g_utf8_normalize(string_.data(), string_.size(),
1153 static_cast<GNormalizeMode>(int(mode))));
1154 return ustring(buf.get());
1157 ustring ustring::uppercase() const
1159 const ScopedPtr<char> buf (g_utf8_strup(string_.data(), string_.size()));
1160 return ustring(buf.get());
1163 ustring ustring::lowercase() const
1165 const ScopedPtr<char> buf (g_utf8_strdown(string_.data(), string_.size()));
1166 return ustring(buf.get());
1169 ustring ustring::casefold() const
1171 const ScopedPtr<char> buf (g_utf8_casefold(string_.data(), string_.size()));
1172 return ustring(buf.get());
// Collation keys, returned as plain std::string.
// NOTE(review): braces are not part of this extract.

// Key produced by g_utf8_collate_key() for the bytes of string_.
1175 std::string ustring::collate_key() const
1177 const ScopedPtr<char> buf (g_utf8_collate_key(string_.data(), string_.size()));
1178 return std::string(buf.get());
// Case-folds first, then builds the collation key from the folded text.
// The intermediate casefold buffer is released with g_free() as soon as the
// key exists; the key buffer is handed to a ScopedPtr before being copied.
1181 std::string ustring::casefold_collate_key() const
1183 char *const casefold_buf = g_utf8_casefold(string_.data(), string_.size());
1184 char *const key_buf = g_utf8_collate_key(casefold_buf, -1);
1185 g_free(casefold_buf);
1186 return std::string(ScopedPtr<char>(key_buf).get());
1189 /**** Glib::ustring -- Message formatting **********************************/
// Builds a string from fmt by substituting "%N" placeholders with entries of
// argv: the digit after '%' selects argv[digit - 1].
// NOTE(review): braces, the declaration of result, the branch condition in
// front of the first append, the updates of start and the final return are
// not part of this extract.
1192 ustring ustring::compose_argv(const Glib::ustring& fmt, int argc, const ustring* const* argv)
1194 std::string::size_type result_size = fmt.raw().size();
1196 // Guesstimate the final string size.
1197 for (int i = 0; i < argc; ++i)
1198 result_size += argv[i]->raw().size();
1201 result.reserve(result_size);
1203 const char* const pfmt = fmt.raw().c_str();
1204 const char* start = pfmt;
// Process the format string one '%' at a time.
1206 while (const char* const stop = std::strchr(start, '%'))
// Copies the text from start up to and including the '%' itself.
1210 result.append(start, stop - start + 1);
// Digit after '%' converted to a zero-based argument index.
1215 const int index = Ascii::digit_value(stop[1]) - 1;
// Valid index: copy the text before '%' and then the selected argument.
1217 if (index >= 0 && index < argc)
1219 result.append(start, stop - start);
1220 result += argv[index]->raw();
// Otherwise skip one whole character after '%', unless '%' is the last
// byte of the format string.
1225 const char* const next = (stop[1] != '\0') ? g_utf8_next_char(stop + 1) : (stop + 1);
1227 // Copy invalid substitutions literally to the output.
1228 result.append(start, next - start);
// The warning quotes the offending substitution from the tail of result.
1230 g_warning("invalid substitution \"%s\" in fmt string \"%s\"",
1231 result.c_str() + result.size() - (next - stop), pfmt);
// Append whatever is left of fmt after the last '%'.
1237 result.append(start, pfmt + fmt.raw().size() - start);
1242 /**** Glib::ustring::SequenceToString **************************************/
// SequenceToString specializations for ustring iterators: the std::string
// base is constructed directly from the underlying byte iterators of the
// range [pbegin, pend).
// NOTE(review): the ':' that introduces the initializer and the empty bodies
// are not part of this extract.
1244 ustring::SequenceToString<Glib::ustring::iterator,gunichar>
1245 ::SequenceToString(Glib::ustring::iterator pbegin, Glib::ustring::iterator pend)
1247 std::string(pbegin.base(), pend.base())
1250 ustring::SequenceToString<Glib::ustring::const_iterator,gunichar>
1251 ::SequenceToString(Glib::ustring::const_iterator pbegin, Glib::ustring::const_iterator pend)
1253 std::string(pbegin.base(), pend.base())
1256 /**** Glib::ustring::FormatStream ******************************************/
// FormatStream: constructor, destructor and conversion of the buffered
// stream contents to a UTF-8 ustring.
// NOTE(review): braces, the constructor/destructor bodies, the declarations
// of error and n_bytes, the error test and several preprocessor lines
// (else/endif) are not part of this extract.
1258 ustring::FormatStream::FormatStream()
1263 ustring::FormatStream::~FormatStream()
// Converts stream_.str() to UTF-8. The conversion routine is chosen at
// compile time from the configuration macros below.
1266 ustring ustring::FormatStream::to_string() const
1270 #ifdef GLIBMM_HAVE_WIDE_STREAM
1271 const std::wstring str = stream_.str();
1273 # if defined(__STDC_ISO_10646__) && SIZEOF_WCHAR_T == 4
1274 // Avoid going through iconv if wchar_t always contains UCS-4.
1276 const ScopedPtr<char> buf (g_ucs4_to_utf8(reinterpret_cast<const gunichar*>(str.data()),
1277 str.size(), 0, &n_bytes, &error));
1278 # elif defined(G_OS_WIN32) && SIZEOF_WCHAR_T == 2
1279 // Avoid going through iconv if wchar_t always contains UTF-16.
1281 const ScopedPtr<char> buf (g_utf16_to_utf8(reinterpret_cast<const gunichar2*>(str.data()),
1282 str.size(), 0, &n_bytes, &error));
// Generic fallback: g_convert() from "WCHAR_T" to "UTF-8"; the input length
// is given in bytes, hence the multiplication by the wchar_t size.
1285 const ScopedPtr<char> buf (g_convert(reinterpret_cast<const char*>(str.data()),
1286 str.size() * sizeof(std::wstring::value_type),
1287 "UTF-8", "WCHAR_T", 0, &n_bytes, &error));
1288 # endif /* !(__STDC_ISO_10646__ || G_OS_WIN32) */
1290 #else /* !GLIBMM_HAVE_WIDE_STREAM */
// Narrow stream: the text is converted with g_locale_to_utf8().
1291 const std::string str = stream_.str();
1294 const ScopedPtr<char> buf (g_locale_to_utf8(str.data(), str.size(), 0, &n_bytes, &error));
1295 #endif /* !GLIBMM_HAVE_WIDE_STREAM */
// Error reporting: throw when exceptions are enabled; the g_warning() and
// g_error_free() lines belong to the alternative branch.
1299 #ifdef GLIBMM_EXCEPTIONS_ENABLED
1300 Glib::Error::throw_exception(error);
1302 g_warning("%s: %s", G_STRFUNC, error->message);
1303 g_error_free(error);
// n_bytes was filled in by the conversion call above.
1308 return ustring(buf.get(), buf.get() + n_bytes);
1311 /**** Glib::ustring -- stream I/O operators ********************************/
// Stream extraction: the extracted text str is converted with
// g_locale_to_utf8() and the resulting n_bytes bytes are assigned to
// utf8_string.
// NOTE(review): braces, the declarations of str, error and n_bytes, the read
// from is, the error test, the else/endif lines and the final return are not
// part of this extract.
1313 std::istream& operator>>(std::istream& is, Glib::ustring& utf8_string)
1320 const ScopedPtr<char> buf (g_locale_to_utf8(str.data(), str.size(), 0, &n_bytes, &error));
// Error reporting: throw when exceptions are enabled; the g_warning() and
// g_error_free() lines belong to the alternative branch.
1324 #ifdef GLIBMM_EXCEPTIONS_ENABLED
1325 Glib::Error::throw_exception(error);
1327 g_warning("%s: %s", G_STRFUNC, error->message);
1328 g_error_free(error);
1333 utf8_string.assign(buf.get(), buf.get() + n_bytes);
// Stream insertion of a Glib::ustring into a narrow output stream.
// The raw UTF-8 bytes of utf8_string are converted to the locale encoding
// with g_locale_from_utf8() before being written.
1338 std::ostream& operator<<(std::ostream& os, const Glib::ustring& utf8_string)
// Both size outputs are passed as 0: neither the number of bytes read nor the
// number of bytes written is requested.
1341 const ScopedPtr<char> buf (g_locale_from_utf8(utf8_string.raw().data(),
1342 utf8_string.raw().size(), 0, 0, &error));
1345 #ifdef GLIBMM_EXCEPTIONS_ENABLED
// With exceptions enabled the GError is handed to Glib::Error::throw_exception().
1346 Glib::Error::throw_exception(error);
// NOTE(review): the next two lines look like the fallback for builds without
// exceptions (log the message, then free the GError) -- confirm.
1348 g_warning("%s: %s", G_STRFUNC, error->message);
1349 g_error_free(error);
1354 // This won't work if the string contains NUL characters. Unfortunately,
1355 // std::ostream::write() ignores format flags, so we cannot use that.
1356 // The only option would be to create a temporary std::string. However,
1357 // even then GCC's libstdc++-v3 prints only the characters up to the first
1358 // NUL. Given this, there doesn't seem much of a point in allowing NUL in
1359 // formatted output. The semantics would be unclear anyway: what's the
1360 // screen width of a NUL?
1366 #ifdef GLIBMM_HAVE_WIDE_STREAM
// Stream extraction into a ustring from a wide input stream.
// The wide characters held in wstr are converted to UTF-8 and assigned to
// utf8_string. The conversion routine is chosen at compile time:
// g_ucs4_to_utf8() when wchar_t is known to hold UCS-4, g_utf16_to_utf8() on
// Win32 with a 2-byte wchar_t, and g_convert() from "WCHAR_T" otherwise.
// NOTE(review): wstr is presumably the std::wstring read from `is` just
// before the conversion -- confirm.
1368 std::wistream& operator>>(std::wistream& is, ustring& utf8_string)
1375 #if defined(__STDC_ISO_10646__) && SIZEOF_WCHAR_T == 4
1376 // Avoid going through iconv if wchar_t always contains UCS-4.
// The length is passed in characters (wstr.size()).
1378 const ScopedPtr<char> buf (g_ucs4_to_utf8(reinterpret_cast<const gunichar*>(wstr.data()),
1379 wstr.size(), 0, &n_bytes, &error));
1380 #elif defined(G_OS_WIN32) && SIZEOF_WCHAR_T == 2
1381 // Avoid going through iconv if wchar_t always contains UTF-16.
1383 const ScopedPtr<char> buf (g_utf16_to_utf8(reinterpret_cast<const gunichar2*>(wstr.data()),
1384 wstr.size(), 0, &n_bytes, &error));
// Generic case: g_convert() works on raw bytes, so the wide buffer is viewed
// as char and its length is scaled by the size of one wide character.
1387 const ScopedPtr<char> buf (g_convert(reinterpret_cast<const char*>(wstr.data()),
1388 wstr.size() * sizeof(std::wstring::value_type),
1389 "UTF-8", "WCHAR_T", 0, &n_bytes, &error));
1390 #endif /* !(__STDC_ISO_10646__ || G_OS_WIN32) */
1394 #ifdef GLIBMM_EXCEPTIONS_ENABLED
// With exceptions enabled the GError is handed to Glib::Error::throw_exception().
1395 Glib::Error::throw_exception(error);
// NOTE(review): the next two lines look like the fallback for builds without
// exceptions (log the message, then free the GError) -- confirm.
1397 g_warning("%s: %s", G_STRFUNC, error->message);
1398 g_error_free(error);
// Replace the target's contents with exactly n_bytes bytes of converted text.
1403 utf8_string.assign(buf.get(), buf.get() + n_bytes);
// Stream insertion of a ustring into a wide output stream.
// The raw UTF-8 bytes of utf8_string are converted to the platform's wchar_t
// representation and written to os as a NUL-terminated wide string. The
// conversion routine is chosen at compile time: g_utf8_to_ucs4() when wchar_t
// is known to hold UCS-4, g_utf8_to_utf16() on Win32 with a 2-byte wchar_t,
// and g_convert() to "WCHAR_T" otherwise.
1408 std::wostream& operator<<(std::wostream& os, const ustring& utf8_string)
1412 #if defined(__STDC_ISO_10646__) && SIZEOF_WCHAR_T == 4
1413 // Avoid going through iconv if wchar_t always contains UCS-4.
// Both item-count outputs are passed as 0, i.e. they are not requested.
1414 const ScopedPtr<gunichar> buf (g_utf8_to_ucs4(utf8_string.raw().data(),
1415 utf8_string.raw().size(), 0, 0, &error));
1416 #elif defined(G_OS_WIN32) && SIZEOF_WCHAR_T == 2
1417 // Avoid going through iconv if wchar_t always contains UTF-16.
1418 const ScopedPtr<gunichar2> buf (g_utf8_to_utf16(utf8_string.raw().data(),
1419 utf8_string.raw().size(), 0, 0, &error));
1421 // TODO: For some reason the conversion from UTF-8 to WCHAR_T doesn't work
1422 // with g_convert(), while iconv on the command line handles it just fine.
1423 // Maybe a bug in GLib?
1424 const ScopedPtr<char> buf (g_convert(utf8_string.raw().data(), utf8_string.raw().size(),
1425 "WCHAR_T", "UTF-8", 0, 0, &error));
1426 #endif /* !(__STDC_ISO_10646__ || G_OS_WIN32) */
1430 #ifdef GLIBMM_EXCEPTIONS_ENABLED
// With exceptions enabled the GError is handed to Glib::Error::throw_exception().
1431 Glib::Error::throw_exception(error);
// NOTE(review): the next two lines look like the fallback for builds without
// exceptions (log the message, then free the GError) -- confirm.
1433 g_warning("%s: %s", G_STRFUNC, error->message);
1434 g_error_free(error);
1439 // This won't work if the string contains NUL characters. Unfortunately,
1440 // std::wostream::write() ignores format flags, so we cannot use that.
1441 // The only option would be to create a temporary std::wstring. However,
1442 // even then GCC's libstdc++-v3 prints only the characters up to the first
1443 // NUL. Given this, there doesn't seem much of a point in allowing NUL in
1444 // formatted output. The semantics would be unclear anyway: what's the
1445 // screen width of a NUL?
// The element type of buf differs per branch above (gunichar, gunichar2 or
// char), so the buffer is reinterpreted as wchar_t* for formatted output.
1446 os << reinterpret_cast<wchar_t*>(buf.get());
1451 #endif /* GLIBMM_HAVE_WIDE_STREAM */