encodedstream.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. // Tencent is pleased to support the open source community by making RapidJSON available.
  2. //
  3. // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
  4. //
  5. // Licensed under the MIT License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // http://opensource.org/licenses/MIT
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #ifndef RAPIDJSON_ENCODEDSTREAM_H_
  15. #define RAPIDJSON_ENCODEDSTREAM_H_
  16. #include "stream.h"
  17. #include "memorystream.h"
  18. #ifdef __GNUC__
  19. RAPIDJSON_DIAG_PUSH
  20. RAPIDJSON_DIAG_OFF(effc++)
  21. #endif
  22. #ifdef __clang__
  23. RAPIDJSON_DIAG_PUSH
  24. RAPIDJSON_DIAG_OFF(padded)
  25. #endif
  26. RAPIDJSON_NAMESPACE_BEGIN
  27. //! Input byte stream wrapper with a statically bound encoding.
  28. /*!
  29. \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
  30. \tparam InputByteStream Type of input byte stream. For example, FileReadStream.
  31. */
  32. template <typename Encoding, typename InputByteStream>
  33. class EncodedInputStream {
  34. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  35. public:
  36. typedef typename Encoding::Ch Ch;
  37. EncodedInputStream(InputByteStream& is) : is_(is) {
  38. current_ = Encoding::TakeBOM(is_);
  39. }
  40. Ch Peek() const { return current_; }
  41. Ch Take() { Ch c = current_; current_ = Encoding::Take(is_); return c; }
  42. size_t Tell() const { return is_.Tell(); }
  43. // Not implemented
  44. void Put(Ch) { RAPIDJSON_ASSERT(false); }
  45. void Flush() { RAPIDJSON_ASSERT(false); }
  46. Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
  47. size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
  48. private:
  49. EncodedInputStream(const EncodedInputStream&);
  50. EncodedInputStream& operator=(const EncodedInputStream&);
  51. InputByteStream& is_;
  52. Ch current_;
  53. };
  54. //! Specialized for UTF8 MemoryStream.
  55. template <>
  56. class EncodedInputStream<UTF8<>, MemoryStream> {
  57. public:
  58. typedef UTF8<>::Ch Ch;
  59. EncodedInputStream(MemoryStream& is) : is_(is) {
  60. if (static_cast<unsigned char>(is_.Peek()) == 0xEFu) is_.Take();
  61. if (static_cast<unsigned char>(is_.Peek()) == 0xBBu) is_.Take();
  62. if (static_cast<unsigned char>(is_.Peek()) == 0xBFu) is_.Take();
  63. }
  64. Ch Peek() const { return is_.Peek(); }
  65. Ch Take() { return is_.Take(); }
  66. size_t Tell() const { return is_.Tell(); }
  67. // Not implemented
  68. void Put(Ch) {}
  69. void Flush() {}
  70. Ch* PutBegin() { return 0; }
  71. size_t PutEnd(Ch*) { return 0; }
  72. MemoryStream& is_;
  73. private:
  74. EncodedInputStream(const EncodedInputStream&);
  75. EncodedInputStream& operator=(const EncodedInputStream&);
  76. };
  77. //! Output byte stream wrapper with statically bound encoding.
  78. /*!
  79. \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
  80. \tparam OutputByteStream Type of input byte stream. For example, FileWriteStream.
  81. */
  82. template <typename Encoding, typename OutputByteStream>
  83. class EncodedOutputStream {
  84. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  85. public:
  86. typedef typename Encoding::Ch Ch;
  87. EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) {
  88. if (putBOM)
  89. Encoding::PutBOM(os_);
  90. }
  91. void Put(Ch c) { Encoding::Put(os_, c); }
  92. void Flush() { os_.Flush(); }
  93. // Not implemented
  94. Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;}
  95. Ch Take() { RAPIDJSON_ASSERT(false); return 0;}
  96. size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
  97. Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
  98. size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
  99. private:
  100. EncodedOutputStream(const EncodedOutputStream&);
  101. EncodedOutputStream& operator=(const EncodedOutputStream&);
  102. OutputByteStream& os_;
  103. };
  104. #define RAPIDJSON_ENCODINGS_FUNC(x) UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x
  105. //! Input stream wrapper with dynamically bound encoding and automatic encoding detection.
  106. /*!
  107. \tparam CharType Type of character for reading.
  108. \tparam InputByteStream type of input byte stream to be wrapped.
  109. */
  110. template <typename CharType, typename InputByteStream>
  111. class AutoUTFInputStream {
  112. RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
  113. public:
  114. typedef CharType Ch;
  115. //! Constructor.
  116. /*!
  117. \param is input stream to be wrapped.
  118. \param type UTF encoding type if it is not detected from the stream.
  119. */
  120. AutoUTFInputStream(InputByteStream& is, UTFType type = kUTF8) : is_(&is), type_(type), hasBOM_(false) {
  121. RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
  122. DetectType();
  123. static const TakeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Take) };
  124. takeFunc_ = f[type_];
  125. current_ = takeFunc_(*is_);
  126. }
  127. UTFType GetType() const { return type_; }
  128. bool HasBOM() const { return hasBOM_; }
  129. Ch Peek() const { return current_; }
  130. Ch Take() { Ch c = current_; current_ = takeFunc_(*is_); return c; }
  131. size_t Tell() const { return is_->Tell(); }
  132. // Not implemented
  133. void Put(Ch) { RAPIDJSON_ASSERT(false); }
  134. void Flush() { RAPIDJSON_ASSERT(false); }
  135. Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
  136. size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
  137. private:
  138. AutoUTFInputStream(const AutoUTFInputStream&);
  139. AutoUTFInputStream& operator=(const AutoUTFInputStream&);
  140. // Detect encoding type with BOM or RFC 4627
  141. void DetectType() {
  142. // BOM (Byte Order Mark):
  143. // 00 00 FE FF UTF-32BE
  144. // FF FE 00 00 UTF-32LE
  145. // FE FF UTF-16BE
  146. // FF FE UTF-16LE
  147. // EF BB BF UTF-8
  148. const unsigned char* c = reinterpret_cast<const unsigned char *>(is_->Peek4());
  149. if (!c)
  150. return;
  151. unsigned bom = static_cast<unsigned>(c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24));
  152. hasBOM_ = false;
  153. if (bom == 0xFFFE0000) { type_ = kUTF32BE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
  154. else if (bom == 0x0000FEFF) { type_ = kUTF32LE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
  155. else if ((bom & 0xFFFF) == 0xFFFE) { type_ = kUTF16BE; hasBOM_ = true; is_->Take(); is_->Take(); }
  156. else if ((bom & 0xFFFF) == 0xFEFF) { type_ = kUTF16LE; hasBOM_ = true; is_->Take(); is_->Take(); }
  157. else if ((bom & 0xFFFFFF) == 0xBFBBEF) { type_ = kUTF8; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); }
  158. // RFC 4627: Section 3
  159. // "Since the first two characters of a JSON text will always be ASCII
  160. // characters [RFC0020], it is possible to determine whether an octet
  161. // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
  162. // at the pattern of nulls in the first four octets."
  163. // 00 00 00 xx UTF-32BE
  164. // 00 xx 00 xx UTF-16BE
  165. // xx 00 00 00 UTF-32LE
  166. // xx 00 xx 00 UTF-16LE
  167. // xx xx xx xx UTF-8
  168. if (!hasBOM_) {
  169. int pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0);
  170. switch (pattern) {
  171. case 0x08: type_ = kUTF32BE; break;
  172. case 0x0A: type_ = kUTF16BE; break;
  173. case 0x01: type_ = kUTF32LE; break;
  174. case 0x05: type_ = kUTF16LE; break;
  175. case 0x0F: type_ = kUTF8; break;
  176. default: break; // Use type defined by user.
  177. }
  178. }
  179. // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
  180. if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
  181. if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
  182. }
  183. typedef Ch (*TakeFunc)(InputByteStream& is);
  184. InputByteStream* is_;
  185. UTFType type_;
  186. Ch current_;
  187. TakeFunc takeFunc_;
  188. bool hasBOM_;
  189. };
  190. //! Output stream wrapper with dynamically bound encoding and automatic encoding detection.
  191. /*!
  192. \tparam CharType Type of character for writing.
  193. \tparam OutputByteStream type of output byte stream to be wrapped.
  194. */
  195. template <typename CharType, typename OutputByteStream>
  196. class AutoUTFOutputStream {
  197. RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
  198. public:
  199. typedef CharType Ch;
  200. //! Constructor.
  201. /*!
  202. \param os output stream to be wrapped.
  203. \param type UTF encoding type.
  204. \param putBOM Whether to write BOM at the beginning of the stream.
  205. */
  206. AutoUTFOutputStream(OutputByteStream& os, UTFType type, bool putBOM) : os_(&os), type_(type) {
  207. RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
  208. // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
  209. if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
  210. if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
  211. static const PutFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Put) };
  212. putFunc_ = f[type_];
  213. if (putBOM)
  214. PutBOM();
  215. }
  216. UTFType GetType() const { return type_; }
  217. void Put(Ch c) { putFunc_(*os_, c); }
  218. void Flush() { os_->Flush(); }
  219. // Not implemented
  220. Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;}
  221. Ch Take() { RAPIDJSON_ASSERT(false); return 0;}
  222. size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
  223. Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
  224. size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
  225. private:
  226. AutoUTFOutputStream(const AutoUTFOutputStream&);
  227. AutoUTFOutputStream& operator=(const AutoUTFOutputStream&);
  228. void PutBOM() {
  229. typedef void (*PutBOMFunc)(OutputByteStream&);
  230. static const PutBOMFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(PutBOM) };
  231. f[type_](*os_);
  232. }
  233. typedef void (*PutFunc)(OutputByteStream&, Ch);
  234. OutputByteStream* os_;
  235. UTFType type_;
  236. PutFunc putFunc_;
  237. };
  238. #undef RAPIDJSON_ENCODINGS_FUNC
  239. RAPIDJSON_NAMESPACE_END
  240. #ifdef __clang__
  241. RAPIDJSON_DIAG_POP
  242. #endif
  243. #ifdef __GNUC__
  244. RAPIDJSON_DIAG_POP
  245. #endif
  246. #endif // RAPIDJSON_ENCODEDSTREAM_H_