diff --git a/CMakeLists.txt b/CMakeLists.txt index 296eebd5..7506456b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,6 +69,7 @@ set( IXWEBSOCKET_HEADERS ixwebsocket/IXSocketFactory.h ixwebsocket/IXSocketServer.h ixwebsocket/IXUrlParser.h + ixwebsocket/IXUtf8Validator.h ixwebsocket/IXUserAgent.h ixwebsocket/IXWebSocket.h ixwebsocket/IXWebSocketCloseConstants.h diff --git a/DOCKER_VERSION b/DOCKER_VERSION index 220d8e0a..8710cfdf 100644 --- a/DOCKER_VERSION +++ b/DOCKER_VERSION @@ -1 +1 @@ -5.1.5 +5.1.6 diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 324d20a3..60c68198 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -1,6 +1,11 @@ # Changelog All notable changes to this project will be documented in this file. +## [5.1.6] - 2019-09-03 + +Sending invalid UTF-8 TEXT message should fail and close the connection (fix remaining autobahn test: 6.X UTF-8 Handling) +Fix failing unittest which was sending binary data in text mode with WebSocket::send, so that it properly calls WebSocket::sendBinary instead. + ## [5.1.5] - 2019-09-03 Framentation: data and continuation blocks received out of order (fix autobahn test: 5.9 through 5.20 Fragmentation) diff --git a/docs/design.md b/docs/design.md index 634dcf8c..1f4eee7b 100644 --- a/docs/design.md +++ b/docs/design.md @@ -32,7 +32,6 @@ The regression test is running after each commit on travis. * On Windows TLS is not setup yet to validate certificates. * There is no convenient way to embed a ca cert. -* No utf-8 validation is made when sending TEXT message with sendText() * Automatic reconnection works at the TCP socket level, and will detect remote end disconnects. However, if the device/computer network become unreachable (by turning off wifi), it is quite hard to reliably and timely detect it at the socket level using `recv` and `send` error codes. [Here](https://stackoverflow.com/questions/14782143/linux-socket-how-to-detect-disconnected-network-in-a-client-program) is a good discussion on the subject. 
This behavior is consistent with other runtimes such as node.js. One way to detect a disconnected device with low level C code is to do a name resolution with DNS but this can be expensive. Mobile devices have good and reliable API to do that. * The server code is using select to detect incoming data, and creates one OS thread per connection. This is not as scalable as strategies using epoll or kqueue. diff --git a/ixwebsocket/IXUtf8Validator.h b/ixwebsocket/IXUtf8Validator.h new file mode 100644 index 00000000..276b0109 --- /dev/null +++ b/ixwebsocket/IXUtf8Validator.h @@ -0,0 +1,167 @@ +/* + * The following code is adapted from code originally written by Bjoern + * Hoehrmann <bjoern@hoehrmann.de>. See + * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. + * + * The original license: + * + * Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+*/ + +/* + * IXUtf8Validator.h + * Author: Benjamin Sergeant + * Copyright (c) 2019 Machine Zone, Inc. All rights reserved. + * + * From websocketpp. Tiny modifications made for code style, function names etc... + */ + +#pragma once + +#include +#include + +namespace ix +{ + /// State that represents a valid utf8 input sequence + static unsigned int const utf8_accept = 0; + /// State that represents an invalid utf8 input sequence + static unsigned int const utf8_reject = 1; + + /// Lookup table for the UTF8 decode state machine + static uint8_t const utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 + }; + + /// Decode the next byte of a UTF8 sequence + /** + * @param [in,out] state The decoder state; read, then advanced in place + * @param [in,out] codep The codepoint accumulator; read and updated in place + * @param [in] byte The byte to input + * @return The ending state of the decode operation + */ + inline uint32_t decodeNextByte(uint32_t * state, uint32_t * codep, uint8_t byte) + { + 
uint32_t type = utf8d[byte]; + + *codep = (*state != utf8_accept) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state*16 + type]; + return *state; + } + + /// Provides streaming UTF8 validation functionality + class Utf8Validator + { + public: + /// Construct and initialize the validator + Utf8Validator() : m_state(utf8_accept),m_codepoint(0) {} + + /// Advance the state of the validator with the next input byte + /** + * @param byte The byte to advance the validation state with + * @return false if the byte moved the validator to the reject state, true otherwise. + */ + bool consume(uint8_t byte) + { + if (decodeNextByte(&m_state,&m_codepoint,byte) == utf8_reject) + { + return false; + } + return true; + } + + /// Advance Validator state with input from an iterator pair + /** + * @param begin Input iterator to the start of the input range + * @param end Input iterator to the end of the input range + * @return false as soon as a byte moves the validator to the reject state, true if the whole range was consumed without rejection. + */ + template + bool decode(iterator_type begin, iterator_type end) + { + for (iterator_type it = begin; it != end; ++it) + { + unsigned int result = decodeNextByte( + &m_state, + &m_codepoint, + static_cast(*it) + ); + + if (result == utf8_reject) + { + return false; + } + } + return true; + } + + /// Return whether the input sequence ended on a valid utf8 codepoint + /** + * @return Whether or not the input sequence ended on a valid codepoint. + */ + bool complete() + { + return m_state == utf8_accept; + } + + /// Reset the Validator to decode another message + void reset() + { + m_state = utf8_accept; + m_codepoint = 0; + } + private: + uint32_t m_state; + uint32_t m_codepoint; + }; + + /// Validate a UTF8 string + /** + * Convenience function that creates a Validator, validates a complete string + * and returns the result. 
+ */ + inline bool validateUtf8(std::string const & s) + { + Utf8Validator v; + if (!v.decode(s.begin(),s.end())) + { + return false; + } + return v.complete(); + } + +} // namespace ix diff --git a/ixwebsocket/IXWebSocket.cpp b/ixwebsocket/IXWebSocket.cpp index 0449c260..4cef1a46 100644 --- a/ixwebsocket/IXWebSocket.cpp +++ b/ixwebsocket/IXWebSocket.cpp @@ -8,66 +8,12 @@ #include "IXSetThreadName.h" #include "IXWebSocketHandshake.h" #include "IXExponentialBackoff.h" +#include "IXUtf8Validator.h" #include #include +#include -namespace -{ - // - // Stolen from here http://www.zedwood.com/article/cpp-is-valid-utf8-string-function - // There doesn't seem to be anything in the C++ library so far to do that. - // The closest thing is code for converting from utf-8 to utf-16 or utf-32 but - // that isn't working well for some broken input strings. - // - bool isValidUtf8(const std::string& str) - { - size_t i = 0; - size_t ix = str.length(); - int c, n, j; - - for (; i < ix; i++) - { - c = (unsigned char) str[i]; - //if (c==0x09 || c==0x0a || c==0x0d || (0x20 <= c && c <= 0x7e) ) n = 0; // is_printable_ascii - if (0x00 <= c && c <= 0x7f) - { - n = 0; // 0bbbbbbb - } - else if ((c & 0xE0) == 0xC0) - { - n = 1; // 110bbbbb - } - else if ( c==0xed && i<(ix-1) && ((unsigned char)str[i+1] & 0xa0)==0xa0) - { - return false; //U+d800 to U+dfff - } - else if ((c & 0xF0) == 0xE0) - { - n = 2; // 1110bbbb - } - else if ((c & 0xF8) == 0xF0) - { - n = 3; // 11110bbb - } - //else if (($c & 0xFC) == 0xF8) n=4; // 111110bb //byte 5, unnecessary in 4 byte UTF-8 - //else if (($c & 0xFE) == 0xFC) n=5; // 1111110b //byte 6, unnecessary in 4 byte UTF-8 - else - { - return false; - } - - for (j=0; jsend(msg->str); + client->sendBinary(msg->str); } } }