net/src/HTTPParser.cpp

00001 // ------------------------------------------------------------------
00002 // pion-net: a C++ framework for building lightweight HTTP interfaces
00003 // ------------------------------------------------------------------
00004 // Copyright (C) 2007-2008 Atomic Labs, Inc.  (http://www.atomiclabs.com)
00005 //
00006 // Distributed under the Boost Software License, Version 1.0.
00007 // See http://www.boost.org/LICENSE_1_0.txt
00008 //
00009 
00010 #include <cstdlib>
00011 #include <boost/logic/tribool.hpp>
00012 #include <pion/net/HTTPParser.hpp>
00013 #include <pion/net/HTTPRequest.hpp>
00014 #include <pion/net/HTTPResponse.hpp>
00015 #include <pion/net/HTTPMessage.hpp>
00016 
00017 
00018 namespace pion {    // begin namespace pion
00019 namespace net {     // begin namespace net (Pion Network Library)
00020 
00021 
00022 // static members of HTTPParser
00023 
00024 const boost::uint32_t   HTTPParser::STATUS_MESSAGE_MAX = 1024;  // 1 KB
00025 const boost::uint32_t   HTTPParser::METHOD_MAX = 1024;  // 1 KB
00026 const boost::uint32_t   HTTPParser::RESOURCE_MAX = 256 * 1024;  // 256 KB
00027 const boost::uint32_t   HTTPParser::QUERY_STRING_MAX = 1024 * 1024; // 1 MB
00028 const boost::uint32_t   HTTPParser::HEADER_NAME_MAX = 1024; // 1 KB
00029 const boost::uint32_t   HTTPParser::HEADER_VALUE_MAX = 1024 * 1024; // 1 MB
00030 const boost::uint32_t   HTTPParser::QUERY_NAME_MAX = 1024;  // 1 KB
00031 const boost::uint32_t   HTTPParser::QUERY_VALUE_MAX = 1024 * 1024;  // 1 MB
00032 const boost::uint32_t   HTTPParser::COOKIE_NAME_MAX = 1024; // 1 KB
00033 const boost::uint32_t   HTTPParser::COOKIE_VALUE_MAX = 1024 * 1024; // 1 MB
00034 const std::size_t       HTTPParser::DEFAULT_CONTENT_MAX = 1024 * 1024;  // 1 MB
00035 
00036 
00037 // HTTPParser member functions
00038 
00039 boost::tribool HTTPParser::parse(HTTPMessage& http_msg)
00040 {
00041     PION_ASSERT(! eof() );
00042 
00043     boost::tribool rc = boost::indeterminate;
00044     std::size_t total_bytes_parsed = 0;
00045 
00046     if(http_msg.hasMissingPackets()) {
00047         http_msg.setDataAfterMissingPacket(true);
00048     }
00049 
00050     do {
00051         switch (m_message_parse_state) {
00052             // just started parsing the HTTP message
00053             case PARSE_START:
00054                 m_message_parse_state = PARSE_HEADERS;
00055                 // step through to PARSE_HEADERS
00056 
00057             // parsing the HTTP headers
00058             case PARSE_HEADERS:
00059                 rc = parseHeaders(http_msg);
00060                 total_bytes_parsed += m_bytes_last_read;
00061                 // check if we have finished parsing HTTP headers
00062                 if (rc == true) {
00063                     // finishHeaderParsing() updates m_message_parse_state
00064                     rc = finishHeaderParsing(http_msg);
00065                 }
00066                 break;
00067 
00068             // parsing chunked payload content
00069             case PARSE_CHUNKS:
00070                 rc = parseChunks(http_msg.getChunkCache());
00071                 total_bytes_parsed += m_bytes_last_read;
00072                 // check if we have finished parsing all chunks
00073                 if (rc == true) {
00074                     http_msg.concatenateChunks();
00075                 }
00076                 break;
00077 
00078             // parsing regular payload content with a known length
00079             case PARSE_CONTENT:
00080                 rc = consumeContent(http_msg);
00081                 total_bytes_parsed += m_bytes_last_read;
00082                 break;
00083 
00084             // parsing payload content with no length (until EOF)
00085             case PARSE_CONTENT_NO_LENGTH:
00086                 consumeContentAsNextChunk(http_msg.getChunkCache());
00087                 total_bytes_parsed += m_bytes_last_read;
00088                 break;
00089 
00090             // finished parsing the HTTP message
00091             case PARSE_END:
00092                 rc = true;
00093                 break;
00094         }
00095     } while ( boost::indeterminate(rc) && ! eof() );
00096 
00097     // check if we've finished parsing the HTTP message
00098     if (rc == true) {
00099         m_message_parse_state = PARSE_END;
00100         finish(http_msg);
00101     } else if(rc == false) {
00102         computeMsgStatus(http_msg, false);
00103     }
00104 
00105     // update bytes last read (aggregate individual operations for caller)
00106     m_bytes_last_read = total_bytes_parsed;
00107 
00108     return rc;
00109 }
00110 
00111 boost::tribool HTTPParser::parseMissingData(HTTPMessage& http_msg, std::size_t len)
00112 {
00113     static const char MISSING_DATA_CHAR = 'X';
00114     boost::tribool rc = boost::indeterminate;
00115 
00116     http_msg.setMissingPackets(true);
00117 
00118     switch (m_message_parse_state) {
00119 
00120         // cannot recover from missing data while parsing HTTP headers
00121         case PARSE_START:
00122         case PARSE_HEADERS:
00123             rc = false;
00124             break;
00125 
00126         // parsing chunked payload content
00127         case PARSE_CHUNKS:
00128             // parsing chunk data -> we can only recover if data fits into current chunk
00129             if (m_chunked_content_parse_state == PARSE_CHUNK
00130                 && m_bytes_read_in_current_chunk < m_size_of_current_chunk
00131                 && (m_size_of_current_chunk - m_bytes_read_in_current_chunk) >= len)
00132             {
00133                 // use dummy content for missing data
00134                 for (std::size_t n = 0; n < len && http_msg.getChunkCache().size() < m_max_content_length; ++n) 
00135                     http_msg.getChunkCache().push_back(MISSING_DATA_CHAR);
00136 
00137                 m_bytes_read_in_current_chunk += len;
00138                 m_bytes_last_read = len;
00139                 m_bytes_total_read += len;
00140                 m_bytes_content_read += len;
00141 
00142                 if (m_bytes_read_in_current_chunk == m_size_of_current_chunk) {
00143                     m_chunked_content_parse_state = PARSE_EXPECTING_CR_AFTER_CHUNK;
00144                 }
00145             } else {
00146                 // cannot recover from missing data
00147                 rc = false;
00148             }
00149             break;
00150 
00151         // parsing regular payload content with a known length
00152         case PARSE_CONTENT:
00153             // parsing content (with length) -> we can only recover if data fits into content
00154             if (m_bytes_content_remaining == 0) {
00155                 // we have all of the remaining payload content
00156                 rc = true;
00157             } else if (m_bytes_content_remaining < len) {
00158                 // cannot recover from missing data
00159                 rc = false;
00160             } else {
00161 
00162                 // make sure content buffer is not already full
00163                 if ( (m_bytes_content_read+len) <= m_max_content_length) {
00164                     // use dummy content for missing data
00165                     for (std::size_t n = 0; n < len; ++n)
00166                         http_msg.getContent()[m_bytes_content_read++] = MISSING_DATA_CHAR;
00167                 } else {
00168                     m_bytes_content_read += len;
00169                 }
00170 
00171                 m_bytes_content_remaining -= len;
00172                 m_bytes_total_read += len;
00173                 m_bytes_last_read = len;
00174 
00175                 if (m_bytes_content_remaining == 0)
00176                     rc = true;
00177             }
00178             break;
00179 
00180         // parsing payload content with no length (until EOF)
00181         case PARSE_CONTENT_NO_LENGTH:
00182             // use dummy content for missing data
00183             for (std::size_t n = 0; n < len && http_msg.getChunkCache().size() < m_max_content_length; ++n) 
00184                 http_msg.getChunkCache().push_back(MISSING_DATA_CHAR);
00185             m_bytes_last_read = len;
00186             m_bytes_total_read += len;
00187             m_bytes_content_read += len;
00188             break;
00189 
00190         // finished parsing the HTTP message
00191         case PARSE_END:
00192             rc = true;
00193             break;
00194     }
00195 
00196     // check if we've finished parsing the HTTP message
00197     if (rc == true) {
00198         m_message_parse_state = PARSE_END;
00199         finish(http_msg);
00200     } else if(rc == false) {
00201         computeMsgStatus(http_msg, false);
00202     }
00203 
00204     return rc;
00205 }
00206 
// Parses the start line and header fields of an HTTP message, one byte at a
// time, from m_read_ptr up to (not including) m_read_end_ptr.  The position
// within the grammar is kept in m_headers_parse_state, so the function can be
// called again with more data after it returns indeterminate.
//
// Parsed pieces are collected in m_method, m_resource, m_query_string,
// m_status_code and m_status_message; the HTTP version and each completed
// header are stored directly into http_msg.
//
// On a true or indeterminate return, m_bytes_last_read is the number of bytes
// consumed by this call and m_bytes_total_read has been advanced by the same
// amount.  On a false return m_bytes_last_read is left at 0 and
// m_bytes_total_read is not updated.
00207 boost::tribool HTTPParser::parseHeaders(HTTPMessage& http_msg)
00208 {
00209     //
00210     // note that boost::tribool may have one of THREE states:
00211     //
00212     // false: encountered an error while parsing HTTP headers
00213     // true: finished successfully parsing the HTTP headers
00214     // indeterminate: parsed bytes, but the HTTP headers are not yet finished
00215     //
00216     const char *read_start_ptr = m_read_ptr;
00217     m_bytes_last_read = 0;
00218     while (m_read_ptr < m_read_end_ptr) {
00219 
00220         switch (m_headers_parse_state) {
00221         case PARSE_METHOD_START:
00222             // we have not yet started parsing the HTTP method string
00223             if (*m_read_ptr != ' ' && *m_read_ptr!='\r' && *m_read_ptr!='\n') { // ignore leading whitespace
00224                 if (!isChar(*m_read_ptr) || isControl(*m_read_ptr) || isSpecial(*m_read_ptr))
00225                     return false;
00226                 m_headers_parse_state = PARSE_METHOD;
00227                 m_method.erase();
00228                 m_method.push_back(*m_read_ptr);
00229             }
00230             break;
00231 
00232         case PARSE_METHOD:
00233             // we have started parsing the HTTP method string
00234             if (*m_read_ptr == ' ') {
00235                 m_resource.erase();
00236                 m_headers_parse_state = PARSE_URI_STEM;
00237             } else if (!isChar(*m_read_ptr) || isControl(*m_read_ptr) || isSpecial(*m_read_ptr)) {
00238                 return false;
00239             } else if (m_method.size() >= METHOD_MAX) {
00240                 return false;
00241             } else {
00242                 m_method.push_back(*m_read_ptr);
00243             }
00244             break;
00245 
00246         case PARSE_URI_STEM:
00247             // we have started parsing the URI stem (or resource name)
00248             if (*m_read_ptr == ' ') {
00249                 m_headers_parse_state = PARSE_HTTP_VERSION_H;
00250             } else if (*m_read_ptr == '?') {
00251                 m_query_string.erase();
00252                 m_headers_parse_state = PARSE_URI_QUERY;
00253             } else if (isControl(*m_read_ptr)) {
00254                 return false;
00255             } else if (m_resource.size() >= RESOURCE_MAX) {
00256                 return false;
00257             } else {
00258                 m_resource.push_back(*m_read_ptr);
00259             }
00260             break;
00261 
00262         case PARSE_URI_QUERY:
00263             // we have started parsing the URI query string
00264             if (*m_read_ptr == ' ') {
00265                 m_headers_parse_state = PARSE_HTTP_VERSION_H;
00266             } else if (isControl(*m_read_ptr)) {
00267                 return false;
00268             } else if (m_query_string.size() >= QUERY_STRING_MAX) {
00269                 return false;
00270             } else {
00271                 m_query_string.push_back(*m_read_ptr);
00272             }
00273             break;
00274 
00275         case PARSE_HTTP_VERSION_H:
00276             // parsing "HTTP"
00277             if (*m_read_ptr != 'H') return false;
00278             m_headers_parse_state = PARSE_HTTP_VERSION_T_1;
00279             break;
00280 
00281         case PARSE_HTTP_VERSION_T_1:
00282             // parsing "HTTP"
00283             if (*m_read_ptr != 'T') return false;
00284             m_headers_parse_state = PARSE_HTTP_VERSION_T_2;
00285             break;
00286 
00287         case PARSE_HTTP_VERSION_T_2:
00288             // parsing "HTTP"
00289             if (*m_read_ptr != 'T') return false;
00290             m_headers_parse_state = PARSE_HTTP_VERSION_P;
00291             break;
00292 
00293         case PARSE_HTTP_VERSION_P:
00294             // parsing "HTTP"
00295             if (*m_read_ptr != 'P') return false;
00296             m_headers_parse_state = PARSE_HTTP_VERSION_SLASH;
00297             break;
00298 
00299         case PARSE_HTTP_VERSION_SLASH:
00300             // parsing slash after "HTTP"
00301             if (*m_read_ptr != '/') return false;
00302             m_headers_parse_state = PARSE_HTTP_VERSION_MAJOR_START;
00303             break;
00304 
00305         case PARSE_HTTP_VERSION_MAJOR_START:
00306             // parsing the first digit of the major version number
00307             if (!isDigit(*m_read_ptr)) return false;
00308             http_msg.setVersionMajor(*m_read_ptr - '0');
00309             m_headers_parse_state = PARSE_HTTP_VERSION_MAJOR;
00310             break;
00311 
00312         case PARSE_HTTP_VERSION_MAJOR:
00313             // parsing the major version number (not first digit)
00314             if (*m_read_ptr == '.') {
00315                 m_headers_parse_state = PARSE_HTTP_VERSION_MINOR_START;
00316             } else if (isDigit(*m_read_ptr)) {
00317                 http_msg.setVersionMajor( (http_msg.getVersionMajor() * 10)
00318                                           + (*m_read_ptr - '0') );
00319             } else {
00320                 return false;
00321             }
00322             break;
00323 
00324         case PARSE_HTTP_VERSION_MINOR_START:
00325             // parsing the first digit of the minor version number
00326             if (!isDigit(*m_read_ptr)) return false;
00327             http_msg.setVersionMinor(*m_read_ptr - '0');
00328             m_headers_parse_state = PARSE_HTTP_VERSION_MINOR;
00329             break;
00330 
00331         case PARSE_HTTP_VERSION_MINOR:
00332             // parsing the minor version number (not first digit)
            // what follows the version tells requests and responses apart:
            // a space introduces a status code, a line ending closes a request line
00333             if (*m_read_ptr == ' ') {
00334                 // should only happen for responses
00335                 if (m_is_request) return false;
00336                 m_headers_parse_state = PARSE_STATUS_CODE_START;
00337             } else if (*m_read_ptr == '\r') {
00338                 // should only happen for requests
00339                 if (! m_is_request) return false;
00340                 m_headers_parse_state = PARSE_EXPECTING_NEWLINE;
00341             } else if (*m_read_ptr == '\n') {
00342                 // should only happen for requests
00343                 if (! m_is_request) return false;
00344                 m_headers_parse_state = PARSE_EXPECTING_CR;
00345             } else if (isDigit(*m_read_ptr)) {
00346                 http_msg.setVersionMinor( (http_msg.getVersionMinor() * 10)
00347                                           + (*m_read_ptr - '0') );
00348             } else {
00349                 return false;
00350             }
00351             break;
00352 
00353         case PARSE_STATUS_CODE_START:
00354             // parsing the first digit of the response status code
00355             if (!isDigit(*m_read_ptr)) return false;
00356             m_status_code = (*m_read_ptr - '0');
00357             m_headers_parse_state = PARSE_STATUS_CODE;
00358             break;
00359 
00360         case PARSE_STATUS_CODE:
00361             // parsing the response status code (not first digit)
00362             if (*m_read_ptr == ' ') {
00363                 m_status_message.erase();
00364                 m_headers_parse_state = PARSE_STATUS_MESSAGE;
00365             } else if (isDigit(*m_read_ptr)) {
00366                 m_status_code = ( (m_status_code * 10) + (*m_read_ptr - '0') );
00367             } else {
00368                 return false;
00369             }
00370             break;
00371 
00372         case PARSE_STATUS_MESSAGE:
00373             // parsing the response status message
00374             if (*m_read_ptr == '\r') {
00375                 m_headers_parse_state = PARSE_EXPECTING_NEWLINE;
00376             } else if (*m_read_ptr == '\n') {
00377                 m_headers_parse_state = PARSE_EXPECTING_CR;
00378             } else if (isControl(*m_read_ptr)) {
00379                 return false;
00380             } else if (m_status_message.size() >= STATUS_MESSAGE_MAX) {
00381                 return false;
00382             } else {
00383                 m_status_message.push_back(*m_read_ptr);
00384             }
00385             break;
00386 
00387         case PARSE_EXPECTING_NEWLINE:
00388             // we received a CR; expecting a newline to follow
00389             if (*m_read_ptr == '\n') {
00390                 m_headers_parse_state = PARSE_HEADER_START;
00391             } else if (*m_read_ptr == '\r') {
00392                 // we received two CR's in a row
00393                 // assume CR only is (incorrectly) being used for line termination
00394                 // therefore, the message is finished
                // (returning here skips the raw-header append at the bottom of
                // the loop, so this byte is not added to m_raw_headers)
00395                 ++m_read_ptr;
00396                 m_bytes_last_read = (m_read_ptr - read_start_ptr);
00397                 m_bytes_total_read += m_bytes_last_read;
00398                 return true;
00399             } else if (*m_read_ptr == '\t' || *m_read_ptr == ' ') {
00400                 m_headers_parse_state = PARSE_HEADER_WHITESPACE;
00401             } else if (!isChar(*m_read_ptr) || isControl(*m_read_ptr) || isSpecial(*m_read_ptr)) {
00402                 return false;
00403             } else {
00404                 // assume it is the first character for the name of a header
00405                 m_header_name.erase();
00406                 m_header_name.push_back(*m_read_ptr);
00407                 m_headers_parse_state = PARSE_HEADER_NAME;
00408             }
00409             break;
00410 
00411         case PARSE_EXPECTING_CR:
00412             // we received a newline without a CR
00413             if (*m_read_ptr == '\r') {
00414                 m_headers_parse_state = PARSE_HEADER_START;
00415             } else if (*m_read_ptr == '\n') {
00416                 // we received two newlines in a row
00417                 // assume newline only is (incorrectly) being used for line termination
00418                 // therefore, the message is finished
00419                 ++m_read_ptr;
00420                 m_bytes_last_read = (m_read_ptr - read_start_ptr);
00421                 m_bytes_total_read += m_bytes_last_read;
00422                 return true;
00423             } else if (*m_read_ptr == '\t' || *m_read_ptr == ' ') {
00424                 m_headers_parse_state = PARSE_HEADER_WHITESPACE;
00425             } else if (!isChar(*m_read_ptr) || isControl(*m_read_ptr) || isSpecial(*m_read_ptr)) {
00426                 return false;
00427             } else {
00428                 // assume it is the first character for the name of a header
00429                 m_header_name.erase();
00430                 m_header_name.push_back(*m_read_ptr);
00431                 m_headers_parse_state = PARSE_HEADER_NAME;
00432             }
00433             break;
00434 
00435         case PARSE_HEADER_WHITESPACE:
00436             // parsing whitespace before a header name
            // (text after leading whitespace starts a new header name; it is
            // not appended to the previous header's value)
00437             if (*m_read_ptr == '\r') {
00438                 m_headers_parse_state = PARSE_EXPECTING_NEWLINE;
00439             } else if (*m_read_ptr == '\n') {
00440                 m_headers_parse_state = PARSE_EXPECTING_CR;
00441             } else if (*m_read_ptr != '\t' && *m_read_ptr != ' ') {
00442                 if (!isChar(*m_read_ptr) || isControl(*m_read_ptr) || isSpecial(*m_read_ptr))
00443                     return false;
00444                 // assume it is the first character for the name of a header
00445                 m_header_name.erase();
00446                 m_header_name.push_back(*m_read_ptr);
00447                 m_headers_parse_state = PARSE_HEADER_NAME;
00448             }
00449             break;
00450 
00451         case PARSE_HEADER_START:
00452             // parsing the start of a new header
00453             if (*m_read_ptr == '\r') {
00454                 m_headers_parse_state = PARSE_EXPECTING_FINAL_NEWLINE;
00455             } else if (*m_read_ptr == '\n') {
00456                 m_headers_parse_state = PARSE_EXPECTING_FINAL_CR;
00457             } else if (*m_read_ptr == '\t' || *m_read_ptr == ' ') {
00458                 m_headers_parse_state = PARSE_HEADER_WHITESPACE;
00459             } else if (!isChar(*m_read_ptr) || isControl(*m_read_ptr) || isSpecial(*m_read_ptr)) {
00460                 return false;
00461             } else {
00462                 // first character for the name of a header
00463                 m_header_name.erase();
00464                 m_header_name.push_back(*m_read_ptr);
00465                 m_headers_parse_state = PARSE_HEADER_NAME;
00466             }
00467             break;
00468 
00469         case PARSE_HEADER_NAME:
00470             // parsing the name of a header
00471             if (*m_read_ptr == ':') {
00472                 m_header_value.erase();
00473                 m_headers_parse_state = PARSE_SPACE_BEFORE_HEADER_VALUE;
00474             } else if (!isChar(*m_read_ptr) || isControl(*m_read_ptr) || isSpecial(*m_read_ptr)) {
00475                 return false;
00476             } else if (m_header_name.size() >= HEADER_NAME_MAX) {
00477                 return false;
00478             } else {
00479                 // character (not first) for the name of a header
00480                 m_header_name.push_back(*m_read_ptr);
00481             }
00482             break;
00483 
00484         case PARSE_SPACE_BEFORE_HEADER_VALUE:
00485             // parsing space character before a header's value
            // (only one space is skipped here; a line ending right after the
            // colon stores the header with an empty value)
00486             if (*m_read_ptr == ' ') {
00487                 m_headers_parse_state = PARSE_HEADER_VALUE;
00488             } else if (*m_read_ptr == '\r') {
00489                 http_msg.addHeader(m_header_name, m_header_value);
00490                 m_headers_parse_state = PARSE_EXPECTING_NEWLINE;
00491             } else if (*m_read_ptr == '\n') {
00492                 http_msg.addHeader(m_header_name, m_header_value);
00493                 m_headers_parse_state = PARSE_EXPECTING_CR;
00494             } else if (!isChar(*m_read_ptr) || isControl(*m_read_ptr) || isSpecial(*m_read_ptr)) {
00495                 return false;
00496             } else {
00497                 // assume it is the first character for the value of a header
00498                 m_header_value.push_back(*m_read_ptr);
00499                 m_headers_parse_state = PARSE_HEADER_VALUE;
00500             }
00501             break;
00502 
00503         case PARSE_HEADER_VALUE:
00504             // parsing the value of a header
00505             if (*m_read_ptr == '\r') {
00506                 http_msg.addHeader(m_header_name, m_header_value);
00507                 m_headers_parse_state = PARSE_EXPECTING_NEWLINE;
00508             } else if (*m_read_ptr == '\n') {
00509                 http_msg.addHeader(m_header_name, m_header_value);
00510                 m_headers_parse_state = PARSE_EXPECTING_CR;
00511             } else if (isControl(*m_read_ptr)) {
00512                 return false;
00513             } else if (m_header_value.size() >= HEADER_VALUE_MAX) {
00514                 return false;
00515             } else {
00516                 // character (not first) for the value of a header
00517                 m_header_value.push_back(*m_read_ptr);
00518             }
00519             break;
00520 
        // a blank line was started with CR: the headers are finished; the
        // matching LF is consumed only if it is the current byte, any other
        // byte is left unread
00521         case PARSE_EXPECTING_FINAL_NEWLINE:
00522             if (*m_read_ptr == '\n') ++m_read_ptr;
00523             m_bytes_last_read = (m_read_ptr - read_start_ptr);
00524             m_bytes_total_read += m_bytes_last_read;
00525             return true;
00526 
        // a blank line was started with LF: the headers are finished; a
        // trailing CR is consumed only if it is the current byte
00527         case PARSE_EXPECTING_FINAL_CR:
00528             if (*m_read_ptr == '\r') ++m_read_ptr;
00529             m_bytes_last_read = (m_read_ptr - read_start_ptr);
00530             m_bytes_total_read += m_bytes_last_read;
00531             return true;
00532         }
00533         
        // keep a verbatim copy of the byte just processed, if requested
        // (every return statement above bypasses this step)
00534         if (m_save_raw_headers)
00535             m_raw_headers += *m_read_ptr;
00536         
00537         ++m_read_ptr;
00538     }
00539 
    // ran out of input before the end of the headers was found
00540     m_bytes_last_read = (m_read_ptr - read_start_ptr);
00541     m_bytes_total_read += m_bytes_last_read;
00542     return boost::indeterminate;
00543 }
00544 
00545 void HTTPParser::updateMessageWithHeaderData(HTTPMessage& http_msg) const
00546 {
00547     if (isParsingRequest()) {
00548 
00549         // finish an HTTP request message
00550 
00551         HTTPRequest& http_request(dynamic_cast<HTTPRequest&>(http_msg));
00552         http_request.setMethod(m_method);
00553         http_request.setResource(m_resource);
00554         http_request.setQueryString(m_query_string);
00555 
00556         // parse query pairs from the URI query string
00557         if (! m_query_string.empty()) {
00558             if (! parseURLEncoded(http_request.getQueryParams(),
00559                                   m_query_string.c_str(),
00560                                   m_query_string.size())) 
00561                 PION_LOG_WARN(m_logger, "Request query string parsing failed (URI): \""
00562                     << m_query_string << "\"");
00563         }
00564 
00565         // parse "Cookie" headers in request
00566         std::pair<HTTPTypes::Headers::const_iterator, HTTPTypes::Headers::const_iterator>
00567         cookie_pair = http_request.getHeaders().equal_range(HTTPTypes::HEADER_COOKIE);
00568         for (HTTPTypes::Headers::const_iterator cookie_iterator = cookie_pair.first;
00569              cookie_iterator != http_request.getHeaders().end()
00570              && cookie_iterator != cookie_pair.second; ++cookie_iterator)
00571         {
00572             if (! parseCookieHeader(http_request.getCookieParams(),
00573                                     cookie_iterator->second, false) )
00574                 PION_LOG_WARN(m_logger, "Cookie header parsing failed");
00575         }
00576 
00577     } else {
00578 
00579         // finish an HTTP response message
00580 
00581         HTTPResponse& http_response(dynamic_cast<HTTPResponse&>(http_msg));
00582         http_response.setStatusCode(m_status_code);
00583         http_response.setStatusMessage(m_status_message);
00584 
00585         // parse "Set-Cookie" headers in response
00586         std::pair<HTTPTypes::Headers::const_iterator, HTTPTypes::Headers::const_iterator>
00587         cookie_pair = http_response.getHeaders().equal_range(HTTPTypes::HEADER_SET_COOKIE);
00588         for (HTTPTypes::Headers::const_iterator cookie_iterator = cookie_pair.first;
00589              cookie_iterator != http_response.getHeaders().end()
00590              && cookie_iterator != cookie_pair.second; ++cookie_iterator)
00591         {
00592             if (! parseCookieHeader(http_response.getCookieParams(),
00593                                     cookie_iterator->second, true) )
00594                 PION_LOG_WARN(m_logger, "Set-Cookie header parsing failed");
00595         }
00596 
00597     }
00598 }
00599 
00600 boost::tribool HTTPParser::finishHeaderParsing(HTTPMessage& http_msg)
00601 {
00602     boost::tribool rc = boost::indeterminate;
00603 
00604     m_bytes_content_remaining = m_bytes_content_read = 0;
00605     http_msg.setContentLength(0);
00606     http_msg.updateTransferCodingUsingHeader();
00607     updateMessageWithHeaderData(http_msg);
00608 
00609     if (http_msg.isChunked()) {
00610 
00611         // content is encoded using chunks
00612         m_message_parse_state = PARSE_CHUNKS;
00613 
00614     } else if (http_msg.isContentLengthImplied()) {
00615 
00616         // content length is implied to be zero
00617         m_message_parse_state = PARSE_END;
00618         rc = true;
00619 
00620     } else {
00621         // content length should be specified in the headers
00622 
00623         if (http_msg.hasHeader(HTTPTypes::HEADER_CONTENT_LENGTH)) {
00624 
00625             // message has a content-length header
00626             try {
00627                 http_msg.updateContentLengthUsingHeader();
00628             } catch (...) {
00629                 PION_LOG_ERROR(m_logger, "Unable to update content length");
00630                 return false;
00631             }
00632 
00633             // check if content-length header == 0
00634             if (http_msg.getContentLength() == 0) {
00635                 m_message_parse_state = PARSE_END;
00636                 rc = true;
00637             } else {
00638                 m_message_parse_state = PARSE_CONTENT;
00639                 m_bytes_content_remaining = http_msg.getContentLength();
00640 
00641                 // check if content-length exceeds maximum allowed
00642                 if (m_bytes_content_remaining > m_max_content_length)
00643                     http_msg.setContentLength(m_max_content_length);
00644             }
00645 
00646         } else {
00647             // no content-length specified, and the content length cannot 
00648             // otherwise be determined
00649 
00650             // only if not a request, read through the close of the connection
00651             if (! m_is_request) {
00652                 // clear the chunk buffers before we start
00653                 http_msg.getChunkCache().clear();
00654 
00655                 // continue reading content until there is no more data
00656                 m_message_parse_state = PARSE_CONTENT_NO_LENGTH;
00657             } else {
00658                 m_message_parse_state = PARSE_END;
00659                 rc = true;
00660             }
00661         }
00662     }
00663 
00664     // allocate a buffer for payload content (may be zero-size)
00665     http_msg.createContentBuffer();
00666 
00667     return rc;
00668 }
00669 
00670 bool HTTPParser::parseURLEncoded(HTTPTypes::QueryParams& dict,
00671                                  const char *ptr, const size_t len)
00672 {
00673     // used to track whether we are parsing the name or value
00674     enum QueryParseState {
00675         QUERY_PARSE_NAME, QUERY_PARSE_VALUE
00676     } parse_state = QUERY_PARSE_NAME;
00677 
00678     // misc other variables used for parsing
00679     const char * const end = ptr + len;
00680     std::string query_name;
00681     std::string query_value;
00682 
00683     // iterate through each encoded character
00684     while (ptr < end) {
00685         switch (parse_state) {
00686 
00687         case QUERY_PARSE_NAME:
00688             // parsing query name
00689             if (*ptr == '=') {
00690                 // end of name found
00691                 if (query_name.empty()) return false;
00692                 parse_state = QUERY_PARSE_VALUE;
00693             } else if (*ptr == '&') {
00694                 // if query name is empty, just skip it (i.e. "&&")
00695                 if (! query_name.empty()) {
00696                     // assume that "=" is missing -- it's OK if the value is empty
00697                     dict.insert( std::make_pair(query_name, query_value) );
00698                     query_name.erase();
00699                 }
00700             } else if (*ptr == '\r' || *ptr == '\n') {
00701                 // ignore linefeeds and carriage returns (normally within POST content)
00702             } else if (isControl(*ptr) || query_name.size() >= QUERY_NAME_MAX) {
00703                 // control character detected, or max sized exceeded
00704                 return false;
00705             } else {
00706                 // character is part of the name
00707                 query_name.push_back(*ptr);
00708             }
00709             break;
00710 
00711         case QUERY_PARSE_VALUE:
00712             // parsing query value
00713             if (*ptr == '&') {
00714                 // end of value found (OK if empty)
00715                 dict.insert( std::make_pair(query_name, query_value) );
00716                 query_name.erase();
00717                 query_value.erase();
00718                 parse_state = QUERY_PARSE_NAME;
00719             } else if (*ptr == '\r' || *ptr == '\n') {
00720                 // ignore linefeeds and carriage returns (normally within POST content)
00721             } else if (isControl(*ptr) || query_value.size() >= QUERY_VALUE_MAX) {
00722                 // control character detected, or max sized exceeded
00723                 return false;
00724             } else {
00725                 // character is part of the value
00726                 query_value.push_back(*ptr);
00727             }
00728             break;
00729         }
00730 
00731         ++ptr;
00732     }
00733 
00734     // handle last pair in string
00735     if (! query_name.empty())
00736         dict.insert( std::make_pair(query_name, query_value) );
00737 
00738     return true;
00739 }
00740 
00741 bool HTTPParser::parseCookieHeader(HTTPTypes::CookieParams& dict,
00742                                    const char *ptr, const size_t len,
00743                                    bool set_cookie_header)
00744 {
00745     // BASED ON RFC 2109
00746     // http://www.ietf.org/rfc/rfc2109.txt
00747     // 
00748     // The current implementation ignores cookie attributes which begin with '$'
00749     // (i.e. $Path=/, $Domain=, etc.)
00750 
00751     // used to track what we are parsing
00752     enum CookieParseState {
00753         COOKIE_PARSE_NAME, COOKIE_PARSE_VALUE, COOKIE_PARSE_IGNORE
00754     } parse_state = COOKIE_PARSE_NAME;
00755 
00756     // misc other variables used for parsing
00757     const char * const end = ptr + len;
00758     std::string cookie_name;
00759     std::string cookie_value;
00760     char value_quote_character = '\0';
00761 
00762     // iterate through each character
00763     while (ptr < end) {
00764         switch (parse_state) {
00765 
00766         case COOKIE_PARSE_NAME:
00767             // parsing cookie name
00768             if (*ptr == '=') {
00769                 // end of name found
00770                 if (cookie_name.empty()) return false;
00771                 value_quote_character = '\0';
00772                 parse_state = COOKIE_PARSE_VALUE;
00773             } else if (*ptr == ';' || *ptr == ',') {
00774                 // ignore empty cookie names since this may occur naturally
00775                 // when quoted values are encountered
00776                 if (! cookie_name.empty()) {
00777                     // value is empty (OK)
00778                     if (! isCookieAttribute(cookie_name, set_cookie_header))
00779                         dict.insert( std::make_pair(cookie_name, cookie_value) );
00780                     cookie_name.erase();
00781                 }
00782             } else if (*ptr != ' ') {   // ignore whitespace
00783                 // check if control character detected, or max sized exceeded
00784                 if (isControl(*ptr) || cookie_name.size() >= COOKIE_NAME_MAX)
00785                     return false;
00786                 // character is part of the name
00787                 cookie_name.push_back(*ptr);
00788             }
00789             break;
00790 
00791         case COOKIE_PARSE_VALUE:
00792             // parsing cookie value
00793             if (value_quote_character == '\0') {
00794                 // value is not (yet) quoted
00795                 if (*ptr == ';' || *ptr == ',') {
00796                     // end of value found (OK if empty)
00797                     if (! isCookieAttribute(cookie_name, set_cookie_header))
00798                         dict.insert( std::make_pair(cookie_name, cookie_value) );
00799                     cookie_name.erase();
00800                     cookie_value.erase();
00801                     parse_state = COOKIE_PARSE_NAME;
00802                 } else if (*ptr == '\'' || *ptr == '"') {
00803                     if (cookie_value.empty()) {
00804                         // begin quoted value
00805                         value_quote_character = *ptr;
00806                     } else if (cookie_value.size() >= COOKIE_VALUE_MAX) {
00807                         // max size exceeded
00808                         return false;
00809                     } else {
00810                         // assume character is part of the (unquoted) value
00811                         cookie_value.push_back(*ptr);
00812                     }
00813                 } else if (*ptr != ' ') {   // ignore unquoted whitespace
00814                     // check if control character detected, or max sized exceeded
00815                     if (isControl(*ptr) || cookie_value.size() >= COOKIE_VALUE_MAX)
00816                         return false;
00817                     // character is part of the (unquoted) value
00818                     cookie_value.push_back(*ptr);
00819                 }
00820             } else {
00821                 // value is quoted
00822                 if (*ptr == value_quote_character) {
00823                     // end of value found (OK if empty)
00824                     if (! isCookieAttribute(cookie_name, set_cookie_header))
00825                         dict.insert( std::make_pair(cookie_name, cookie_value) );
00826                     cookie_name.erase();
00827                     cookie_value.erase();
00828                     parse_state = COOKIE_PARSE_IGNORE;
00829                 } else if (cookie_value.size() >= COOKIE_VALUE_MAX) {
00830                     // max size exceeded
00831                     return false;
00832                 } else {
00833                     // character is part of the (quoted) value
00834                     cookie_value.push_back(*ptr);
00835                 }
00836             }
00837             break;
00838 
00839         case COOKIE_PARSE_IGNORE:
00840             // ignore everything until we reach a comma "," or semicolon ";"
00841             if (*ptr == ';' || *ptr == ',')
00842                 parse_state = COOKIE_PARSE_NAME;
00843             break;
00844         }
00845 
00846         ++ptr;
00847     }
00848 
00849     // handle last cookie in string
00850     if (! isCookieAttribute(cookie_name, set_cookie_header))
00851         dict.insert( std::make_pair(cookie_name, cookie_value) );
00852 
00853     return true;
00854 }
00855 
00856 boost::tribool HTTPParser::parseChunks(HTTPMessage::ChunkCache& chunk_cache)
00857 {
00858     //
00859     // note that boost::tribool may have one of THREE states:
00860     //
00861     // false: encountered an error while parsing message
00862     // true: finished successfully parsing the message
00863     // indeterminate: parsed bytes, but the message is not yet finished
00864     //
00865     const char *read_start_ptr = m_read_ptr;
00866     m_bytes_last_read = 0;
00867     while (m_read_ptr < m_read_end_ptr) {
00868 
00869         switch (m_chunked_content_parse_state) {
00870         case PARSE_CHUNK_SIZE_START:
00871             // we have not yet started parsing the next chunk size
00872             if (isHexDigit(*m_read_ptr)) {
00873                 m_chunk_size_str.erase();
00874                 m_chunk_size_str.push_back(*m_read_ptr);
00875                 m_chunked_content_parse_state = PARSE_CHUNK_SIZE;
00876             } else if (*m_read_ptr == ' ' || *m_read_ptr == '\x09' || *m_read_ptr == '\x0D' || *m_read_ptr == '\x0A') {
00877                 // Ignore leading whitespace.  Technically, the standard probably doesn't allow white space here, 
00878                 // but we'll be flexible, since there's no ambiguity.
00879                 break;
00880             } else {
00881                 return false;
00882             }
00883             break;
00884 
00885         case PARSE_CHUNK_SIZE:
00886             if (isHexDigit(*m_read_ptr)) {
00887                 m_chunk_size_str.push_back(*m_read_ptr);
00888             } else if (*m_read_ptr == '\x0D') {
00889                 m_chunked_content_parse_state = PARSE_EXPECTING_LF_AFTER_CHUNK_SIZE;
00890             } else if (*m_read_ptr == ' ' || *m_read_ptr == '\x09') {
00891                 // Ignore trailing tabs or spaces.  Technically, the standard probably doesn't allow this, 
00892                 // but we'll be flexible, since there's no ambiguity.
00893                 m_chunked_content_parse_state = PARSE_EXPECTING_CR_AFTER_CHUNK_SIZE;
00894             } else {
00895                 return false;
00896             }
00897             break;
00898 
00899         case PARSE_EXPECTING_CR_AFTER_CHUNK_SIZE:
00900             if (*m_read_ptr == '\x0D') {
00901                 m_chunked_content_parse_state = PARSE_EXPECTING_LF_AFTER_CHUNK_SIZE;
00902             } else if (*m_read_ptr == ' ' || *m_read_ptr == '\x09') {
00903                 // Ignore trailing tabs or spaces.  Technically, the standard probably doesn't allow this, 
00904                 // but we'll be flexible, since there's no ambiguity.
00905                 break;
00906             } else {
00907                 return false;
00908             }
00909             break;
00910 
00911         case PARSE_EXPECTING_LF_AFTER_CHUNK_SIZE:
00912             // We received a CR; expecting LF to follow.  We can't be flexible here because 
00913             // if we see anything other than LF, we can't be certain where the chunk starts.
00914             if (*m_read_ptr == '\x0A') {
00915                 m_bytes_read_in_current_chunk = 0;
00916                 m_size_of_current_chunk = strtol(m_chunk_size_str.c_str(), 0, 16);
00917                 if (m_size_of_current_chunk == 0) {
00918                     m_chunked_content_parse_state = PARSE_EXPECTING_FINAL_CR_AFTER_LAST_CHUNK;
00919                 } else {
00920                     m_chunked_content_parse_state = PARSE_CHUNK;
00921                 }
00922             } else {
00923                 return false;
00924             }
00925             break;
00926 
00927         case PARSE_CHUNK:
00928             if (m_bytes_read_in_current_chunk < m_size_of_current_chunk) {
00929                 if (chunk_cache.size() < m_max_content_length)
00930                     chunk_cache.push_back(*m_read_ptr);
00931                 m_bytes_read_in_current_chunk++;
00932             }
00933             if (m_bytes_read_in_current_chunk == m_size_of_current_chunk) {
00934                 m_chunked_content_parse_state = PARSE_EXPECTING_CR_AFTER_CHUNK;
00935             }
00936             break;
00937 
00938         case PARSE_EXPECTING_CR_AFTER_CHUNK:
00939             // we've read exactly m_size_of_current_chunk bytes since starting the current chunk
00940             if (*m_read_ptr == '\x0D') {
00941                 m_chunked_content_parse_state = PARSE_EXPECTING_LF_AFTER_CHUNK;
00942             } else {
00943                 return false;
00944             }
00945             break;
00946 
00947         case PARSE_EXPECTING_LF_AFTER_CHUNK:
00948             // we received a CR; expecting LF to follow
00949             if (*m_read_ptr == '\x0A') {
00950                 m_chunked_content_parse_state = PARSE_CHUNK_SIZE_START;
00951             } else {
00952                 return false;
00953             }
00954             break;
00955 
00956         case PARSE_EXPECTING_FINAL_CR_AFTER_LAST_CHUNK:
00957             // we've read the final chunk; expecting final CRLF
00958             if (*m_read_ptr == '\x0D') {
00959                 m_chunked_content_parse_state = PARSE_EXPECTING_FINAL_LF_AFTER_LAST_CHUNK;
00960             } else {
00961                 return false;
00962             }
00963             break;
00964 
00965         case PARSE_EXPECTING_FINAL_LF_AFTER_LAST_CHUNK:
00966             // we received the final CR; expecting LF to follow
00967             if (*m_read_ptr == '\x0A') {
00968                 ++m_read_ptr;
00969                 m_bytes_last_read = (m_read_ptr - read_start_ptr);
00970                 m_bytes_total_read += m_bytes_last_read;
00971                 PION_LOG_DEBUG(m_logger, "Parsed " << m_bytes_last_read << " chunked payload content bytes; chunked content complete.");
00972                 return true;
00973             } else {
00974                 return false;
00975             }
00976         }
00977 
00978         ++m_read_ptr;
00979     }
00980 
00981     m_bytes_last_read = (m_read_ptr - read_start_ptr);
00982     m_bytes_total_read += m_bytes_last_read;
00983     m_bytes_content_read += m_bytes_last_read;
00984     return boost::indeterminate;
00985 }
00986 
00987 boost::tribool HTTPParser::consumeContent(HTTPMessage& http_msg)
00988 {
00989     size_t content_bytes_to_read;
00990     size_t content_bytes_available = bytes_available();
00991     boost::tribool rc = boost::indeterminate;
00992 
00993     if (m_bytes_content_remaining == 0) {
00994         // we have all of the remaining payload content
00995         return true;
00996     } else {
00997         if (content_bytes_available >= m_bytes_content_remaining) {
00998             // we have all of the remaining payload content
00999             rc = true;
01000             content_bytes_to_read = m_bytes_content_remaining;
01001         } else {
01002             // only some of the payload content is available
01003             content_bytes_to_read = content_bytes_available;
01004         }
01005         m_bytes_content_remaining -= content_bytes_to_read;
01006     }
01007 
01008     // make sure content buffer is not already full
01009     if (m_bytes_content_read < m_max_content_length) {
01010         if (m_bytes_content_read + content_bytes_to_read > m_max_content_length) {
01011             // read would exceed maximum size for content buffer
01012             // copy only enough bytes to fill up the content buffer
01013             memcpy(http_msg.getContent() + m_bytes_content_read, m_read_ptr, 
01014                 m_max_content_length - m_bytes_content_read);
01015         } else {
01016             // copy all bytes available
01017             memcpy(http_msg.getContent() + m_bytes_content_read, m_read_ptr, content_bytes_to_read);
01018         }
01019     }
01020 
01021     m_read_ptr += content_bytes_to_read;
01022     m_bytes_content_read += content_bytes_to_read;
01023     m_bytes_total_read += content_bytes_to_read;
01024     m_bytes_last_read = content_bytes_to_read;
01025 
01026     return rc;
01027 }
01028 
01029 std::size_t HTTPParser::consumeContentAsNextChunk(HTTPMessage::ChunkCache& chunk_cache)
01030 {
01031     if (bytes_available() == 0) {
01032         m_bytes_last_read = 0;
01033     } else {
01034         m_bytes_last_read = (m_read_end_ptr - m_read_ptr);
01035         while (m_read_ptr < m_read_end_ptr) {
01036             if (chunk_cache.size() < m_max_content_length)
01037                 chunk_cache.push_back(*m_read_ptr);
01038             ++m_read_ptr;
01039         }
01040         m_bytes_total_read += m_bytes_last_read;
01041         m_bytes_content_read += m_bytes_last_read;
01042     }
01043     return m_bytes_last_read;
01044 }
01045 
01046 void HTTPParser::finish(HTTPMessage& http_msg) const
01047 {
01048     switch (m_message_parse_state) {
01049     case PARSE_START:
01050         http_msg.setIsValid(false);
01051         http_msg.setContentLength(0);
01052         http_msg.createContentBuffer();
01053         return;
01054     case PARSE_END:
01055         http_msg.setIsValid(true);
01056         break;
01057     case PARSE_HEADERS:
01058         http_msg.setIsValid(false);
01059         updateMessageWithHeaderData(http_msg);
01060         http_msg.setContentLength(0);
01061         http_msg.createContentBuffer();
01062         break;
01063     case PARSE_CONTENT:
01064         http_msg.setIsValid(false);
01065         http_msg.setContentLength(getContentBytesRead());
01066         break;
01067     case PARSE_CHUNKS:
01068         http_msg.setIsValid(false);
01069         http_msg.concatenateChunks();
01070         break;
01071     case PARSE_CONTENT_NO_LENGTH:
01072         http_msg.setIsValid(true);
01073         http_msg.concatenateChunks();
01074         break;
01075     }
01076 
01077     computeMsgStatus(http_msg, http_msg.isValid());
01078 
01079     if (isParsingRequest()) {
01080         // Parse query pairs from post content if content type is x-www-form-urlencoded.
01081         // Type could be followed by parameters (as defined in section 3.6 of RFC 2616)
01082         // e.g. Content-Type: application/x-www-form-urlencoded; charset=UTF-8
01083         HTTPRequest& http_request(dynamic_cast<HTTPRequest&>(http_msg));
01084         const std::string& content_type_header = http_request.getHeader(HTTPTypes::HEADER_CONTENT_TYPE);
01085         if (content_type_header.compare(0, HTTPTypes::CONTENT_TYPE_URLENCODED.length(),
01086                                         HTTPTypes::CONTENT_TYPE_URLENCODED) == 0)
01087         {
01088             if (! parseURLEncoded(http_request.getQueryParams(),
01089                                   http_request.getContent(),
01090                                   http_request.getContentLength())) 
01091                 PION_LOG_WARN(m_logger, "Request query string parsing failed (POST content): \""
01092                     << http_request.getContent() << "\"");
01093         }
01094     }
01095 }
01096 
01097 void HTTPParser::computeMsgStatus(HTTPMessage& http_msg, bool msg_parsed_ok )
01098 {
01099     HTTPMessage::DataStatus st = HTTPMessage::STATUS_NONE;
01100 
01101     if(http_msg.hasMissingPackets()) {
01102         st = http_msg.hasDataAfterMissingPackets() ?
01103                         HTTPMessage::STATUS_PARTIAL : HTTPMessage::STATUS_TRUNCATED;
01104         http_msg.setStatus(st);
01105     } else {
01106         st = msg_parsed_ok ? HTTPMessage::STATUS_OK : HTTPMessage::STATUS_TRUNCATED;
01107     }
01108 
01109     http_msg.setStatus(st);
01110 }
01111 }   // end namespace net
01112 }   // end namespace pion
01113 

Generated on Fri Apr 30 14:48:53 2010 for pion-net by  doxygen 1.4.7