=== modified file 'src/HttpMsg.cc'
--- src/HttpMsg.cc	2010-08-24 20:35:02 +0000
+++ src/HttpMsg.cc	2010-08-31 13:05:01 +0000
@@ -401,6 +401,10 @@
     hdr->req_start = hdr->req_end = -1;
     hdr->hdr_start = hdr->hdr_end = -1;
     debugs(74, 5, "httpParseInit: Request buffer is " << buf);
+    hdr->m_start = hdr->m_end = -1;
+    hdr->u_start = hdr->u_end = -1;
+    hdr->v_start = hdr->v_end = -1;
+    hdr->v_maj = hdr->v_min = 0;
 }
 
 #if MSGDODEBUG
@@ -447,188 +451,234 @@
 
 /**
  * Attempt to parse the request line.
- *
- * This will set the values in hmsg that it determines. One may end up
- * with a partially-parsed buffer; the return value tells you whether
- * the values are valid or not.
- *
- * \retval	1 if parsed correctly
- * \retval	0 if more is needed
- * \retval	-1 if error
- *
- * TODO:
- *   * have it indicate "error" and "not enough" as two separate conditions!
- *   * audit this code as off-by-one errors are probably everywhere!
+ * Governed by:
+ *  RFC 1945 section 5.1
+ *  RFC 2616 section 5.1
+ *
+ * Parsing state is stored between calls. However the current implementation
+ * begins parsing from scratch on every call.
+ * The return value tells you whether the parsing state fields are valid or not.
+ *
+ * \retval -1  an error occurred. request_parse_status indicates HTTP status result.
+ * \retval  1  successful parse
+ * \retval  0  more data is needed to complete the parse
  */
 int
 HttpParserParseReqLine(HttpParser *hmsg)
 {
-    int i = 0;
+    int i = 0, maj, min;
     int retcode = 0;
-    unsigned int maj = 0, min = 0;
-    int last_whitespace = -1, line_end = -1;
+    int second_word = -1;
+    int first_whitespace = -1, last_whitespace = -1, line_end = -1;
+    // NP: line_end tracks the last byte BEFORE the \r\n or \n
 
-    debugs(74, 5, "httpParserParseReqLine: parsing " << hmsg->buf);
+    debugs(74, 5, HERE << "parsing " << hmsg->buf);
 
     PROF_start(HttpParserParseReqLine);
-    /* Find \r\n - end of URL+Version (and the request) */
+
+    // Single-pass parse: (provided we have the whole line anyways)
+
+    hmsg->req_start = 0;
+#if 0 && USE_HTTP_VIOLATION
+    // NOTE: original parser version ignored prefix whitespace before the method.
+    //       for now this hack retains that behaviour. WHY do we need this !?
+    //
+    for(;hmsg->req_start < hmsg->bufsiz && xisspace(hmsg->buf[hmsg->req_start]); hmsg->req_start++);
+#endif
     hmsg->req_end = -1;
     for (i = 0; i < hmsg->bufsiz; i++) {
+        // track first and last whitespace (SP only)
+        if (hmsg->buf[i] == ' ') {
+            last_whitespace = i;
+            if (first_whitespace < hmsg->req_start)
+                first_whitespace = i;
+        }
+
+        // track next non-SP/non-HT byte after first_whitespace
+        if (second_word < first_whitespace && hmsg->buf[i] != ' ' && hmsg->buf[i] != '\t') {
+            second_word = i;
+        }
+
+        // locate line terminator
         if (hmsg->buf[i] == '\n') {
             hmsg->req_end = i;
+            line_end = i - 1;
             break;
         }
-        if (i < hmsg->bufsiz - 1 && hmsg->buf[i] == '\r' && hmsg->buf[i + 1] == '\n') {
-            hmsg->req_end = i + 1;
-            break;
+        if (i < hmsg->bufsiz - 1 && hmsg->buf[i] == '\r') {
+            if (hmsg->buf[i + 1] == '\n') {
+                hmsg->req_end = i + 1;
+                line_end = i - 1;
+                break;
+            } else {
+                // RFC 2616 section 5.1
+                // "No CR or LF is allowed except in the final CRLF sequence"
+                hmsg->request_parse_status = HTTP_BAD_REQUEST;
+                return -1;
+            }
         }
     }
     if (hmsg->req_end == -1) {
-        retcode = 0;
-        goto finish;
-    }
-    assert(hmsg->buf[hmsg->req_end] == '\n');
-    /* Start at the beginning again */
-    i = 0;
-
-    /* Find first non-whitespace - beginning of method */
-    for (; i < hmsg->req_end && (xisspace(hmsg->buf[i])); i++);
-    if (i >= hmsg->req_end) {
-        retcode = 0;
-        goto finish;
-    }
-    hmsg->m_start = i;
-    hmsg->req_start = i;
-
-    /* Find first whitespace - end of method */
-    for (; i < hmsg->req_end && (! xisspace(hmsg->buf[i])); i++);
-    if (i >= hmsg->req_end) {
-        retcode = 0;
-        goto finish;
-    }
-    hmsg->m_end = i - 1;
-
-    /* Find first non-whitespace - beginning of URL+Version */
-    for (; i < hmsg->req_end && (xisspace(hmsg->buf[i])); i++);
-    if (i >= hmsg->req_end) {
-        retcode = 0;
-        goto finish;
-    }
-    hmsg->u_start = i;
-
-    /* Find \r\n or \n - thats the end of the line. Keep track of the last whitespace! */
-    for (; i <= hmsg->req_end; i++) {
-        /* If \n - its end of line */
-        if (hmsg->buf[i] == '\n') {
-            line_end = i;
-            break;
-        }
-        /* XXX could be off-by-one wrong! */
-        if (hmsg->buf[i] == '\r' && (i + 1) <= hmsg->req_end && hmsg->buf[i+1] == '\n') {
-            line_end = i;
-            break;
-        }
-        /* If its a whitespace, note it as it'll delimit our version */
-        if (hmsg->buf[i] == ' ' || hmsg->buf[i] == '\t') {
-            last_whitespace = i;
-        }
-    }
-    if (i > hmsg->req_end) {
-        retcode = 0;
-        goto finish;
+        PROF_stop(HttpParserParseReqLine);
+        debugs(74, 5, "Parser: retval 0: from " << hmsg->req_start <<
+               "->" << hmsg->req_end << ": needs more data to complete first line.");
+        return 0; // This is the only spot in this function where (0) is valid result.
+    }
+
+    // NP: we have now seen EOL, more-data (0) cannot occur.
+    //     From here on any failure is -1, success is 1
+
+
+    // Input Validation:
+
+    // Process what we now know about the line structure into field offsets
+    // generating HTTP status for any aborts as we go.
+
+    // First non-whitespace = beginning of method
+    if (hmsg->req_start > line_end) {
+        hmsg->request_parse_status = HTTP_BAD_REQUEST;
+        retcode = -1;
+        goto finish;
+    }
+    hmsg->m_start = hmsg->req_start;
+
+    // First whitespace = end of method
+    if (first_whitespace > line_end || first_whitespace < hmsg->req_start) {
+        hmsg->request_parse_status = HTTP_BAD_REQUEST; // no method
+        retcode = -1;
+        goto finish;
+    }
+    hmsg->m_end = first_whitespace - 1;
+    if (hmsg->m_end < hmsg->m_start) {
+        hmsg->request_parse_status = HTTP_BAD_REQUEST; // missing URI?
+        retcode = -1;
+        goto finish;
+    }
+
+    // First non-whitespace after first SP = beginning of URL+Version
+    if (second_word > line_end || second_word < hmsg->req_start) {
+        hmsg->request_parse_status = HTTP_BAD_REQUEST; // missing URI
+        retcode = -1;
+        goto finish;
+    }
+    hmsg->u_start = second_word;
+
+    // RFC 1945: SP and version following URI are optional, marking version 0.9
+    // we identify this by the last whitespace being earlier than URI start
+    if (last_whitespace < second_word && last_whitespace >= hmsg->req_start) {
+        hmsg->v_maj = 0;
+        hmsg->v_min = 9;
+        hmsg->u_end = line_end;
+        hmsg->request_parse_status = HTTP_OK; // HTTP/0.9
+        retcode = 1;
+        goto finish;
+    } else {
+        // otherwise last whitespace is somewhere after end of URI.
+        hmsg->u_end = last_whitespace;
+        // crop any trailing whitespace in the area we think of as URI
+        for(;hmsg->u_end >= hmsg->u_start && xisspace(hmsg->buf[hmsg->u_end]); hmsg->u_end--);
+    }
+    if (hmsg->u_end < hmsg->u_start) {
+        hmsg->request_parse_status = HTTP_BAD_REQUEST; // missing URI
+        retcode = -1;
+        goto finish;
+    }
+
+    // Last whitespace SP = before start of protocol/version
+    if (last_whitespace >= line_end) {
+        hmsg->request_parse_status = HTTP_BAD_REQUEST; // missing version
+        retcode = -1;
+        goto finish;
+    }
+    hmsg->v_start = last_whitespace + 1;
+    hmsg->v_end = line_end;
+
+    // We only accept HTTP protocol requests right now.
+    // TODO: accept other protocols; RFC 2326 (RTSP protocol) etc
+    if ((hmsg->v_end - hmsg->v_start +1) < 5 || strncasecmp(&hmsg->buf[hmsg->v_start], "HTTP/", 5) != 0) {
+#if USE_HTTP_VIOLATIONS
+        // being lax; old parser accepted strange versions
+        hmsg->v_maj = 0;
+        hmsg->v_min = 9;
+        hmsg->u_end = line_end;
+        hmsg->request_parse_status = HTTP_OK; // treat as HTTP/0.9
+        retcode = 1;
+        goto finish;
+#else
+        hmsg->request_parse_status = HTTP_HTTP_VERSION_NOT_SUPPORTED; // protocol not supported / implemented.
+        retcode = -1;
+        goto finish;
+#endif
     }
 
     /* At this point we don't need the 'i' value; so we'll recycle it for version parsing */
 
-    /*
-     * At this point: line_end points to the first eol char (\r or \n);
-     * last_whitespace points to the last whitespace char in the URL.
-     * We know we have a full buffer here!
-     */
-    if (last_whitespace == -1) {
-        maj = 0;
-        min = 9;
-        hmsg->u_end = line_end - 1;
-        assert(hmsg->u_end >= hmsg->u_start);
-    } else {
-        /* Find the first non-whitespace after last_whitespace */
-        /* XXX why <= vs < ? I do need to really re-audit all of this ..*/
-        for (i = last_whitespace; i <= hmsg->req_end && xisspace(hmsg->buf[i]); i++);
-        if (i > hmsg->req_end) {
-            retcode = 0;
-            goto finish;
-        }
-
-        /* is it http/ ? if so, we try parsing. If not, the URL is the whole line; version is 0.9 */
-        if (i + 5 >= hmsg->req_end || (strncasecmp(&hmsg->buf[i], "HTTP/", 5) != 0)) {
-            maj = 0;
-            min = 9;
-            hmsg->u_end = line_end - 1;
-            assert(hmsg->u_end >= hmsg->u_start);
-        } else {
-            /* Ok, lets try parsing! Yes, this needs refactoring! */
-            hmsg->v_start = i;
-            i += 5;
-
-            /* next should be 1 or more digits */
-            maj = 0;
-            for (; i < hmsg->req_end && (isdigit(hmsg->buf[i])) && maj < 65536; i++) {
-                maj = maj * 10;
-                maj = maj + (hmsg->buf[i]) - '0';
-            }
-            if (maj >= 65536) {
-                retcode = -1;
-                goto finish;
-            }
-            if (i >= hmsg->req_end) {
-                retcode = 0;
-                goto finish;
-            }
-
-            /* next should be .; we -have- to have this as we have a whole line.. */
-            if (hmsg->buf[i] != '.') {
-                retcode = 0;
-                goto finish;
-            }
-            if (i + 1 >= hmsg->req_end) {
-                retcode = 0;
-                goto finish;
-            }
-
-            /* next should be one or more digits */
-            i++;
-            min = 0;
-            for (; i < hmsg->req_end && (isdigit(hmsg->buf[i])) && min < 65536; i++) {
-                min = min * 10;
-                min = min + (hmsg->buf[i]) - '0';
-            }
-
-            if (min >= 65536) {
-                retcode = -1;
-                goto finish;
-            }
-
-            /* Find whitespace, end of version */
-            hmsg->v_end = i;
-            hmsg->u_end = last_whitespace - 1;
-        }
-    }
+    i = hmsg->v_start + sizeof("HTTP/") -1;
+
+    /* next should be 1 or more digits */
+    if (!isdigit(hmsg->buf[i])) {
+        hmsg->request_parse_status = HTTP_HTTP_VERSION_NOT_SUPPORTED;
+        retcode = -1;
+        goto finish;
+    }
+    maj = 0;
+    for (; i <= line_end && (isdigit(hmsg->buf[i])) && maj < 65536; i++) {
+        maj = maj * 10;
+        maj = maj + (hmsg->buf[i]) - '0';
+    }
+    // catch too-big values or missing remainders
+    if (maj >= 65536 || i > line_end) {
+        hmsg->request_parse_status = HTTP_HTTP_VERSION_NOT_SUPPORTED;
+        retcode = -1;
+        goto finish;
+    }
+    hmsg->v_maj = maj;
+
+    /* next should be .; we -have- to have this as we have a whole line.. */
+    if (hmsg->buf[i] != '.') {
+        hmsg->request_parse_status = HTTP_HTTP_VERSION_NOT_SUPPORTED;
+        retcode = -1;
+        goto finish;
+    }
+    // catch missing minor part
+    if (++i > line_end) {
+        hmsg->request_parse_status = HTTP_HTTP_VERSION_NOT_SUPPORTED;
+        retcode = -1;
+        goto finish;
+    }
+
+    /* next should be one or more digits */
+    if (!isdigit(hmsg->buf[i])) {
+        hmsg->request_parse_status = HTTP_HTTP_VERSION_NOT_SUPPORTED;
+        retcode = -1;
+        goto finish;
+    }
+    min = 0;
+    for (; i <= line_end && (isdigit(hmsg->buf[i])) && min < 65536; i++) {
+        min = min * 10;
+        min = min + (hmsg->buf[i]) - '0';
+    }
+    // catch too-big values or trailing garbage
+    if (min >= 65536 || i < line_end) {
+        hmsg->request_parse_status = HTTP_HTTP_VERSION_NOT_SUPPORTED;
+        retcode = -1;
+        goto finish;
+    }
+    hmsg->v_min = min;
 
     /*
      * Rightio - we have all the schtuff. Return true; we've got enough.
      */
     retcode = 1;
+    hmsg->request_parse_status = HTTP_OK;
 
 finish:
-    hmsg->v_maj = maj;
-    hmsg->v_min = min;
     PROF_stop(HttpParserParseReqLine);
     debugs(74, 5, "Parser: retval " << retcode << ": from " << hmsg->req_start <<
            "->" << hmsg->req_end << ": method " << hmsg->m_start << "->" <<
            hmsg->m_end << "; url " << hmsg->u_start << "->" << hmsg->u_end <<
-           "; version " << hmsg->v_start << "->" << hmsg->v_end << " (" << maj <<
-           "/" << min << ")");
+           "; version " << hmsg->v_start << "->" << hmsg->v_end << " (" << hmsg->v_maj <<
+           "/" << hmsg->v_min << ")");
 
     return retcode;
 }
-


