Truncate too-long HTTP response bodies to match their Content-Length header.

Sometimes a broken server sends more than Content-Length bytes in the
response.  For example, a 302 redirect message with "Content-Length: 0" header
may include an HTML body. Squid used to send "everything" it read to the
client, even if it read more than the Content-Length bytes. That may have
helped in some cases, but I think we should be more conservative when dealing
with broken servers to combat message smuggling attacks and other bad
side-effects for clients. 

We now do not forward more than the advertised content length and declare the
connection with a broken server non-persistent.

Chunked responses (that Squid should not receive and that must not have a
Content-Length header) are not truncated because RFC 2616 says we MUST ignore
their Content-Length header.

TODO: simply truncating read content would not work for pipelined responses.
We should preserve extra content for the next transaction on a pconn.

=== modified file 'src/MemBuf.cc'
--- src/MemBuf.cc	2006-09-20 14:13:38 +0000
+++ src/MemBuf.cc	2009-05-26 16:20:20 +0000
@@ -220,6 +220,15 @@
     PROF_stop(MemBuf_consume);
 }
 
+// removes last tailSize bytes
+void MemBuf::truncate(mb_size_t tailSize)
+{
+    const mb_size_t cSize = contentSize();
+    assert(0 <= tailSize && tailSize <= cSize);
+    assert(!stolen); /* not frozen */
+    size -= tailSize;
+}
+
 // calls memcpy, appends exactly size bytes, extends buffer if needed
 void MemBuf::append(const char *newContent, mb_size_t sz)
 {

=== modified file 'src/MemBuf.h'
--- src/MemBuf.h	2008-04-06 00:35:11 +0000
+++ src/MemBuf.h	2009-05-26 15:45:58 +0000
@@ -73,6 +73,7 @@
     void consume(mb_size_t sz);  // removes sz bytes, moving content left
     void append(const char *c, mb_size_t sz); // grows if needed and possible
     void appended(mb_size_t sz); // updates content size after external append
+    void truncate(mb_size_t sz);  // removes sz last bytes
 
     // XXX: convert global memBuf*() functions into methods
 

=== modified file 'src/http.cc'
--- src/http.cc	2009-05-01 15:50:41 +0000
+++ src/http.cc	2009-06-10 04:56:17 +0000
@@ -68,7 +68,7 @@
         HttpHeader * hdr_out, int we_do_ranges, http_state_flags);
 
 HttpStateData::HttpStateData(FwdState *theFwdState) : ServerStateData(theFwdState),
-        header_bytes_read(0), reply_bytes_read(0)
+        header_bytes_read(0), reply_bytes_read(0), body_bytes_truncated(0)
 {
     debugs(11,5,HERE << "HttpStateData " << this << " created");
     ignoreCacheControl = false;
@@ -923,6 +923,9 @@
 
         if (body_bytes_read < vrep->content_length)
             return INCOMPLETE_MSG;
+
+        if (body_bytes_truncated > 0) // already read more than needed
+            return COMPLETE_NONPERSISTENT_MSG; // disable pconns
     }
 
     /* If there is no message body or we got it all, we can be persistent */
@@ -1099,6 +1102,33 @@
     return false; // quit on error
 }
 
+/** truncate what we read if we read too much so that writeReplyBody()
+    writes no more than what we should have read */
+void
+HttpStateData::truncateVirginBody()
+{
+    assert(flags.headers_parsed);
+
+    HttpReply *vrep = virginReply();
+    int64_t clen = -1;
+    if (!vrep->expectingBody(request->method, clen) || clen < 0)
+        return; // no body or a body of unknown size, including chunked
+
+    const int64_t body_bytes_read = reply_bytes_read - header_bytes_read;
+    if (body_bytes_read - body_bytes_truncated <= clen) 
+        return; // we did not read too much or already took care of the extras
+
+    if (const int64_t extras = body_bytes_read - body_bytes_truncated - clen) {
+        // server sent more that the advertised content length
+        debugs(11,5, HERE << "body_bytes_read=" << body_bytes_read << 
+            " clen=" << clen << '/' << vrep->content_length <<
+            " body_bytes_truncated=" << body_bytes_truncated << '+' << extras);
+
+        readBuf->truncate(extras);
+        body_bytes_truncated += extras;
+    }
+}
+
 /*
  * Call this when there is data from the origin server
  * which should be sent to either StoreEntry, or to ICAP...
@@ -1106,6 +1136,7 @@
 void
 HttpStateData::writeReplyBody()
 {
+    truncateVirginBody(); // if needed
     const char *data = readBuf->content();
     int len = readBuf->contentSize();
 

=== modified file 'src/http.h'
--- src/http.h	2007-08-10 05:30:52 +0000
+++ src/http.h	2009-05-26 16:02:52 +0000
@@ -71,6 +71,7 @@
     size_t read_sz;
     int header_bytes_read;	// to find end of response,
     int reply_bytes_read;	// without relying on StoreEntry
+    int body_bytes_truncated; // positive when we read more than we wanted
     MemBuf *readBuf;
     bool ignoreCacheControl;
     bool surrogateNoStore;
@@ -92,6 +93,7 @@
     void checkDateSkew(HttpReply *);
 
     bool continueAfterParsingHeader();
+    void truncateVirginBody();
 
     virtual void haveParsedReplyHeaders();
     virtual void closeServer(); // end communication with the server


