u16 suboptions;
u64 sndr_key;
u64 rcvr_key;
+ struct mptcp_ext ext_copy;
#endif
};
}
}
-bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
- unsigned int *size, unsigned int remaining,
- struct mptcp_out_options *opts)
+static bool mptcp_established_options_mp(struct sock *sk, unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- if (subflow->mp_capable && !subflow->fourth_ack) {
+ if (!subflow->fourth_ack) {
opts->suboptions = OPTION_MPTCP_MPC_ACK;
opts->sndr_key = subflow->local_key;
opts->rcvr_key = subflow->remote_key;
		*size = TCPOLEN_MPTCP_MPC_ACK;
		return true;
	}

	return false;
}
+static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
+ struct mptcp_ext *ext)
+{
+ ext->data_fin = 1;
+
+ if (!ext->use_map) {
+ /* RFC6824 requires a DSS mapping with specific values
+ * if DATA_FIN is set but no data payload is mapped
+ */
+ ext->use_map = 1;
+ ext->dsn64 = 1;
+ ext->data_seq = mptcp_sk(subflow->conn)->write_seq;
+ ext->subflow_seq = 0;
+ ext->data_len = 1;
+ } else {
+ /* If there's an existing DSS mapping, DATA_FIN consumes
+ * 1 additional byte of mapping space.
+ */
+ ext->data_len++;
+ }
+}
+
+static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ unsigned int dss_size = 0;
+ struct mptcp_ext *mpext;
+ struct mptcp_sock *msk;
+ unsigned int ack_size;
+ u8 tcp_fin;
+
+ if (skb) {
+ mpext = mptcp_get_ext(skb);
+ tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+ } else {
+ mpext = NULL;
+ tcp_fin = 0;
+ }
+
+ if (!skb || (mpext && mpext->use_map) || tcp_fin) {
+ unsigned int map_size;
+
+ map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
+
+ remaining -= map_size;
+ dss_size = map_size;
+ if (mpext)
+ opts->ext_copy = *mpext;
+
+ if (skb && tcp_fin &&
+ subflow->conn->sk_state != TCP_ESTABLISHED)
+ mptcp_write_data_fin(subflow, &opts->ext_copy);
+ }
+
+ ack_size = TCPOLEN_MPTCP_DSS_ACK64;
+
+ /* Add kind/length/subtype/flag overhead if mapping is not populated */
+ if (dss_size == 0)
+ ack_size += TCPOLEN_MPTCP_DSS_BASE;
+
+ dss_size += ack_size;
+
+	msk = mptcp_sk(subflow->conn);
+	if (msk) {
+		opts->ext_copy.data_ack = msk->ack_seq;
+	} else {
+		/* the msk is not reachable yet; ack the peer's initial
+		 * data sequence number, recomputed from its key
+		 */
+		mptcp_crypto_key_sha(subflow->remote_key, NULL,
+				     &opts->ext_copy.data_ack);
+		opts->ext_copy.data_ack++;
+	}
+
+ opts->ext_copy.ack64 = 1;
+ opts->ext_copy.use_ack = 1;
+
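+	/* Illustrative sizing, from the TCPOLEN_MPTCP_DSS_* constants: a
+	 * 64-bit mapping plus a 64-bit ack is 4 (BASE) + 14 (MAP64) +
+	 * 8 (ACK64) = 26 bytes, padded to 28 here; an ack-only DSS is
+	 * 4 + 8 = 12 bytes and already aligned.
+	 */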
+ *size = ALIGN(dss_size, 4);
+ return true;
+}
+
+bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
+ unsigned int *size, unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ unsigned int opt_size = 0;
+ bool ret = false;
+
+ if (mptcp_established_options_mp(sk, &opt_size, remaining, opts))
+ ret = true;
+ else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
+ opts))
+ ret = true;
+
+ /* we reserved enough space for the above options, and exceeding the
+ * TCP option space would be fatal
+ */
+ if (WARN_ON_ONCE(opt_size > remaining))
+ return false;
+
+ *size += opt_size;
+ remaining -= opt_size;
+
+ return ret;
+}
+
bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
struct mptcp_out_options *opts)
{
ptr += 2;
}
}
+
+ if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
+ struct mptcp_ext *mpext = &opts->ext_copy;
+ u8 len = TCPOLEN_MPTCP_DSS_BASE;
+ u8 flags = 0;
+
+ if (mpext->use_ack) {
+ len += TCPOLEN_MPTCP_DSS_ACK64;
+ flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
+ }
+
+ if (mpext->use_map) {
+ len += TCPOLEN_MPTCP_DSS_MAP64;
+
+			/* Use only 64-bit mapping flags for now; add
+			 * support for optional 32-bit mappings later.
+			 */
+ flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
+ if (mpext->data_fin)
+ flags |= MPTCP_DSS_DATA_FIN;
+ }
+
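+		/* e.g. a DSS carrying both a 64-bit map and a 64-bit ack
+		 * has len 26 and flags 0x0f, so this word is 0x1e1a200f:
+		 * kind 30 (TCPOPT_MPTCP), length 26, subtype DSS (2),
+		 * flags in the low byte
+		 */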
+ *ptr++ = htonl((TCPOPT_MPTCP << 24) |
+ (len << 16) |
+ (MPTCPOPT_DSS << 12) |
+ (flags));
+
+ if (mpext->use_ack) {
+ put_unaligned_be64(mpext->data_ack, ptr);
+ ptr += 2;
+ }
+
+ if (mpext->use_map) {
+ put_unaligned_be64(mpext->data_seq, ptr);
+ ptr += 2;
+ put_unaligned_be32(mpext->subflow_seq, ptr);
+ ptr += 1;
+ put_unaligned_be32(mpext->data_len << 16 |
+ TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
+ }
+ }
}
return NULL;
}
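+/* cached_ext is preallocated before any data is pushed to the subflow,
+ * so attaching the extension in mptcp_sendmsg_frag() after the payload
+ * has been queued cannot fail
+ */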
+static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
+{
+ if (!msk->cached_ext)
+ msk->cached_ext = __skb_ext_alloc();
+
+ return !!msk->cached_ext;
+}
+
+static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
+ struct msghdr *msg, long *timeo)
+{
+ int mss_now = 0, size_goal = 0, ret = 0;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_ext *mpext = NULL;
+ struct page_frag *pfrag;
+ struct sk_buff *skb;
+ size_t psize;
+
+ /* use the mptcp page cache so that we can easily move the data
+ * from one substream to another, but do per subflow memory accounting
+ */
+ pfrag = sk_page_frag(sk);
+ while (!sk_page_frag_refill(ssk, pfrag) ||
+ !mptcp_ext_cache_refill(msk)) {
+ ret = sk_stream_wait_memory(ssk, timeo);
+ if (ret)
+ return ret;
+ }
+
+ /* compute copy limit */
+ mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
+ psize = min_t(int, pfrag->size - pfrag->offset, size_goal);
+
+ pr_debug("left=%zu", msg_data_left(msg));
+ psize = copy_page_from_iter(pfrag->page, pfrag->offset,
+ min_t(size_t, msg_data_left(msg), psize),
+ &msg->msg_iter);
+ pr_debug("left=%zu", msg_data_left(msg));
+ if (!psize)
+ return -EINVAL;
+
+ /* Mark the end of the previous write so the beginning of the
+ * next write (with its own mptcp skb extension data) is not
+ * collapsed.
+ */
+ skb = tcp_write_queue_tail(ssk);
+ if (skb)
+ TCP_SKB_CB(skb)->eor = 1;
+
+ ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
+ msg->msg_flags | MSG_SENDPAGE_NOTLAST);
+ if (ret <= 0)
+ return ret;
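+	/* on a partial write, rewind the iterator over the bytes copied
+	 * into the page frag but not accepted by the subflow; they are
+	 * re-copied on the next pass
+	 */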
+ if (unlikely(ret < psize))
+ iov_iter_revert(&msg->msg_iter, psize - ret);
+
+ skb = tcp_write_queue_tail(ssk);
+ mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
+ msk->cached_ext = NULL;
+
+ memset(mpext, 0, sizeof(*mpext));
+ mpext->data_seq = msk->write_seq;
+ mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
+ mpext->data_len = ret;
+ mpext->use_map = 1;
+ mpext->dsn64 = 1;
+
+ pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
+ mpext->data_seq, mpext->subflow_seq, mpext->data_len,
+ mpext->dsn64);
+
+ pfrag->offset += ret;
+ msk->write_seq += ret;
+ mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
+
+ tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal);
+ return ret;
+}
+
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct socket *ssock;
+ size_t copied = 0;
struct sock *ssk;
- int ret;
+ int ret = 0;
+ long timeo;
if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
return -EOPNOTSUPP;
return ret;
}
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
ssk = mptcp_subflow_get(msk);
if (!ssk) {
release_sock(sk);
return -ENOTCONN;
}
- ret = sock_sendmsg(ssk->sk_socket, msg);
+ pr_debug("conn_list->subflow=%p", ssk);
+ lock_sock(ssk);
+ while (msg_data_left(msg)) {
+ ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo);
+ if (ret < 0)
+ break;
+
+ copied += ret;
+ }
+
+ if (copied > 0)
+ ret = copied;
+
+ release_sock(ssk);
release_sock(sk);
return ret;
}
__mptcp_close_ssk(sk, ssk, subflow, timeout);
}
+ if (msk->cached_ext)
+ __skb_ext_put(msk->cached_ext);
release_sock(sk);
sk_common_release(sk);
}
struct mptcp_subflow_context *subflow;
struct sock *new_mptcp_sock;
struct sock *ssk = newsk;
+ u64 ack_seq;
subflow = mptcp_subflow_ctx(newsk);
lock_sock(sk);
msk->subflow = NULL;
mptcp_token_update_accept(newsk, new_mptcp_sock);
+
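+	/* the peer's initial data sequence number is derived from its key;
+	 * mptcp_crypto_key_sha() writes it into ack_seq, and the first
+	 * data octet at the MPTCP level is idsn + 1
+	 */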
+ mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
+ msk->write_seq = subflow->idsn + 1;
+ ack_seq++;
+ msk->ack_seq = ack_seq;
+ subflow->rel_write_seq = 1;
newsk = new_mptcp_sock;
mptcp_copy_inaddrs(newsk, ssk);
list_add(&subflow->node, &msk->conn_list);
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk;
struct sock *sk;
+ u64 ack_seq;
subflow = mptcp_subflow_ctx(ssk);
sk = subflow->conn;
msk = mptcp_sk(sk);
+ mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
+ ack_seq++;
+ subflow->rel_write_seq = 1;
+
	/* the socket is not connected yet, so no msk/subflow ops can
	 * access or race with the writes below
	 */
WRITE_ONCE(msk->remote_key, subflow->remote_key);
WRITE_ONCE(msk->local_key, subflow->local_key);
WRITE_ONCE(msk->token, subflow->token);
+ WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
+ WRITE_ONCE(msk->ack_seq, ack_seq);
}
static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
#define TCPOLEN_MPTCP_MPC_SYN 12
#define TCPOLEN_MPTCP_MPC_SYNACK 12
#define TCPOLEN_MPTCP_MPC_ACK 20
+#define TCPOLEN_MPTCP_DSS_BASE 4
+#define TCPOLEN_MPTCP_DSS_ACK64 8
+#define TCPOLEN_MPTCP_DSS_MAP64 14
+#define TCPOLEN_MPTCP_DSS_CHECKSUM 2
/* MPTCP MP_CAPABLE flags */
#define MPTCP_VERSION_MASK (0x0F)
#define MPTCP_CAP_HMAC_SHA1 BIT(0)
#define MPTCP_CAP_FLAG_MASK (0x3F)
+/* MPTCP DSS flags */
+#define MPTCP_DSS_DATA_FIN BIT(4)
+#define MPTCP_DSS_DSN64 BIT(3)
+#define MPTCP_DSS_HAS_MAP BIT(2)
+#define MPTCP_DSS_ACK64 BIT(1)
+#define MPTCP_DSS_HAS_ACK BIT(0)
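+/* these flags occupy the low bits of the fourth DSS option octet
+ * (RFC 6824): A = DATA_ACK present, a = 64-bit DATA_ACK, M = mapping
+ * present, m = 64-bit DSN, F = DATA_FIN
+ */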
+
/* MPTCP connection sock */
struct mptcp_sock {
/* inet_connection_sock must be the first member */
struct inet_connection_sock sk;
u64 local_key;
u64 remote_key;
+ u64 write_seq;
+ u64 ack_seq;
u32 token;
struct list_head conn_list;
+ struct skb_ext *cached_ext; /* for the next sendmsg */
struct socket *subflow; /* outgoing connect/listener/!mp_capable */
};
u64 remote_key;
u64 idsn;
u32 token;
+ u32 rel_write_seq;
u32 request_mptcp : 1, /* send MP_CAPABLE */
mp_capable : 1, /* remote is MPTCP capable */
fourth_ack : 1, /* send initial DSS */
void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
u32 *hash_out);
+static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb)
+{
+ return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP);
+}
+
#endif /* __MPTCP_PROTOCOL_H */