dctcp-2.6.26-rev1.1.0.patch

 diff -Naur linux-2.6./include/linux/sysctl.h linux-2.6.-dctcp-rev1.1.0/include/linux/sysctl.h
--- linux-2.6./include/linux/sysctl.h -- ::29.000000000 -
+++ linux-2.6.-dctcp-rev1.1.0/include/linux/sysctl.h -- ::50.000000000 -
@@ -, +, @@
NET_TCP_ALLOWED_CONG_CONTROL=,
NET_TCP_MAX_SSTHRESH=,
NET_TCP_FRTO_RESPONSE=,
+ NET_TCP_DELAYED_ACK=,
+ NET_TCP_DCTCP_ENABLE=,
+ NET_TCP_DCTCP_SHIFT_G=,
}; enum {
diff -Naur linux-2.6./include/linux/tcp.h linux-2.6.-dctcp-rev1.1.0/include/linux/tcp.h
--- linux-2.6./include/linux/tcp.h -- ::29.000000000 -
+++ linux-2.6.-dctcp-rev1.1.0/include/linux/tcp.h -- ::45.000000000 -
@@ -, +, @@
/* TCP MD5 Signagure Option information */
struct tcp_md5sig_info *md5sig_info;
#endif
+
+/* DCTCP Specific Parameters */
+ u32 acked_bytes_ecn;
+ u32 acked_bytes_total;
+ u32 prior_rcv_nxt;
+ u32 dctcp_alpha;
+ u32 next_seq;
+ u32 ce_state; /* 0: last pkt was non-ce , 1: last pkt was ce */
+ u32 delayed_ack_reserved;
}; static inline struct tcp_sock *tcp_sk(const struct sock *sk)
diff -Naur linux-2.6./include/net/tcp.h linux-2.6.-dctcp-rev1.1.0/include/net/tcp.h
--- linux-2.6./include/net/tcp.h -- ::29.000000000 -
+++ linux-2.6.-dctcp-rev1.1.0/include/net/tcp.h -- ::50.000000000 -
@@ -, +, @@
extern int sysctl_tcp_fack;
extern int sysctl_tcp_reordering;
extern int sysctl_tcp_ecn;
+extern int sysctl_tcp_delayed_ack;
+extern int sysctl_tcp_dctcp_enable;
+extern int sysctl_tcp_dctcp_shift_g;
extern int sysctl_tcp_dsack;
extern int sysctl_tcp_mem[];
extern int sysctl_tcp_wmem[];
diff -Naur linux-2.6./kernel/sysctl_check.c linux-2.6.-dctcp-rev1.1.0/kernel/sysctl_check.c
--- linux-2.6./kernel/sysctl_check.c -- ::29.000000000 -
+++ linux-2.6.-dctcp-rev1.1.0/kernel/sysctl_check.c -- ::50.000000000 -
@@ -, +, @@
{ NET_TCP_FACK, "tcp_fack" },
{ NET_TCP_REORDERING, "tcp_reordering" },
{ NET_TCP_ECN, "tcp_ecn" },
+ { NET_TCP_DELAYED_ACK, "tcp_delayed_ack" },
+ { NET_TCP_DCTCP_ENABLE, "tcp_dctcp_enable" },
+ { NET_TCP_DCTCP_SHIFT_G, "tcp_dctcp_shift_g" },
{ NET_TCP_DSACK, "tcp_dsack" },
{ NET_TCP_MEM, "tcp_mem" },
{ NET_TCP_WMEM, "tcp_wmem" },
diff -Naur linux-2.6./net/ipv4/sysctl_net_ipv4.c linux-2.6.-dctcp-rev1.1.0/net/ipv4/sysctl_net_ipv4.c
--- linux-2.6./net/ipv4/sysctl_net_ipv4.c -- ::29.000000000 -
+++ linux-2.6.-dctcp-rev1.1.0/net/ipv4/sysctl_net_ipv4.c -- ::50.000000000 -
@@ -, +, @@
.proc_handler = &proc_dointvec
},
{
+ .ctl_name = NET_TCP_DELAYED_ACK,
+ .procname = "tcp_delayed_ack",
+ .data = &sysctl_tcp_delayed_ack,
+ .maxlen = sizeof(int),
+ .mode = ,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_DCTCP_ENABLE,
+ .procname = "tcp_dctcp_enable",
+ .data = &sysctl_tcp_dctcp_enable,
+ .maxlen = sizeof(int),
+ .mode = ,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_TCP_DCTCP_SHIFT_G,
+ .procname = "tcp_dctcp_shift_g",
+ .data = &sysctl_tcp_dctcp_shift_g,
+ .maxlen = sizeof(int),
+ .mode = ,
+ .proc_handler = &proc_dointvec
+ },
+ {
.ctl_name = NET_TCP_DSACK,
.procname = "tcp_dsack",
.data = &sysctl_tcp_dsack,
diff -Naur linux-2.6./net/ipv4/tcp_input.c linux-2.6.-dctcp-rev1.1.0/net/ipv4/tcp_input.c
--- linux-2.6./net/ipv4/tcp_input.c -- ::29.000000000 -
+++ linux-2.6.-dctcp-rev1.1.0/net/ipv4/tcp_input.c -- ::21.000000000 -
@@ -, +, @@
int sysctl_tcp_fack __read_mostly = ;
int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
int sysctl_tcp_ecn __read_mostly;
+int sysctl_tcp_delayed_ack __read_mostly = ;
+int sysctl_tcp_dctcp_enable __read_mostly;
+int sysctl_tcp_dctcp_shift_g __read_mostly = ; /* g=1/2^5 */
int sysctl_tcp_dsack __read_mostly = ;
int sysctl_tcp_app_win __read_mostly = ;
int sysctl_tcp_adv_win_scale __read_mostly = ;
@@ -, +, @@
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
} -static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
+static inline void TCP_ECN_dctcp_check_ce(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
{
if (tp->ecn_flags & TCP_ECN_OK) {
- if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
- tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
- /* Funny extension: if ECT is not set on a segment,
- * it is surely retransmit. It is not in ECN RFC,
- * but Linux follows this rule. */
- else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags)))
- tcp_enter_quickack_mode((struct sock *)tp);
+ u32 temp_rcv_nxt;
+
+ if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) {
+
+ /* rcv_nxt has already been updated earlier in processing (tcp_rcv_established) */
+
+ if(sysctl_tcp_dctcp_enable) {
+
+ /* state has changed from CE=0 to CE=1 and the delayed ACK has not been sent yet */
+ if(tp->ce_state == && tp->delayed_ack_reserved) {
+
+ /* save current rcv_nxt */
+ temp_rcv_nxt = tp->rcv_nxt;
+ /* generate previous ack with CE=0 */
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ tp->rcv_nxt = tp->prior_rcv_nxt;
+ tcp_send_ack(sk);
+ /* recover current rcv_nxt */
+ tp->rcv_nxt = temp_rcv_nxt;
+ }
+
+ tp->ce_state = ;
+ }
+
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+
+
+ /* Funny extension: if ECT is not set on a segment,
+ * it is surely retransmit. It is not in ECN RFC,
+ * but Linux follows this rule. */
+ } else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) {
+ tcp_enter_quickack_mode((struct sock *)tp);
+ }else {
+ /* It has ECT but it doesn't have CE */
+
+ if(sysctl_tcp_dctcp_enable) {
+
+ if(tp->ce_state != && tp->delayed_ack_reserved) {
+
+ /* save current rcv_nxt */
+ temp_rcv_nxt = tp->rcv_nxt;
+ /* generate previous ack with CE=1 */
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ tp->rcv_nxt = tp->prior_rcv_nxt;
+ tcp_send_ack(sk);
+ /* recover current rcv_nxt */
+ tp->rcv_nxt = temp_rcv_nxt;
+ }
+
+ tp->ce_state = ;
+
+ /* deassert only when DCTCP is enabled */
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ }
+
+ }
+
+ /* remember the current rcv_nxt in prior_rcv_nxt */
+ tp->prior_rcv_nxt = tp->rcv_nxt;
}
} @@ -, +, @@
*/
tcp_incr_quickack(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
+
+ tp->ce_state = ;
} else {
int m = now - icsk->icsk_ack.lrcvtime; @@ -, +, @@
}
icsk->icsk_ack.lrcvtime = now; - TCP_ECN_check_ce(tp, skb);
+ TCP_ECN_dctcp_check_ce(sk, tp, skb); if (skb->len >= )
tcp_grow_window(sk, skb);
@@ -, +, @@
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk); + __u32 ssthresh_old;
+ __u32 cwnd_old;
+ __u32 cwnd_new;
+
tp->prior_ssthresh = ;
tp->bytes_acked = ;
if (icsk->icsk_ca_state < TCP_CA_CWR) {
tp->undo_marker = ;
- if (set_ssthresh)
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- tp->snd_cwnd = min(tp->snd_cwnd,
- tcp_packets_in_flight(tp) + 1U);
+
+ if(!sysctl_tcp_dctcp_enable) {
+
+ if (set_ssthresh)
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+
+ tp->snd_cwnd = min(tp->snd_cwnd,
+ tcp_packets_in_flight(tp) + 1U);
+
+ }else {
+
+ cwnd_new = max (tp->snd_cwnd - ((tp->snd_cwnd * tp->dctcp_alpha)>>) , 2U);
+
+ if(set_ssthresh) {
+
+ ssthresh_old = tp->snd_ssthresh;
+ tp->snd_ssthresh = cwnd_new;
+
+ /* printk("%llu alpha= %d ssth old= %d new= %d\n", */
+ /* ktime_to_us(ktime_get_real()), */
+ /* tp->dctcp_alpha, */
+ /* ssthresh_old, */
+ /* tp->snd_ssthresh); */
+ }
+
+ cwnd_old = tp->snd_cwnd;
+ tp->snd_cwnd = cwnd_new;
+
+ /* printk("%llu alpha= %d cwnd old= %d new= %d\n", */
+ /* ktime_to_us(ktime_get_real()), */
+ /* tp->dctcp_alpha, */
+ /* cwnd_old, */
+ /* tp->snd_cwnd); */
+ }
+
tp->snd_cwnd_cnt = ;
tp->high_seq = tp->snd_nxt;
tp->snd_cwnd_stamp = tcp_time_stamp;
TCP_ECN_queue_cwr(tp);
-
+
tcp_set_ca_state(sk, TCP_CA_CWR);
}
}
@@ -, +, @@
tcp_try_keep_open(sk);
tcp_moderate_cwnd(tp);
} else {
- tcp_cwnd_down(sk, flag);
+ if(!sysctl_tcp_dctcp_enable)
+ tcp_cwnd_down(sk, flag);
}
} @@ -, +, @@
int prior_packets;
int frto_cwnd = ; + __u32 alpha_old;
+ __u32 acked_bytes;
+
/* If the ack is newer than sent or older than previous acks
* then we can probably ignore it.
*/
@@ -, +, @@
tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
} +
+ /* START: DCTCP Processing */
+
+ /* calc acked bytes */
+ if(after(ack,prior_snd_una)) {
+ acked_bytes = ack - prior_snd_una;
+ } else {
+ acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
+ }
+
+ if(flag & FLAG_ECE)
+ tp->acked_bytes_ecn += acked_bytes;
+
+ tp->acked_bytes_total += acked_bytes;
+
+ /* One observation window (about one RTT) has elapsed: snd_una has reached next_seq */
+ if (!before(tp->snd_una,tp->next_seq)) {
+
+ /* Guard against a zero denominator in the division below */
+ if(tp->acked_bytes_total == ) tp->acked_bytes_total = ;
+
+ alpha_old = tp->dctcp_alpha;
+
+ /* alpha = (1-g) * alpha + g * F */
+ tp->dctcp_alpha = alpha_old - (alpha_old >> sysctl_tcp_dctcp_shift_g)
+ + (tp->acked_bytes_ecn << ( - sysctl_tcp_dctcp_shift_g)) / tp->acked_bytes_total;
+
+ if(tp->dctcp_alpha > ) tp->dctcp_alpha = ; /* round to 0-1024 */
+
+ /* printk("bytes_ecn= %d total= %d alpha: old= %d new= %d\n", */
+ /* tp->acked_bytes_ecn, tp->acked_bytes_total, alpha_old, tp->dctcp_alpha); */
+
+ tp->acked_bytes_ecn = ;
+ tp->acked_bytes_total = ;
+ tp->next_seq = tp->snd_nxt;
+ }
+
+ /* END: DCTCP Processing */
+
/* We passed data and got it acked, remove any soft error
* log. Something worked...
*/
@@ -, +, @@
goto queue_and_out;
} - TCP_ECN_check_ce(tp, skb);
+ TCP_ECN_dctcp_check_ce(sk, tp, skb); if (tcp_try_rmem_schedule(sk, skb->truesize))
goto drop;
@@ -, +, @@
&& __tcp_select_window(sk) >= tp->rcv_wnd) ||
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
+ /* Delayed ACK is disabled or ... */
+ sysctl_tcp_delayed_ack == ||
/* We have out of order data. */
(ofo_possible && skb_peek(&tp->out_of_order_queue))) {
/* Then ack it now */
@@ -, +, @@
} EXPORT_SYMBOL(sysctl_tcp_ecn);
+EXPORT_SYMBOL(sysctl_tcp_delayed_ack);
+EXPORT_SYMBOL(sysctl_tcp_dctcp_enable);
+EXPORT_SYMBOL(sysctl_tcp_dctcp_shift_g);
EXPORT_SYMBOL(sysctl_tcp_reordering);
EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
EXPORT_SYMBOL(tcp_parse_options);
diff -Naur linux-2.6./net/ipv4/tcp_minisocks.c linux-2.6.-dctcp-rev1.1.0/net/ipv4/tcp_minisocks.c
--- linux-2.6./net/ipv4/tcp_minisocks.c -- ::29.000000000 -
+++ linux-2.6.-dctcp-rev1.1.0/net/ipv4/tcp_minisocks.c -- ::45.000000000 -
@@ -, +, @@
newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + ;
newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + ; + /* Initialize DCTCP internal parameters */
+ newtp->next_seq = newtp->snd_nxt;
+ newtp->acked_bytes_ecn = ;
+ newtp->acked_bytes_total = ;
+
tcp_prequeue_init(newtp); tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
diff -Naur linux-2.6./net/ipv4/tcp_output.c linux-2.6.-dctcp-rev1.1.0/net/ipv4/tcp_output.c
--- linux-2.6./net/ipv4/tcp_output.c -- ::29.000000000 -
+++ linux-2.6.-dctcp-rev1.1.0/net/ipv4/tcp_output.c -- ::50.000000000 -
@@ -, +, @@
struct tcp_sock *tp = tcp_sk(sk); tp->ecn_flags = ;
- if (sysctl_tcp_ecn) {
+ if (sysctl_tcp_ecn || sysctl_tcp_dctcp_enable) {
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR;
tp->ecn_flags = TCP_ECN_OK;
}
@@ -, +, @@
TCP_ECN_send(sk, skb, tcp_header_size);
} + /* In DCTCP, Assert ECT bit to all packets*/
+ if(sysctl_tcp_dctcp_enable)
+ INET_ECN_xmit(sk);
+
#ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
@@ -, +, @@
tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN);
TCP_ECN_send_syn(sk, buff); + /* Initialize DCTCP internal parameters */
+ tp->next_seq = tp->snd_nxt;
+ tp->acked_bytes_ecn = ;
+ tp->acked_bytes_total = ;
+
/* Send it off. */
TCP_SKB_CB(buff)->when = tcp_time_stamp;
tp->retrans_stamp = TCP_SKB_CB(buff)->when;
@@ -, +, @@
int ato = icsk->icsk_ack.ato;
unsigned long timeout; + /* Delayed ACK reserved flag for DCTCP */
+ struct tcp_sock *tp = tcp_sk(sk);
+ tp->delayed_ack_reserved = ;
+
if (ato > TCP_DELACK_MIN) {
const struct tcp_sock *tp = tcp_sk(sk);
int max_ato = HZ / ;
@@ -, +, @@
{
struct sk_buff *buff; + /* Delayed ACK reserved flag for DCTCP */
+ struct tcp_sock *tp = tcp_sk(sk);
+ tp->delayed_ack_reserved = ;
+
/* If we have been reset, we may not send again. */
if (sk->sk_state == TCP_CLOSE)
return;

https://github.com/myasuda/DCTCP-Linux/blob/master/dctcp-2.6.26-rev1.1.0.patch

05-11 16:02