aboutsummaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/802/fc.c2
-rw-r--r--net/802/fddi.c4
-rw-r--r--net/802/hippi.c6
-rw-r--r--net/802/p8022.c5
-rw-r--r--net/802/p8023.c1
-rw-r--r--net/802/psnap.c9
-rw-r--r--net/802/sysctl_net_802.c3
-rw-r--r--net/802/tr.c29
-rw-r--r--net/8021q/vlan.h2
-rw-r--r--net/8021q/vlan_dev.c4
-rw-r--r--net/Kconfig3
-rw-r--r--net/Makefile3
-rw-r--r--net/appletalk/aarp.c2
-rw-r--r--net/appletalk/ddp.c39
-rw-r--r--net/atm/addr.c55
-rw-r--r--net/atm/addr.h12
-rw-r--r--net/atm/atm_misc.c2
-rw-r--r--net/atm/br2684.c2
-rw-r--r--net/atm/clip.c2
-rw-r--r--net/atm/common.c6
-rw-r--r--net/atm/ioctl.c35
-rw-r--r--net/atm/ipcommon.c3
-rw-r--r--net/atm/lec.c43
-rw-r--r--net/atm/mpc.c2
-rw-r--r--net/atm/resources.c20
-rw-r--r--net/atm/signaling.c8
-rw-r--r--net/atm/svc.c1
-rw-r--r--net/ax25/af_ax25.c82
-rw-r--r--net/ax25/ax25_addr.c30
-rw-r--r--net/ax25/ax25_ds_in.c3
-rw-r--r--net/ax25/ax25_ds_timer.c2
-rw-r--r--net/ax25/ax25_in.c19
-rw-r--r--net/ax25/ax25_ip.c6
-rw-r--r--net/ax25/ax25_route.c19
-rw-r--r--net/ax25/ax25_std_in.c3
-rw-r--r--net/ax25/ax25_std_timer.c2
-rw-r--r--net/ax25/ax25_subr.c4
-rw-r--r--net/ax25/ax25_uid.c87
-rw-r--r--net/bluetooth/af_bluetooth.c6
-rw-r--r--net/bluetooth/hci_core.c24
-rw-r--r--net/bluetooth/hci_event.c106
-rw-r--r--net/bluetooth/hci_sock.c38
-rw-r--r--net/bluetooth/hidp/Kconfig2
-rw-r--r--net/bluetooth/hidp/core.c13
-rw-r--r--net/bluetooth/l2cap.c2
-rw-r--r--net/bluetooth/lib.c25
-rw-r--r--net/bluetooth/rfcomm/Makefile2
-rw-r--r--net/bluetooth/rfcomm/core.c120
-rw-r--r--net/bluetooth/rfcomm/crc.c71
-rw-r--r--net/bluetooth/rfcomm/sock.c32
-rw-r--r--net/bluetooth/rfcomm/tty.c206
-rw-r--r--net/bluetooth/sco.c2
-rw-r--r--net/bridge/br_fdb.c14
-rw-r--r--net/bridge/br_forward.c3
-rw-r--r--net/bridge/br_if.c2
-rw-r--r--net/bridge/br_input.c2
-rw-r--r--net/bridge/br_netfilter.c8
-rw-r--r--net/bridge/br_stp_if.c9
-rw-r--r--net/bridge/netfilter/ebt_mark.c5
-rw-r--r--net/bridge/netfilter/ebt_ulog.c9
-rw-r--r--net/bridge/netfilter/ebtables.c27
-rw-r--r--net/compat.c53
-rw-r--r--net/core/Makefile1
-rw-r--r--net/core/datagram.c87
-rw-r--r--net/core/dev.c108
-rw-r--r--net/core/dst.c3
-rw-r--r--net/core/ethtool.c102
-rw-r--r--net/core/filter.c6
-rw-r--r--net/core/flow.c2
-rw-r--r--net/core/neighbour.c91
-rw-r--r--net/core/netfilter.c648
-rw-r--r--net/core/netpoll.c69
-rw-r--r--net/core/pktgen.c531
-rw-r--r--net/core/request_sock.c28
-rw-r--r--net/core/rtnetlink.c9
-rw-r--r--net/core/skbuff.c248
-rw-r--r--net/core/sock.c172
-rw-r--r--net/core/sysctl_net_core.c9
-rw-r--r--net/core/utils.c2
-rw-r--r--net/core/wireless.c71
-rw-r--r--net/dccp/Kconfig50
-rw-r--r--net/dccp/Makefile12
-rw-r--r--net/dccp/ackvec.c419
-rw-r--r--net/dccp/ackvec.h133
-rw-r--r--net/dccp/ccid.c139
-rw-r--r--net/dccp/ccid.h211
-rw-r--r--net/dccp/ccids/Kconfig29
-rw-r--r--net/dccp/ccids/Makefile5
-rw-r--r--net/dccp/ccids/ccid3.c1270
-rw-r--r--net/dccp/ccids/ccid3.h146
-rw-r--r--net/dccp/ccids/lib/Makefile3
-rw-r--r--net/dccp/ccids/lib/loss_interval.c144
-rw-r--r--net/dccp/ccids/lib/loss_interval.h61
-rw-r--r--net/dccp/ccids/lib/packet_history.c398
-rw-r--r--net/dccp/ccids/lib/packet_history.h200
-rw-r--r--net/dccp/ccids/lib/tfrc.h22
-rw-r--r--net/dccp/ccids/lib/tfrc_equation.c644
-rw-r--r--net/dccp/dccp.h425
-rw-r--r--net/dccp/diag.c71
-rw-r--r--net/dccp/input.c568
-rw-r--r--net/dccp/ipv4.c1348
-rw-r--r--net/dccp/minisocks.c268
-rw-r--r--net/dccp/options.c462
-rw-r--r--net/dccp/output.c527
-rw-r--r--net/dccp/proto.c910
-rw-r--r--net/dccp/timer.c255
-rw-r--r--net/decnet/af_decnet.c68
-rw-r--r--net/decnet/dn_dev.c8
-rw-r--r--net/decnet/dn_neigh.c2
-rw-r--r--net/decnet/dn_nsp_in.c2
-rw-r--r--net/decnet/dn_nsp_out.c86
-rw-r--r--net/decnet/dn_route.c5
-rw-r--r--net/decnet/dn_table.c6
-rw-r--r--net/decnet/netfilter/dn_rtmsg.c11
-rw-r--r--net/econet/af_econet.c10
-rw-r--r--net/ethernet/eth.c21
-rw-r--r--net/ethernet/sysctl_net_ether.c1
-rw-r--r--net/ieee80211/Kconfig68
-rw-r--r--net/ieee80211/Makefile12
-rw-r--r--net/ieee80211/ieee80211_crypt.c279
-rw-r--r--net/ieee80211/ieee80211_crypt_ccmp.c472
-rw-r--r--net/ieee80211/ieee80211_crypt_tkip.c725
-rw-r--r--net/ieee80211/ieee80211_crypt_wep.c258
-rw-r--r--net/ieee80211/ieee80211_geo.c141
-rw-r--r--net/ieee80211/ieee80211_module.c304
-rw-r--r--net/ieee80211/ieee80211_rx.c1511
-rw-r--r--net/ieee80211/ieee80211_tx.c581
-rw-r--r--net/ieee80211/ieee80211_wx.c730
-rw-r--r--net/ipv4/Kconfig17
-rw-r--r--net/ipv4/Makefile8
-rw-r--r--net/ipv4/af_inet.c192
-rw-r--r--net/ipv4/ah4.c18
-rw-r--r--net/ipv4/arp.c27
-rw-r--r--net/ipv4/datagram.c3
-rw-r--r--net/ipv4/devinet.c32
-rw-r--r--net/ipv4/esp4.c53
-rw-r--r--net/ipv4/fib_frontend.c12
-rw-r--r--net/ipv4/fib_hash.c4
-rw-r--r--net/ipv4/fib_lookup.h1
-rw-r--r--net/ipv4/fib_semantics.c20
-rw-r--r--net/ipv4/fib_trie.c2080
-rw-r--r--net/ipv4/icmp.c34
-rw-r--r--net/ipv4/igmp.c11
-rw-r--r--net/ipv4/inet_connection_sock.c641
-rw-r--r--net/ipv4/inet_diag.c868
-rw-r--r--net/ipv4/inet_hashtables.c165
-rw-r--r--net/ipv4/inet_timewait_sock.c385
-rw-r--r--net/ipv4/inetpeer.c19
-rw-r--r--net/ipv4/ip_forward.c6
-rw-r--r--net/ipv4/ip_fragment.c14
-rw-r--r--net/ipv4/ip_gre.c4
-rw-r--r--net/ipv4/ip_input.c141
-rw-r--r--net/ipv4/ip_options.c52
-rw-r--r--net/ipv4/ip_output.c115
-rw-r--r--net/ipv4/ip_sockglue.c11
-rw-r--r--net/ipv4/ipcomp.c9
-rw-r--r--net/ipv4/ipconfig.c15
-rw-r--r--net/ipv4/ipmr.c8
-rw-r--r--net/ipv4/ipvs/ip_vs_app.c3
-rw-r--r--net/ipv4/ipvs/ip_vs_conn.c45
-rw-r--r--net/ipv4/ipvs/ip_vs_core.c25
-rw-r--r--net/ipv4/ipvs/ip_vs_ctl.c4
-rw-r--r--net/ipv4/ipvs/ip_vs_lblc.c4
-rw-r--r--net/ipv4/ipvs/ip_vs_lblcr.c4
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_tcp.c8
-rw-r--r--net/ipv4/ipvs/ip_vs_sync.c20
-rw-r--r--net/ipv4/ipvs/ip_vs_xmit.c2
-rw-r--r--net/ipv4/multipath_drr.c2
-rw-r--r--net/ipv4/netfilter.c139
-rw-r--r--net/ipv4/netfilter/Kconfig133
-rw-r--r--net/ipv4/netfilter/Makefile20
-rw-r--r--net/ipv4/netfilter/arp_tables.c215
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c21
-rw-r--r--net/ipv4/netfilter/ip_conntrack_core.c582
-rw-r--r--net/ipv4/netfilter/ip_conntrack_ftp.c28
-rw-r--r--net/ipv4/netfilter/ip_conntrack_helper_pptp.c806
-rw-r--r--net/ipv4/netfilter/ip_conntrack_irc.c14
-rw-r--r--net/ipv4/netfilter/ip_conntrack_netbios_ns.c142
-rw-r--r--net/ipv4/netfilter/ip_conntrack_netlink.c1622
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_gre.c328
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_icmp.c72
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_sctp.c10
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_tcp.c79
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_udp.c14
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c51
-rw-r--r--net/ipv4/netfilter/ip_conntrack_tftp.c7
-rw-r--r--net/ipv4/netfilter/ip_nat_core.c141
-rw-r--r--net/ipv4/netfilter/ip_nat_helper.c12
-rw-r--r--net/ipv4/netfilter/ip_nat_helper_pptp.c401
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_gre.c214
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_icmp.c23
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_tcp.c24
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_udp.c23
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_unknown.c13
-rw-r--r--net/ipv4/netfilter/ip_nat_rule.c21
-rw-r--r--net/ipv4/netfilter/ip_nat_snmp_basic.c2
-rw-r--r--net/ipv4/netfilter/ip_nat_standalone.c37
-rw-r--r--net/ipv4/netfilter/ip_queue.c58
-rw-r--r--net/ipv4/netfilter/ip_tables.c22
-rw-r--r--net/ipv4/netfilter/ipt_CLASSIFY.c4
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c225
-rw-r--r--net/ipv4/netfilter/ipt_CONNMARK.c15
-rw-r--r--net/ipv4/netfilter/ipt_DSCP.c3
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c23
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c86
-rw-r--r--net/ipv4/netfilter/ipt_MARK.c22
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c11
-rw-r--r--net/ipv4/netfilter/ipt_NETMAP.c8
-rw-r--r--net/ipv4/netfilter/ipt_NFQUEUE.c70
-rw-r--r--net/ipv4/netfilter/ipt_REDIRECT.c16
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c6
-rw-r--r--net/ipv4/netfilter/ipt_TCPMSS.c10
-rw-r--r--net/ipv4/netfilter/ipt_TOS.c3
-rw-r--r--net/ipv4/netfilter/ipt_TTL.c119
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c53
-rw-r--r--net/ipv4/netfilter/ipt_addrtype.c2
-rw-r--r--net/ipv4/netfilter/ipt_connbytes.c162
-rw-r--r--net/ipv4/netfilter/ipt_connmark.c7
-rw-r--r--net/ipv4/netfilter/ipt_dccp.c176
-rw-r--r--net/ipv4/netfilter/ipt_hashlimit.c2
-rw-r--r--net/ipv4/netfilter/ipt_mark.c7
-rw-r--r--net/ipv4/netfilter/ipt_owner.c133
-rw-r--r--net/ipv4/netfilter/ipt_string.c91
-rw-r--r--net/ipv4/proc.c9
-rw-r--r--net/ipv4/protocol.c1
-rw-r--r--net/ipv4/raw.c9
-rw-r--r--net/ipv4/route.c43
-rw-r--r--net/ipv4/syncookies.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c47
-rw-r--r--net/ipv4/tcp.c421
-rw-r--r--net/ipv4/tcp_bic.c50
-rw-r--r--net/ipv4/tcp_cong.c44
-rw-r--r--net/ipv4/tcp_diag.c784
-rw-r--r--net/ipv4/tcp_highspeed.c17
-rw-r--r--net/ipv4/tcp_htcp.c53
-rw-r--r--net/ipv4/tcp_hybla.c31
-rw-r--r--net/ipv4/tcp_input.c556
-rw-r--r--net/ipv4/tcp_ipv4.c944
-rw-r--r--net/ipv4/tcp_minisocks.c605
-rw-r--r--net/ipv4/tcp_output.c338
-rw-r--r--net/ipv4/tcp_scalable.c6
-rw-r--r--net/ipv4/tcp_timer.c253
-rw-r--r--net/ipv4/tcp_vegas.c50
-rw-r--r--net/ipv4/tcp_westwood.c64
-rw-r--r--net/ipv4/udp.c41
-rw-r--r--net/ipv4/xfrm4_state.c2
-rw-r--r--net/ipv6/Makefile4
-rw-r--r--net/ipv6/addrconf.c57
-rw-r--r--net/ipv6/af_inet6.c62
-rw-r--r--net/ipv6/ah6.c31
-rw-r--r--net/ipv6/datagram.c144
-rw-r--r--net/ipv6/esp6.c45
-rw-r--r--net/ipv6/exthdrs.c127
-rw-r--r--net/ipv6/icmp.c54
-rw-r--r--net/ipv6/inet6_hashtables.c81
-rw-r--r--net/ipv6/ip6_fib.c4
-rw-r--r--net/ipv6/ip6_flowlabel.c18
-rw-r--r--net/ipv6/ip6_input.c15
-rw-r--r--net/ipv6/ip6_output.c151
-rw-r--r--net/ipv6/ip6_tunnel.c7
-rw-r--r--net/ipv6/ipcomp6.c5
-rw-r--r--net/ipv6/ipv6_sockglue.c214
-rw-r--r--net/ipv6/ipv6_syms.c3
-rw-r--r--net/ipv6/mcast.c29
-rw-r--r--net/ipv6/ndisc.c22
-rw-r--r--net/ipv6/netfilter.c104
-rw-r--r--net/ipv6/netfilter/Kconfig48
-rw-r--r--net/ipv6/netfilter/Makefile3
-rw-r--r--net/ipv6/netfilter/ip6_queue.c55
-rw-r--r--net/ipv6/netfilter/ip6_tables.c372
-rw-r--r--net/ipv6/netfilter/ip6t_HL.c118
-rw-r--r--net/ipv6/netfilter/ip6t_LOG.c93
-rw-r--r--net/ipv6/netfilter/ip6t_MARK.c13
-rw-r--r--net/ipv6/netfilter/ip6t_NFQUEUE.c70
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c281
-rw-r--r--net/ipv6/netfilter/ip6t_ah.c81
-rw-r--r--net/ipv6/netfilter/ip6t_dst.c88
-rw-r--r--net/ipv6/netfilter/ip6t_esp.c73
-rw-r--r--net/ipv6/netfilter/ip6t_frag.c90
-rw-r--r--net/ipv6/netfilter/ip6t_hbh.c88
-rw-r--r--net/ipv6/netfilter/ip6t_owner.c91
-rw-r--r--net/ipv6/netfilter/ip6t_rt.c97
-rw-r--r--net/ipv6/proc.c4
-rw-r--r--net/ipv6/raw.c45
-rw-r--r--net/ipv6/reassembly.c13
-rw-r--r--net/ipv6/route.c16
-rw-r--r--net/ipv6/sit.c2
-rw-r--r--net/ipv6/sysctl_net_ipv6.c3
-rw-r--r--net/ipv6/tcp_ipv6.c488
-rw-r--r--net/ipv6/udp.c68
-rw-r--r--net/ipv6/xfrm6_tunnel.c2
-rw-r--r--net/ipx/af_ipx.c10
-rw-r--r--net/ipx/ipx_proc.c2
-rw-r--r--net/irda/af_irda.c2
-rw-r--r--net/irda/ircomm/ircomm_tty.c9
-rw-r--r--net/irda/irlan/irlan_eth.c2
-rw-r--r--net/irda/irlan/irlan_filter.c1
-rw-r--r--net/irda/irlap_frame.c8
-rw-r--r--net/irda/irlmp.c3
-rw-r--r--net/irda/irmod.c2
-rw-r--r--net/irda/irnet/irnet.h3
-rw-r--r--net/irda/irnet/irnet_ppp.c2
-rw-r--r--net/irda/irqueue.c1
-rw-r--r--net/irda/irttp.c16
-rw-r--r--net/irda/qos.c1
-rw-r--r--net/key/af_key.c18
-rw-r--r--net/lapb/lapb_subr.c2
-rw-r--r--net/llc/Makefile1
-rw-r--r--net/llc/af_llc.c503
-rw-r--r--net/llc/llc_c_ac.c271
-rw-r--r--net/llc/llc_c_ev.c157
-rw-r--r--net/llc/llc_conn.c219
-rw-r--r--net/llc/llc_core.c37
-rw-r--r--net/llc/llc_if.c13
-rw-r--r--net/llc/llc_input.c23
-rw-r--r--net/llc/llc_output.c2
-rw-r--r--net/llc/llc_proc.c2
-rw-r--r--net/llc/llc_s_ac.c16
-rw-r--r--net/llc/llc_sap.c22
-rw-r--r--net/llc/llc_station.c25
-rw-r--r--net/llc/sysctl_net_llc.c131
-rw-r--r--net/netfilter/Kconfig24
-rw-r--r--net/netfilter/Makefile7
-rw-r--r--net/netfilter/core.c216
-rw-r--r--net/netfilter/nf_internals.h39
-rw-r--r--net/netfilter/nf_log.c178
-rw-r--r--net/netfilter/nf_queue.c343
-rw-r--r--net/netfilter/nf_sockopt.c132
-rw-r--r--net/netfilter/nfnetlink.c376
-rw-r--r--net/netfilter/nfnetlink_log.c1055
-rw-r--r--net/netfilter/nfnetlink_queue.c1127
-rw-r--r--net/netlink/af_netlink.c346
-rw-r--r--net/netrom/af_netrom.c73
-rw-r--r--net/netrom/nr_dev.c61
-rw-r--r--net/netrom/nr_in.c18
-rw-r--r--net/netrom/nr_loopback.c2
-rw-r--r--net/netrom/nr_route.c8
-rw-r--r--net/netrom/nr_subr.c11
-rw-r--r--net/netrom/nr_timer.c2
-rw-r--r--net/netrom/sysctl_net_netrom.c12
-rw-r--r--net/packet/af_packet.c79
-rw-r--r--net/rose/af_rose.c69
-rw-r--r--net/rose/rose_dev.c2
-rw-r--r--net/rose/rose_in.c3
-rw-r--r--net/rose/rose_route.c24
-rw-r--r--net/rose/rose_subr.c13
-rw-r--r--net/rose/rose_timer.c3
-rw-r--r--net/rxrpc/call.c2
-rw-r--r--net/rxrpc/connection.c2
-rw-r--r--net/rxrpc/transport.c2
-rw-r--r--net/sched/Kconfig399
-rw-r--r--net/sched/act_api.c15
-rw-r--r--net/sched/cls_api.c2
-rw-r--r--net/sched/em_meta.c6
-rw-r--r--net/sched/gact.c2
-rw-r--r--net/sched/ipt.c2
-rw-r--r--net/sched/mirred.c2
-rw-r--r--net/sched/pedit.c2
-rw-r--r--net/sched/police.c3
-rw-r--r--net/sched/sch_api.c6
-rw-r--r--net/sched/sch_generic.c17
-rw-r--r--net/sched/simple.c2
-rw-r--r--net/sctp/associola.c10
-rw-r--r--net/sctp/bind_addr.c12
-rw-r--r--net/sctp/chunk.c2
-rw-r--r--net/sctp/endpointola.c8
-rw-r--r--net/sctp/input.c4
-rw-r--r--net/sctp/ipv6.c7
-rw-r--r--net/sctp/proc.c5
-rw-r--r--net/sctp/protocol.c18
-rw-r--r--net/sctp/sm_make_chunk.c25
-rw-r--r--net/sctp/sm_sideeffect.c12
-rw-r--r--net/sctp/sm_statefuns.c22
-rw-r--r--net/sctp/socket.c349
-rw-r--r--net/sctp/ssnmap.c2
-rw-r--r--net/sctp/sysctl.c1
-rw-r--r--net/sctp/transport.c4
-rw-r--r--net/sctp/ulpevent.c24
-rw-r--r--net/sctp/ulpqueue.c71
-rw-r--r--net/socket.c64
-rw-r--r--net/sunrpc/Makefile2
-rw-r--r--net/sunrpc/auth.c16
-rw-r--r--net/sunrpc/auth_gss/Makefile2
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c187
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c282
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c50
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seal.c44
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_unseal.c39
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c363
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c29
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_mech.c33
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_seal.c4
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_unseal.c2
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c17
-rw-r--r--net/sunrpc/auth_null.c2
-rw-r--r--net/sunrpc/auth_unix.c2
-rw-r--r--net/sunrpc/cache.c8
-rw-r--r--net/sunrpc/clnt.c147
-rw-r--r--net/sunrpc/pmap_clnt.c12
-rw-r--r--net/sunrpc/rpc_pipe.c35
-rw-r--r--net/sunrpc/sched.c10
-rw-r--r--net/sunrpc/socklib.c175
-rw-r--r--net/sunrpc/stats.c16
-rw-r--r--net/sunrpc/sunrpc_syms.c9
-rw-r--r--net/sunrpc/svcauth.c1
-rw-r--r--net/sunrpc/svcauth_unix.c1
-rw-r--r--net/sunrpc/svcsock.c102
-rw-r--r--net/sunrpc/sysctl.c25
-rw-r--r--net/sunrpc/xdr.c178
-rw-r--r--net/sunrpc/xprt.c1613
-rw-r--r--net/sunrpc/xprtsock.c1261
-rw-r--r--net/sysctl_net.c10
-rw-r--r--net/unix/af_unix.c10
-rw-r--r--net/unix/garbage.c14
-rw-r--r--net/unix/sysctl_net_unix.c2
-rw-r--r--net/wanrouter/af_wanpipe.c2
-rw-r--r--net/x25/af_x25.c2
-rw-r--r--net/x25/x25_dev.c2
-rw-r--r--net/x25/x25_in.c2
-rw-r--r--net/x25/x25_subr.c4
-rw-r--r--net/x25/x25_timer.c2
-rw-r--r--net/xfrm/xfrm_input.c2
-rw-r--r--net/xfrm/xfrm_policy.c59
-rw-r--r--net/xfrm/xfrm_state.c6
-rw-r--r--net/xfrm/xfrm_user.c31
425 files changed, 36763 insertions, 11893 deletions
diff --git a/net/802/fc.c b/net/802/fc.c
index 640d34e026c..282c4ab1abe 100644
--- a/net/802/fc.c
+++ b/net/802/fc.c
@@ -87,7 +87,7 @@ static int fc_rebuild_header(struct sk_buff *skb)
struct fch_hdr *fch=(struct fch_hdr *)skb->data;
struct fcllc *fcllc=(struct fcllc *)(skb->data+sizeof(struct fch_hdr));
if(fcllc->ethertype != htons(ETH_P_IP)) {
- printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n",(unsigned int)htons(fcllc->ethertype));
+ printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(fcllc->ethertype));
return 0;
}
#ifdef CONFIG_INET
diff --git a/net/802/fddi.c b/net/802/fddi.c
index 5ce24c4bb84..ac242a4bc34 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -108,8 +108,8 @@ static int fddi_rebuild_header(struct sk_buff *skb)
else
#endif
{
- printk("%s: Don't know how to resolve type %02X addresses.\n",
- skb->dev->name, htons(fddi->hdr.llc_snap.ethertype));
+ printk("%s: Don't know how to resolve type %04X addresses.\n",
+ skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype));
return(0);
}
}
diff --git a/net/802/hippi.c b/net/802/hippi.c
index 051e8af56a7..6d7fed3dd99 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -51,6 +51,7 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev,
unsigned len)
{
struct hippi_hdr *hip = (struct hippi_hdr *)skb_push(skb, HIPPI_HLEN);
+ struct hippi_cb *hcb = (struct hippi_cb *) skb->cb;
if (!len){
len = skb->len - HIPPI_HLEN;
@@ -84,9 +85,10 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev,
if (daddr)
{
memcpy(hip->le.dest_switch_addr, daddr + 3, 3);
- memcpy(&skb->private.ifield, daddr + 2, 4);
+ memcpy(&hcb->ifield, daddr + 2, 4);
return HIPPI_HLEN;
}
+ hcb->ifield = 0;
return -((int)HIPPI_HLEN);
}
@@ -122,7 +124,7 @@ static int hippi_rebuild_header(struct sk_buff *skb)
* Determine the packet's protocol ID.
*/
-unsigned short hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
{
struct hippi_hdr *hip;
diff --git a/net/802/p8022.c b/net/802/p8022.c
index 5ae63416df6..2530f35241c 100644
--- a/net/802/p8022.c
+++ b/net/802/p8022.c
@@ -35,7 +35,8 @@ static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb,
struct datalink_proto *register_8022_client(unsigned char type,
int (*func)(struct sk_buff *skb,
struct net_device *dev,
- struct packet_type *pt))
+ struct packet_type *pt,
+ struct net_device *orig_dev))
{
struct datalink_proto *proto;
@@ -55,7 +56,7 @@ struct datalink_proto *register_8022_client(unsigned char type,
void unregister_8022_client(struct datalink_proto *proto)
{
- llc_sap_close(proto->sap);
+ llc_sap_put(proto->sap);
kfree(proto);
}
diff --git a/net/802/p8023.c b/net/802/p8023.c
index a0b61b40225..6368d3dce44 100644
--- a/net/802/p8023.c
+++ b/net/802/p8023.c
@@ -20,6 +20,7 @@
#include <linux/skbuff.h>
#include <net/datalink.h>
+#include <net/p8022.h>
/*
* Place an 802.3 header on a packet. The driver will do the mac
diff --git a/net/802/psnap.c b/net/802/psnap.c
index 1053821ddf9..4d638944d93 100644
--- a/net/802/psnap.c
+++ b/net/802/psnap.c
@@ -47,7 +47,7 @@ static struct datalink_proto *find_snap_client(unsigned char *desc)
* A SNAP packet has arrived
*/
static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+ struct packet_type *pt, struct net_device *orig_dev)
{
int rc = 1;
struct datalink_proto *proto;
@@ -61,7 +61,7 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
/* Pass the frame on. */
skb->h.raw += 5;
skb_pull(skb, 5);
- rc = proto->rcvfunc(skb, dev, &snap_packet_type);
+ rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev);
} else {
skb->sk = NULL;
kfree_skb(skb);
@@ -106,7 +106,7 @@ module_init(snap_init);
static void __exit snap_exit(void)
{
- llc_sap_close(snap_sap);
+ llc_sap_put(snap_sap);
}
module_exit(snap_exit);
@@ -118,7 +118,8 @@ module_exit(snap_exit);
struct datalink_proto *register_snap_client(unsigned char *desc,
int (*rcvfunc)(struct sk_buff *,
struct net_device *,
- struct packet_type *))
+ struct packet_type *,
+ struct net_device *))
{
struct datalink_proto *proto = NULL;
diff --git a/net/802/sysctl_net_802.c b/net/802/sysctl_net_802.c
index 36079630c49..700129556c1 100644
--- a/net/802/sysctl_net_802.c
+++ b/net/802/sysctl_net_802.c
@@ -10,9 +10,10 @@
* 2 of the License, or (at your option) any later version.
*/
+#include <linux/config.h>
#include <linux/mm.h>
+#include <linux/if_tr.h>
#include <linux/sysctl.h>
-#include <linux/config.h>
#ifdef CONFIG_TR
extern int sysctl_tr_rif_timeout;
diff --git a/net/802/tr.c b/net/802/tr.c
index a755e880f4b..afd8385c0c9 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -238,7 +238,7 @@ unsigned short tr_type_trans(struct sk_buff *skb, struct net_device *dev)
return trllc->ethertype;
}
- return ntohs(ETH_P_802_2);
+ return ntohs(ETH_P_TR_802_2);
}
/*
@@ -251,10 +251,11 @@ void tr_source_route(struct sk_buff *skb,struct trh_hdr *trh,struct net_device *
unsigned int hash;
struct rif_cache *entry;
unsigned char *olddata;
+ unsigned long flags;
static const unsigned char mcast_func_addr[]
= {0xC0,0x00,0x00,0x04,0x00,0x00};
- spin_lock_bh(&rif_lock);
+ spin_lock_irqsave(&rif_lock, flags);
/*
* Broadcasts are single route as stated in RFC 1042
@@ -323,7 +324,7 @@ printk("source routing for %02X:%02X:%02X:%02X:%02X:%02X\n",trh->daddr[0],
else
slack = 18 - ((ntohs(trh->rcf) & TR_RCF_LEN_MASK)>>8);
olddata = skb->data;
- spin_unlock_bh(&rif_lock);
+ spin_unlock_irqrestore(&rif_lock, flags);
skb_pull(skb, slack);
memmove(skb->data, olddata, sizeof(struct trh_hdr) - slack);
@@ -337,10 +338,12 @@ printk("source routing for %02X:%02X:%02X:%02X:%02X:%02X\n",trh->daddr[0],
static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev)
{
unsigned int hash, rii_p = 0;
+ unsigned long flags;
struct rif_cache *entry;
+ unsigned char saddr0;
-
- spin_lock_bh(&rif_lock);
+ spin_lock_irqsave(&rif_lock, flags);
+ saddr0 = trh->saddr[0];
/*
* Firstly see if the entry exists
@@ -378,7 +381,7 @@ printk("adding rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
if(!entry)
{
printk(KERN_DEBUG "tr.c: Couldn't malloc rif cache entry !\n");
- spin_unlock_bh(&rif_lock);
+ spin_unlock_irqrestore(&rif_lock, flags);
return;
}
@@ -393,7 +396,6 @@ printk("adding rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
entry->rcf = trh->rcf & htons((unsigned short)~TR_RCF_BROADCAST_MASK);
memcpy(&(entry->rseg[0]),&(trh->rseg[0]),8*sizeof(unsigned short));
entry->local_ring = 0;
- trh->saddr[0]|=TR_RII; /* put the routing indicator back for tcpdump */
}
else
{
@@ -420,7 +422,8 @@ printk("updating rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
}
entry->last_used=jiffies;
}
- spin_unlock_bh(&rif_lock);
+ trh->saddr[0]=saddr0; /* put the routing indicator back for tcpdump */
+ spin_unlock_irqrestore(&rif_lock, flags);
}
/*
@@ -430,9 +433,9 @@ printk("updating rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
static void rif_check_expire(unsigned long dummy)
{
int i;
- unsigned long next_interval = jiffies + sysctl_tr_rif_timeout/2;
+ unsigned long flags, next_interval = jiffies + sysctl_tr_rif_timeout/2;
- spin_lock_bh(&rif_lock);
+ spin_lock_irqsave(&rif_lock, flags);
for(i =0; i < RIF_TABLE_SIZE; i++) {
struct rif_cache *entry, **pentry;
@@ -454,7 +457,7 @@ static void rif_check_expire(unsigned long dummy)
}
}
- spin_unlock_bh(&rif_lock);
+ spin_unlock_irqrestore(&rif_lock, flags);
mod_timer(&rif_timer, next_interval);
@@ -485,7 +488,7 @@ static struct rif_cache *rif_get_idx(loff_t pos)
static void *rif_seq_start(struct seq_file *seq, loff_t *pos)
{
- spin_lock_bh(&rif_lock);
+ spin_lock_irq(&rif_lock);
return *pos ? rif_get_idx(*pos - 1) : SEQ_START_TOKEN;
}
@@ -516,7 +519,7 @@ static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void rif_seq_stop(struct seq_file *seq, void *v)
{
- spin_unlock_bh(&rif_lock);
+ spin_unlock_irq(&rif_lock);
}
static int rif_seq_show(struct seq_file *seq, void *v)
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 508b1fa1454..9ae3a14dd01 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -51,7 +51,7 @@ struct net_device *__find_vlan_dev(struct net_device* real_dev,
/* found in vlan_dev.c */
int vlan_dev_rebuild_header(struct sk_buff *skb);
int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type* ptype);
+ struct packet_type *ptype, struct net_device *orig_dev);
int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type, void *daddr, void *saddr,
unsigned len);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 49c48741351..b7486488967 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -113,14 +113,14 @@ static inline struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb)
*
*/
int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type* ptype)
+ struct packet_type* ptype, struct net_device *orig_dev)
{
unsigned char *rawp = NULL;
struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data);
unsigned short vid;
struct net_device_stats *stats;
unsigned short vlan_TCI;
- unsigned short proto;
+ __be16 proto;
/* vlan_TCI = ntohs(get_unaligned(&vhdr->h_vlan_TCI)); */
vlan_TCI = ntohs(vhdr->h_vlan_TCI);
diff --git a/net/Kconfig b/net/Kconfig
index 40a31ba86d2..60f6f321bd7 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -140,6 +140,7 @@ config BRIDGE_NETFILTER
If unsure, say N.
+source "net/netfilter/Kconfig"
source "net/ipv4/netfilter/Kconfig"
source "net/ipv6/netfilter/Kconfig"
source "net/decnet/netfilter/Kconfig"
@@ -147,6 +148,7 @@ source "net/bridge/netfilter/Kconfig"
endif
+source "net/dccp/Kconfig"
source "net/sctp/Kconfig"
source "net/atm/Kconfig"
source "net/bridge/Kconfig"
@@ -212,6 +214,7 @@ endmenu
source "net/ax25/Kconfig"
source "net/irda/Kconfig"
source "net/bluetooth/Kconfig"
+source "net/ieee80211/Kconfig"
endif # if NET
endmenu # Networking
diff --git a/net/Makefile b/net/Makefile
index 8e2bdc025ab..4aa2f46d2a5 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_NET) += $(tmp-y)
obj-$(CONFIG_LLC) += llc/
obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/
obj-$(CONFIG_INET) += ipv4/
+obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_XFRM) += xfrm/
obj-$(CONFIG_UNIX) += unix/
ifneq ($(CONFIG_IPV6),)
@@ -41,7 +42,9 @@ obj-$(CONFIG_ATM) += atm/
obj-$(CONFIG_DECNET) += decnet/
obj-$(CONFIG_ECONET) += econet/
obj-$(CONFIG_VLAN_8021Q) += 8021q/
+obj-$(CONFIG_IP_DCCP) += dccp/
obj-$(CONFIG_IP_SCTP) += sctp/
+obj-$(CONFIG_IEEE80211) += ieee80211/
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_SYSCTL) += sysctl_net.o
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index c34614ea5fc..7076097debc 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -698,7 +698,7 @@ static void __aarp_resolved(struct aarp_entry **list, struct aarp_entry *a,
* frame. We currently only support Ethernet.
*/
static int aarp_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+ struct packet_type *pt, struct net_device *orig_dev)
{
struct elapaarp *ea = aarp_hdr(skb);
int hash, ret = 0;
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 192b529f86a..7982656b9c8 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -53,12 +53,12 @@
#include <linux/config.h>
#include <linux/module.h>
-#include <linux/tcp.h>
#include <linux/if_arp.h>
#include <linux/termios.h> /* For TIOCOUTQ/INQ */
#include <net/datalink.h>
#include <net/psnap.h>
#include <net/sock.h>
+#include <net/tcp_states.h>
#include <net/route.h>
#include <linux/atalk.h>
@@ -100,8 +100,7 @@ static struct sock *atalk_search_socket(struct sockaddr_at *to,
continue;
if (to->sat_addr.s_net == ATADDR_ANYNET &&
- to->sat_addr.s_node == ATADDR_BCAST &&
- at->src_net == atif->address.s_net)
+ to->sat_addr.s_node == ATADDR_BCAST)
goto found;
if (to->sat_addr.s_net == at->src_net &&
@@ -1390,7 +1389,7 @@ free_it:
* [ie ARPHRD_ETHERTALK]
*/
static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+ struct packet_type *pt, struct net_device *orig_dev)
{
struct ddpehdr *ddp;
struct sock *sock;
@@ -1443,8 +1442,10 @@ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
else
atif = atalk_find_interface(ddp->deh_dnet, ddp->deh_dnode);
- /* Not ours, so we route the packet via the correct AppleTalk iface */
if (!atif) {
+ /* Not ours, so we route the packet via the correct
+ * AppleTalk iface
+ */
atalk_route_packet(skb, dev, ddp, &ddphv, origlen);
goto out;
}
@@ -1482,7 +1483,7 @@ freeit:
* header and append a long one.
*/
static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+ struct packet_type *pt, struct net_device *orig_dev)
{
/* Expand any short form frames */
if (skb->mac.raw[2] == 1) {
@@ -1528,7 +1529,7 @@ static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev,
}
skb->h.raw = skb->data;
- return atalk_rcv(skb, dev, pt);
+ return atalk_rcv(skb, dev, pt, orig_dev);
freeit:
kfree_skb(skb);
return 0;
@@ -1592,9 +1593,6 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
if (usat->sat_addr.s_net || usat->sat_addr.s_node == ATADDR_ANYNODE) {
rt = atrtr_find(&usat->sat_addr);
- if (!rt)
- return -ENETUNREACH;
-
dev = rt->dev;
} else {
struct atalk_addr at_hint;
@@ -1603,11 +1601,12 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
at_hint.s_net = at->src_net;
rt = atrtr_find(&at_hint);
- if (!rt)
- return -ENETUNREACH;
-
dev = rt->dev;
}
+ if (!rt)
+ return -ENETUNREACH;
+
+ dev = rt->dev;
SOCK_DEBUG(sk, "SK %p: Size needed %d, device %s\n",
sk, size, dev->name);
@@ -1677,6 +1676,20 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
SOCK_DEBUG(sk, "SK %p: Loop back.\n", sk);
/* loop back */
skb_orphan(skb);
+ if (ddp->deh_dnode == ATADDR_BCAST) {
+ struct atalk_addr at_lo;
+
+ at_lo.s_node = 0;
+ at_lo.s_net = 0;
+
+ rt = atrtr_find(&at_lo);
+ if (!rt) {
+ kfree_skb(skb);
+ return -ENETUNREACH;
+ }
+ dev = rt->dev;
+ skb->dev = dev;
+ }
ddp_dl->request(ddp_dl, skb, dev->dev_addr);
} else {
SOCK_DEBUG(sk, "SK %p: send out.\n", sk);
diff --git a/net/atm/addr.c b/net/atm/addr.c
index 1c8867f7f54..3060fd0ba4b 100644
--- a/net/atm/addr.c
+++ b/net/atm/addr.c
@@ -44,29 +44,43 @@ static void notify_sigd(struct atm_dev *dev)
sigd_enq(NULL, as_itf_notify, NULL, &pvc, NULL);
}
-void atm_reset_addr(struct atm_dev *dev)
+void atm_reset_addr(struct atm_dev *dev, enum atm_addr_type_t atype)
{
unsigned long flags;
struct atm_dev_addr *this, *p;
+ struct list_head *head;
spin_lock_irqsave(&dev->lock, flags);
- list_for_each_entry_safe(this, p, &dev->local, entry)
- kfree(this);
+ if (atype == ATM_ADDR_LECS)
+ head = &dev->lecs;
+ else
+ head = &dev->local;
+ list_for_each_entry_safe(this, p, head, entry) {
+ list_del(&this->entry);
+ kfree(this);
+ }
spin_unlock_irqrestore(&dev->lock, flags);
- notify_sigd(dev);
+ if (head == &dev->local)
+ notify_sigd(dev);
}
-int atm_add_addr(struct atm_dev *dev, struct sockaddr_atmsvc *addr)
+int atm_add_addr(struct atm_dev *dev, struct sockaddr_atmsvc *addr,
+ enum atm_addr_type_t atype)
{
unsigned long flags;
struct atm_dev_addr *this;
+ struct list_head *head;
int error;
error = check_addr(addr);
if (error)
return error;
spin_lock_irqsave(&dev->lock, flags);
- list_for_each_entry(this, &dev->local, entry) {
+ if (atype == ATM_ADDR_LECS)
+ head = &dev->lecs;
+ else
+ head = &dev->local;
+ list_for_each_entry(this, head, entry) {
if (identical(&this->addr, addr)) {
spin_unlock_irqrestore(&dev->lock, flags);
return -EEXIST;
@@ -78,28 +92,36 @@ int atm_add_addr(struct atm_dev *dev, struct sockaddr_atmsvc *addr)
return -ENOMEM;
}
this->addr = *addr;
- list_add(&this->entry, &dev->local);
+ list_add(&this->entry, head);
spin_unlock_irqrestore(&dev->lock, flags);
- notify_sigd(dev);
+ if (head == &dev->local)
+ notify_sigd(dev);
return 0;
}
-int atm_del_addr(struct atm_dev *dev, struct sockaddr_atmsvc *addr)
+int atm_del_addr(struct atm_dev *dev, struct sockaddr_atmsvc *addr,
+ enum atm_addr_type_t atype)
{
unsigned long flags;
struct atm_dev_addr *this;
+ struct list_head *head;
int error;
error = check_addr(addr);
if (error)
return error;
spin_lock_irqsave(&dev->lock, flags);
- list_for_each_entry(this, &dev->local, entry) {
+ if (atype == ATM_ADDR_LECS)
+ head = &dev->lecs;
+ else
+ head = &dev->local;
+ list_for_each_entry(this, head, entry) {
if (identical(&this->addr, addr)) {
list_del(&this->entry);
spin_unlock_irqrestore(&dev->lock, flags);
kfree(this);
- notify_sigd(dev);
+ if (head == &dev->local)
+ notify_sigd(dev);
return 0;
}
}
@@ -108,22 +130,27 @@ int atm_del_addr(struct atm_dev *dev, struct sockaddr_atmsvc *addr)
}
int atm_get_addr(struct atm_dev *dev, struct sockaddr_atmsvc __user * buf,
- size_t size)
+ size_t size, enum atm_addr_type_t atype)
{
unsigned long flags;
struct atm_dev_addr *this;
+ struct list_head *head;
int total = 0, error;
struct sockaddr_atmsvc *tmp_buf, *tmp_bufp;
spin_lock_irqsave(&dev->lock, flags);
- list_for_each_entry(this, &dev->local, entry)
+ if (atype == ATM_ADDR_LECS)
+ head = &dev->lecs;
+ else
+ head = &dev->local;
+ list_for_each_entry(this, head, entry)
total += sizeof(struct sockaddr_atmsvc);
tmp_buf = tmp_bufp = kmalloc(total, GFP_ATOMIC);
if (!tmp_buf) {
spin_unlock_irqrestore(&dev->lock, flags);
return -ENOMEM;
}
- list_for_each_entry(this, &dev->local, entry)
+ list_for_each_entry(this, head, entry)
memcpy(tmp_bufp++, &this->addr, sizeof(struct sockaddr_atmsvc));
spin_unlock_irqrestore(&dev->lock, flags);
error = total > size ? -E2BIG : total;
diff --git a/net/atm/addr.h b/net/atm/addr.h
index 3099d21feea..f39433ad45d 100644
--- a/net/atm/addr.h
+++ b/net/atm/addr.h
@@ -9,10 +9,12 @@
#include <linux/atm.h>
#include <linux/atmdev.h>
-
-void atm_reset_addr(struct atm_dev *dev);
-int atm_add_addr(struct atm_dev *dev,struct sockaddr_atmsvc *addr);
-int atm_del_addr(struct atm_dev *dev,struct sockaddr_atmsvc *addr);
-int atm_get_addr(struct atm_dev *dev,struct sockaddr_atmsvc __user *buf,size_t size);
+void atm_reset_addr(struct atm_dev *dev, enum atm_addr_type_t type);
+int atm_add_addr(struct atm_dev *dev, struct sockaddr_atmsvc *addr,
+ enum atm_addr_type_t type);
+int atm_del_addr(struct atm_dev *dev, struct sockaddr_atmsvc *addr,
+ enum atm_addr_type_t type);
+int atm_get_addr(struct atm_dev *dev, struct sockaddr_atmsvc __user *buf,
+ size_t size, enum atm_addr_type_t type);
#endif
diff --git a/net/atm/atm_misc.c b/net/atm/atm_misc.c
index b2113c3454a..223c7ad5bd0 100644
--- a/net/atm/atm_misc.c
+++ b/net/atm/atm_misc.c
@@ -25,7 +25,7 @@ int atm_charge(struct atm_vcc *vcc,int truesize)
struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc,int pdu_size,
- int gfp_flags)
+ gfp_t gfp_flags)
{
struct sock *sk = sk_atm(vcc);
int guess = atm_guess_pdu2truesize(pdu_size);
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index 289956c4dd3..72f3f7b8de8 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -220,7 +220,7 @@ static int br2684_start_xmit(struct sk_buff *skb, struct net_device *dev)
/* netif_stop_queue(dev); */
dev_kfree_skb(skb);
read_unlock(&devs_lock);
- return -EUNATCH;
+ return 0;
}
if (!br2684_xmit_vcc(skb, brdev, brvcc)) {
/*
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 28dab55a438..4f54c9a5e84 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -310,7 +310,7 @@ static int clip_constructor(struct neighbour *neigh)
if (neigh->type != RTN_UNICAST) return -EINVAL;
rcu_read_lock();
- in_dev = rcu_dereference(__in_dev_get(dev));
+ in_dev = __in_dev_get_rcu(dev);
if (!in_dev) {
rcu_read_unlock();
return -EINVAL;
diff --git a/net/atm/common.c b/net/atm/common.c
index e93e838069e..63feea49fb1 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -46,7 +46,7 @@ static void __vcc_insert_socket(struct sock *sk)
struct atm_vcc *vcc = atm_sk(sk);
struct hlist_head *head = &vcc_hash[vcc->vci &
(VCC_HTABLE_SIZE - 1)];
- sk->sk_hashent = vcc->vci & (VCC_HTABLE_SIZE - 1);
+ sk->sk_hash = vcc->vci & (VCC_HTABLE_SIZE - 1);
sk_add_node(sk, head);
}
@@ -178,8 +178,6 @@ static void vcc_destroy_socket(struct sock *sk)
if (vcc->push)
vcc->push(vcc, NULL); /* atmarpd has no push */
- vcc_remove_socket(sk); /* no more receive */
-
while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
atm_return(vcc,skb->truesize);
kfree_skb(skb);
@@ -188,6 +186,8 @@ static void vcc_destroy_socket(struct sock *sk)
module_put(vcc->dev->ops->owner);
atm_dev_put(vcc->dev);
}
+
+ vcc_remove_socket(sk);
}
diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c
index 4dbb5af34a5..a150198b05a 100644
--- a/net/atm/ioctl.c
+++ b/net/atm/ioctl.c
@@ -21,6 +21,7 @@
#include "resources.h"
#include "signaling.h" /* for WAITING and sigd_attach */
+#include "common.h"
static DECLARE_MUTEX(ioctl_mutex);
@@ -104,17 +105,35 @@ int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
if (!error)
sock->state = SS_CONNECTED;
goto done;
- default:
+ case ATM_SETBACKEND:
+ case ATM_NEWBACKENDIF:
+ {
+ atm_backend_t backend;
+ error = get_user(backend, (atm_backend_t __user *) argp);
+ if (error)
+ goto done;
+ switch (backend) {
+ case ATM_BACKEND_PPP:
+ request_module("pppoatm");
+ break;
+ case ATM_BACKEND_BR2684:
+ request_module("br2684");
+ break;
+ }
+ }
+ break;
+ case ATMMPC_CTRL:
+ case ATMMPC_DATA:
+ request_module("mpoa");
+ break;
+ case ATMARPD_CTRL:
+ request_module("clip");
+ break;
+ case ATMLEC_CTRL:
+ request_module("lec");
break;
}
- if (cmd == ATMMPC_CTRL || cmd == ATMMPC_DATA)
- request_module("mpoa");
- if (cmd == ATMARPD_CTRL)
- request_module("clip");
- if (cmd == ATMLEC_CTRL)
- request_module("lec");
-
error = -ENOIOCTLCMD;
down(&ioctl_mutex);
diff --git a/net/atm/ipcommon.c b/net/atm/ipcommon.c
index 181a3002d8a..4b1faca5013 100644
--- a/net/atm/ipcommon.c
+++ b/net/atm/ipcommon.c
@@ -34,7 +34,6 @@
void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to)
{
- struct sk_buff *skb;
unsigned long flags;
struct sk_buff *skb_from = (struct sk_buff *) from;
struct sk_buff *skb_to = (struct sk_buff *) to;
@@ -47,8 +46,6 @@ void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to)
prev->next = skb_to;
to->prev->next = from->next;
to->prev = from->prev;
- for (skb = from->next; skb != skb_to; skb = skb->next)
- skb->list = to;
to->qlen += from->qlen;
spin_unlock(&to->lock);
from->prev = skb_from;
diff --git a/net/atm/lec.c b/net/atm/lec.c
index a0752487026..ad840b9afba 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -686,9 +686,19 @@ static unsigned char lec_ctrl_magic[] = {
0x01,
0x01 };
+#define LEC_DATA_DIRECT_8023 2
+#define LEC_DATA_DIRECT_8025 3
+
+static int lec_is_data_direct(struct atm_vcc *vcc)
+{
+ return ((vcc->sap.blli[0].l3.tr9577.snap[4] == LEC_DATA_DIRECT_8023) ||
+ (vcc->sap.blli[0].l3.tr9577.snap[4] == LEC_DATA_DIRECT_8025));
+}
+
static void
lec_push(struct atm_vcc *vcc, struct sk_buff *skb)
{
+ unsigned long flags;
struct net_device *dev = (struct net_device *)vcc->proto_data;
struct lec_priv *priv = (struct lec_priv *)dev->priv;
@@ -728,7 +738,8 @@ lec_push(struct atm_vcc *vcc, struct sk_buff *skb)
skb_queue_tail(&sk->sk_receive_queue, skb);
sk->sk_data_ready(sk, skb->len);
} else { /* Data frame, queue to protocol handlers */
- unsigned char *dst;
+ struct lec_arp_table *entry;
+ unsigned char *src, *dst;
atm_return(vcc,skb->truesize);
if (*(uint16_t *)skb->data == htons(priv->lecid) ||
@@ -741,10 +752,30 @@ lec_push(struct atm_vcc *vcc, struct sk_buff *skb)
return;
}
#ifdef CONFIG_TR
- if (priv->is_trdev) dst = ((struct lecdatahdr_8025 *)skb->data)->h_dest;
+ if (priv->is_trdev)
+ dst = ((struct lecdatahdr_8025 *) skb->data)->h_dest;
else
#endif
- dst = ((struct lecdatahdr_8023 *)skb->data)->h_dest;
+ dst = ((struct lecdatahdr_8023 *) skb->data)->h_dest;
+
+ /* If this is a Data Direct VCC, and the VCC does not match
+ * the LE_ARP cache entry, delete the LE_ARP cache entry.
+ */
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ if (lec_is_data_direct(vcc)) {
+#ifdef CONFIG_TR
+ if (priv->is_trdev)
+ src = ((struct lecdatahdr_8025 *) skb->data)->h_source;
+ else
+#endif
+ src = ((struct lecdatahdr_8023 *) skb->data)->h_source;
+ entry = lec_arp_find(priv, src);
+ if (entry && entry->vcc != vcc) {
+ lec_arp_remove(priv, entry);
+ kfree(entry);
+ }
+ }
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
if (!(dst[0]&0x01) && /* Never filter Multi/Broadcast */
!priv->is_proxy && /* Proxy wants all the packets */
@@ -1990,6 +2021,12 @@ lec_arp_resolve(struct lec_priv *priv, unsigned char *mac_to_find,
found = entry->vcc;
goto out;
}
+ /* If the LE_ARP cache entry is still pending, reset count to 0
+ * so another LE_ARP request can be made for this frame.
+ */
+ if (entry->status == ESI_ARP_PENDING) {
+ entry->no_tries = 0;
+ }
/* Data direct VC not yet set up, check to see if the unknown
frame count is greater than the limit. If the limit has
not been reached, allow the caller to send packet to
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 17a81ebe7e6..526d9531411 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -105,7 +105,7 @@ extern void mpc_proc_clean(void);
struct mpoa_client *mpcs = NULL; /* FIXME */
static struct atm_mpoa_qos *qos_head = NULL;
-static struct timer_list mpc_timer = TIMER_INITIALIZER(NULL, 0, 0);
+static DEFINE_TIMER(mpc_timer, NULL, 0, 0);
static struct mpoa_client *find_mpc_by_itfnum(int itf)
diff --git a/net/atm/resources.c b/net/atm/resources.c
index a57a9268bd2..415d2615d47 100644
--- a/net/atm/resources.c
+++ b/net/atm/resources.c
@@ -40,6 +40,7 @@ static struct atm_dev *__alloc_atm_dev(const char *type)
dev->link_rate = ATM_OC3_PCR;
spin_lock_init(&dev->lock);
INIT_LIST_HEAD(&dev->local);
+ INIT_LIST_HEAD(&dev->lecs);
return dev;
}
@@ -320,10 +321,12 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg)
error = -EPERM;
goto done;
}
- atm_reset_addr(dev);
+ atm_reset_addr(dev, ATM_ADDR_LOCAL);
break;
case ATM_ADDADDR:
case ATM_DELADDR:
+ case ATM_ADDLECSADDR:
+ case ATM_DELLECSADDR:
if (!capable(CAP_NET_ADMIN)) {
error = -EPERM;
goto done;
@@ -335,14 +338,21 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg)
error = -EFAULT;
goto done;
}
- if (cmd == ATM_ADDADDR)
- error = atm_add_addr(dev, &addr);
+ if (cmd == ATM_ADDADDR || cmd == ATM_ADDLECSADDR)
+ error = atm_add_addr(dev, &addr,
+ (cmd == ATM_ADDADDR ?
+ ATM_ADDR_LOCAL : ATM_ADDR_LECS));
else
- error = atm_del_addr(dev, &addr);
+ error = atm_del_addr(dev, &addr,
+ (cmd == ATM_DELADDR ?
+ ATM_ADDR_LOCAL : ATM_ADDR_LECS));
goto done;
}
case ATM_GETADDR:
- error = atm_get_addr(dev, buf, len);
+ case ATM_GETLECSADDR:
+ error = atm_get_addr(dev, buf, len,
+ (cmd == ATM_GETADDR ?
+ ATM_ADDR_LOCAL : ATM_ADDR_LECS));
if (error < 0)
goto done;
size = error;
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
index f7c449ac180..e7211a7f382 100644
--- a/net/atm/signaling.c
+++ b/net/atm/signaling.c
@@ -217,8 +217,9 @@ void sigd_enq(struct atm_vcc *vcc,enum atmsvc_msg_type type,
static void purge_vcc(struct atm_vcc *vcc)
{
if (sk_atm(vcc)->sk_family == PF_ATMSVC &&
- !test_bit(ATM_VF_META,&vcc->flags)) {
- set_bit(ATM_VF_RELEASED,&vcc->flags);
+ !test_bit(ATM_VF_META, &vcc->flags)) {
+ set_bit(ATM_VF_RELEASED, &vcc->flags);
+ clear_bit(ATM_VF_REGIS, &vcc->flags);
vcc_release_async(vcc, -EUNATCH);
}
}
@@ -243,8 +244,7 @@ static void sigd_close(struct atm_vcc *vcc)
sk_for_each(s, node, head) {
struct atm_vcc *vcc = atm_sk(s);
- if (vcc->dev)
- purge_vcc(vcc);
+ purge_vcc(vcc);
}
}
read_unlock(&vcc_sklist_lock);
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 08e46052a3e..d7b266136bf 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -302,6 +302,7 @@ static int svc_listen(struct socket *sock,int backlog)
error = -EINVAL;
goto out;
}
+ vcc_insert_socket(sk);
set_bit(ATM_VF_WAITING, &vcc->flags);
prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
sigd_enq(vcc,as_listen,NULL,NULL,&vcc->local);
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 707097deac3..8e37e71e34f 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -45,7 +45,7 @@
#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/spinlock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/ip.h>
#include <net/arp.h>
@@ -875,12 +875,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
sk->sk_sndbuf = osk->sk_sndbuf;
sk->sk_state = TCP_ESTABLISHED;
sk->sk_sleep = osk->sk_sleep;
-
- if (sock_flag(osk, SOCK_DBG))
- sock_set_flag(sk, SOCK_DBG);
-
- if (sock_flag(osk, SOCK_ZAPPED))
- sock_set_flag(sk, SOCK_ZAPPED);
+ sock_copy_flags(sk, osk);
oax25 = ax25_sk(osk);
@@ -1007,7 +1002,8 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct sock *sk = sock->sk;
struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr;
ax25_dev *ax25_dev = NULL;
- ax25_address *call;
+ ax25_uid_assoc *user;
+ ax25_address call;
ax25_cb *ax25;
int err = 0;
@@ -1026,9 +1022,15 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (addr->fsa_ax25.sax25_family != AF_AX25)
return -EINVAL;
- call = ax25_findbyuid(current->euid);
- if (call == NULL && ax25_uid_policy && !capable(CAP_NET_ADMIN)) {
- return -EACCES;
+ user = ax25_findbyuid(current->euid);
+ if (user) {
+ call = user->call;
+ ax25_uid_put(user);
+ } else {
+ if (ax25_uid_policy && !capable(CAP_NET_ADMIN))
+ return -EACCES;
+
+ call = addr->fsa_ax25.sax25_call;
}
lock_sock(sk);
@@ -1039,10 +1041,7 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out;
}
- if (call == NULL)
- ax25->source_addr = addr->fsa_ax25.sax25_call;
- else
- ax25->source_addr = *call;
+ ax25->source_addr = call;
/*
* User already set interface with SO_BINDTODEVICE
@@ -1696,16 +1695,12 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
/* These two are safe on a single CPU system as only user tasks fiddle here */
if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
amount = skb->len;
- res = put_user(amount, (int __user *)argp);
+ res = put_user(amount, (int __user *) argp);
break;
}
case SIOCGSTAMP:
- if (sk != NULL) {
- res = sock_get_timestamp(sk, argp);
- break;
- }
- res = -EINVAL;
+ res = sock_get_timestamp(sk, argp);
break;
case SIOCAX25ADDUID: /* Add a uid to the uid/call map table */
@@ -1875,6 +1870,7 @@ static void ax25_info_stop(struct seq_file *seq, void *v)
static int ax25_info_show(struct seq_file *seq, void *v)
{
ax25_cb *ax25 = v;
+ char buf[11];
int k;
@@ -1886,13 +1882,13 @@ static int ax25_info_show(struct seq_file *seq, void *v)
seq_printf(seq, "%8.8lx %s %s%s ",
(long) ax25,
ax25->ax25_dev == NULL? "???" : ax25->ax25_dev->dev->name,
- ax2asc(&ax25->source_addr),
+ ax2asc(buf, &ax25->source_addr),
ax25->iamdigi? "*":"");
- seq_printf(seq, "%s", ax2asc(&ax25->dest_addr));
+ seq_printf(seq, "%s", ax2asc(buf, &ax25->dest_addr));
for (k=0; (ax25->digipeat != NULL) && (k < ax25->digipeat->ndigi); k++) {
seq_printf(seq, ",%s%s",
- ax2asc(&ax25->digipeat->calls[k]),
+ ax2asc(buf, &ax25->digipeat->calls[k]),
ax25->digipeat->repeated[k]? "*":"");
}
@@ -1951,24 +1947,24 @@ static struct net_proto_family ax25_family_ops = {
};
static struct proto_ops ax25_proto_ops = {
- .family = PF_AX25,
- .owner = THIS_MODULE,
- .release = ax25_release,
- .bind = ax25_bind,
- .connect = ax25_connect,
- .socketpair = sock_no_socketpair,
- .accept = ax25_accept,
- .getname = ax25_getname,
- .poll = datagram_poll,
- .ioctl = ax25_ioctl,
- .listen = ax25_listen,
- .shutdown = ax25_shutdown,
- .setsockopt = ax25_setsockopt,
- .getsockopt = ax25_getsockopt,
- .sendmsg = ax25_sendmsg,
- .recvmsg = ax25_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
+ .family = PF_AX25,
+ .owner = THIS_MODULE,
+ .release = ax25_release,
+ .bind = ax25_bind,
+ .connect = ax25_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = ax25_accept,
+ .getname = ax25_getname,
+ .poll = datagram_poll,
+ .ioctl = ax25_ioctl,
+ .listen = ax25_listen,
+ .shutdown = ax25_shutdown,
+ .setsockopt = ax25_setsockopt,
+ .getsockopt = ax25_getsockopt,
+ .sendmsg = ax25_sendmsg,
+ .recvmsg = ax25_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
};
/*
@@ -1984,7 +1980,7 @@ static struct notifier_block ax25_dev_notifier = {
.notifier_call =ax25_device_event,
};
-EXPORT_SYMBOL(ax25_encapsulate);
+EXPORT_SYMBOL(ax25_hard_header);
EXPORT_SYMBOL(ax25_rebuild_header);
EXPORT_SYMBOL(ax25_findbyuid);
EXPORT_SYMBOL(ax25_find_cb);
diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c
index f4fa6dfb846..0164a155b8c 100644
--- a/net/ax25/ax25_addr.c
+++ b/net/ax25/ax25_addr.c
@@ -36,9 +36,8 @@ ax25_address null_ax25_address = {{0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00}};
/*
* ax25 -> ascii conversion
*/
-char *ax2asc(ax25_address *a)
+char *ax2asc(char *buf, ax25_address *a)
{
- static char buf[11];
char c, *s;
int n;
@@ -68,37 +67,34 @@ char *ax2asc(ax25_address *a)
/*
* ascii -> ax25 conversion
*/
-ax25_address *asc2ax(char *callsign)
+void asc2ax(ax25_address *addr, char *callsign)
{
- static ax25_address addr;
char *s;
int n;
for (s = callsign, n = 0; n < 6; n++) {
if (*s != '\0' && *s != '-')
- addr.ax25_call[n] = *s++;
+ addr->ax25_call[n] = *s++;
else
- addr.ax25_call[n] = ' ';
- addr.ax25_call[n] <<= 1;
- addr.ax25_call[n] &= 0xFE;
+ addr->ax25_call[n] = ' ';
+ addr->ax25_call[n] <<= 1;
+ addr->ax25_call[n] &= 0xFE;
}
if (*s++ == '\0') {
- addr.ax25_call[6] = 0x00;
- return &addr;
+ addr->ax25_call[6] = 0x00;
+ return;
}
- addr.ax25_call[6] = *s++ - '0';
+ addr->ax25_call[6] = *s++ - '0';
if (*s != '\0') {
- addr.ax25_call[6] *= 10;
- addr.ax25_call[6] += *s++ - '0';
+ addr->ax25_call[6] *= 10;
+ addr->ax25_call[6] += *s++ - '0';
}
- addr.ax25_call[6] <<= 1;
- addr.ax25_call[6] &= 0x1E;
-
- return &addr;
+ addr->ax25_call[6] <<= 1;
+ addr->ax25_call[6] &= 0x1E;
}
/*
diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c
index 8adc0022cf5..edcaa897027 100644
--- a/net/ax25/ax25_ds_in.c
+++ b/net/ax25/ax25_ds_in.c
@@ -22,8 +22,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/ip.h> /* For ip_rcv */
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
index 3a8b67316fc..061083efc1d 100644
--- a/net/ax25/ax25_ds_timer.c
+++ b/net/ax25/ax25_ds_timer.c
@@ -18,7 +18,7 @@
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
index 3dc808fde33..73cfc3411c4 100644
--- a/net/ax25/ax25_in.c
+++ b/net/ax25/ax25_in.c
@@ -9,7 +9,6 @@
* Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
* Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de)
*/
-#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
@@ -26,9 +25,7 @@
#include <linux/skbuff.h>
#include <linux/netfilter.h>
#include <net/sock.h>
-#include <net/ip.h> /* For ip_rcv */
-#include <net/tcp.h>
-#include <net/arp.h> /* For arp_rcv */
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
@@ -114,7 +111,6 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb)
pid = *skb->data;
-#ifdef CONFIG_INET
if (pid == AX25_P_IP) {
/* working around a TCP bug to keep additional listeners
* happy. TCP re-uses the buffer and destroys the original
@@ -127,15 +123,14 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb)
}
skb_pull(skb, 1); /* Remove PID */
- skb->h.raw = skb->data;
+ skb->mac.raw = skb->nh.raw;
skb->nh.raw = skb->data;
skb->dev = ax25->ax25_dev->dev;
skb->pkt_type = PACKET_HOST;
skb->protocol = htons(ETH_P_IP);
- ip_rcv(skb, skb->dev, NULL); /* Wrong ptype */
+ netif_rx(skb);
return 1;
}
-#endif
if (pid == AX25_P_SEGMENT) {
skb_pull(skb, 1); /* Remove PID */
return ax25_rx_fragment(ax25, skb);
@@ -250,7 +245,6 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
/* Now we are pointing at the pid byte */
switch (skb->data[1]) {
-#ifdef CONFIG_INET
case AX25_P_IP:
skb_pull(skb,2); /* drop PID/CTRL */
skb->h.raw = skb->data;
@@ -258,7 +252,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
skb->dev = dev;
skb->pkt_type = PACKET_HOST;
skb->protocol = htons(ETH_P_IP);
- ip_rcv(skb, dev, ptype); /* Note ptype here is the wrong one, fix me later */
+ netif_rx(skb);
break;
case AX25_P_ARP:
@@ -268,9 +262,8 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
skb->dev = dev;
skb->pkt_type = PACKET_HOST;
skb->protocol = htons(ETH_P_ARP);
- arp_rcv(skb, dev, ptype); /* Note ptype here is wrong... */
+ netif_rx(skb);
break;
-#endif
case AX25_P_TEXT:
/* Now find a suitable dgram socket */
sk = ax25_get_socket(&dest, &src, SOCK_DGRAM);
@@ -454,7 +447,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
* Receive an AX.25 frame via a SLIP interface.
*/
int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *ptype)
+ struct packet_type *ptype, struct net_device *orig_dev)
{
skb->sk = NULL; /* Initially we don't know who it's for */
skb->destructor = NULL; /* Who initializes this, dammit?! */
diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
index bba0173e2d6..d643dac3ecc 100644
--- a/net/ax25/ax25_ip.c
+++ b/net/ax25/ax25_ip.c
@@ -47,7 +47,7 @@
#ifdef CONFIG_INET
-int ax25_encapsulate(struct sk_buff *skb, struct net_device *dev, unsigned short type, void *daddr, void *saddr, unsigned len)
+int ax25_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, void *daddr, void *saddr, unsigned len)
{
unsigned char *buff;
@@ -88,7 +88,7 @@ int ax25_encapsulate(struct sk_buff *skb, struct net_device *dev, unsigned short
*buff++ = AX25_P_ARP;
break;
default:
- printk(KERN_ERR "AX.25: ax25_encapsulate - wrong protocol type 0x%2.2x\n", type);
+ printk(KERN_ERR "AX.25: ax25_hard_header - wrong protocol type 0x%2.2x\n", type);
*buff++ = 0;
break;
}
@@ -209,7 +209,7 @@ put:
#else /* INET */
-int ax25_encapsulate(struct sk_buff *skb, struct net_device *dev, unsigned short type, void *daddr, void *saddr, unsigned len)
+int ax25_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, void *daddr, void *saddr, unsigned len)
{
return -AX25_HEADER_LEN;
}
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index 44b99b1ff9f..26b77d97222 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -298,6 +298,8 @@ static void ax25_rt_seq_stop(struct seq_file *seq, void *v)
static int ax25_rt_seq_show(struct seq_file *seq, void *v)
{
+ char buf[11];
+
if (v == SEQ_START_TOKEN)
seq_puts(seq, "callsign dev mode digipeaters\n");
else {
@@ -308,7 +310,7 @@ static int ax25_rt_seq_show(struct seq_file *seq, void *v)
if (ax25cmp(&ax25_rt->callsign, &null_ax25_address) == 0)
callsign = "default";
else
- callsign = ax2asc(&ax25_rt->callsign);
+ callsign = ax2asc(buf, &ax25_rt->callsign);
seq_printf(seq, "%-9s %-4s",
callsign,
@@ -328,7 +330,8 @@ static int ax25_rt_seq_show(struct seq_file *seq, void *v)
if (ax25_rt->digipeat != NULL)
for (i = 0; i < ax25_rt->digipeat->ndigi; i++)
- seq_printf(seq, " %s", ax2asc(&ax25_rt->digipeat->calls[i]));
+ seq_printf(seq, " %s",
+ ax2asc(buf, &ax25_rt->digipeat->calls[i]));
seq_puts(seq, "\n");
}
@@ -422,8 +425,8 @@ static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat)
*/
int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
{
+ ax25_uid_assoc *user;
ax25_route *ax25_rt;
- ax25_address *call;
int err;
if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL)
@@ -434,16 +437,18 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
goto put;
}
- if ((call = ax25_findbyuid(current->euid)) == NULL) {
+ user = ax25_findbyuid(current->euid);
+ if (user) {
+ ax25->source_addr = user->call;
+ ax25_uid_put(user);
+ } else {
if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
err = -EPERM;
goto put;
}
- call = (ax25_address *)ax25->ax25_dev->dev->dev_addr;
+ ax25->source_addr = *(ax25_address *)ax25->ax25_dev->dev->dev_addr;
}
- ax25->source_addr = *call;
-
if (ax25_rt->digipeat != NULL) {
if ((ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) {
err = -ENOMEM;
diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c
index 7131873322c..f6ed283e9de 100644
--- a/net/ax25/ax25_std_in.c
+++ b/net/ax25/ax25_std_in.c
@@ -29,8 +29,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/ip.h> /* For ip_rcv */
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c
index 066897bc074..a29c480a4dc 100644
--- a/net/ax25/ax25_std_timer.c
+++ b/net/ax25/ax25_std_timer.c
@@ -24,7 +24,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
index 99694b57f6f..c41dbe5fade 100644
--- a/net/ax25/ax25_subr.c
+++ b/net/ax25/ax25_subr.c
@@ -24,7 +24,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
@@ -76,7 +76,7 @@ void ax25_requeue_frames(ax25_cb *ax25)
if (skb_prev == NULL)
skb_queue_head(&ax25->write_queue, skb);
else
- skb_append(skb_prev, skb);
+ skb_append(skb_prev, skb, &ax25->write_queue);
skb_prev = skb;
}
}
diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c
index cea6b7d1972..d53cc861586 100644
--- a/net/ax25/ax25_uid.c
+++ b/net/ax25/ax25_uid.c
@@ -28,6 +28,7 @@
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
+#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -41,38 +42,41 @@
* Callsign/UID mapper. This is in kernel space for security on multi-amateur machines.
*/
-static ax25_uid_assoc *ax25_uid_list;
+HLIST_HEAD(ax25_uid_list);
static DEFINE_RWLOCK(ax25_uid_lock);
int ax25_uid_policy = 0;
-ax25_address *ax25_findbyuid(uid_t uid)
+ax25_uid_assoc *ax25_findbyuid(uid_t uid)
{
- ax25_uid_assoc *ax25_uid;
- ax25_address *res = NULL;
+ ax25_uid_assoc *ax25_uid, *res = NULL;
+ struct hlist_node *node;
read_lock(&ax25_uid_lock);
- for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) {
+ ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
if (ax25_uid->uid == uid) {
- res = &ax25_uid->call;
+ ax25_uid_hold(ax25_uid);
+ res = ax25_uid;
break;
}
}
read_unlock(&ax25_uid_lock);
- return NULL;
+ return res;
}
int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
{
- ax25_uid_assoc *s, *ax25_uid;
+ ax25_uid_assoc *ax25_uid;
+ struct hlist_node *node;
+ ax25_uid_assoc *user;
unsigned long res;
switch (cmd) {
case SIOCAX25GETUID:
res = -ENOENT;
read_lock(&ax25_uid_lock);
- for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) {
+ ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) {
res = ax25_uid->uid;
break;
@@ -85,19 +89,22 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
case SIOCAX25ADDUID:
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- if (ax25_findbyuid(sax->sax25_uid))
+ user = ax25_findbyuid(sax->sax25_uid);
+ if (user) {
+ ax25_uid_put(user);
return -EEXIST;
+ }
if (sax->sax25_uid == 0)
return -EINVAL;
if ((ax25_uid = kmalloc(sizeof(*ax25_uid), GFP_KERNEL)) == NULL)
return -ENOMEM;
+ atomic_set(&ax25_uid->refcount, 1);
ax25_uid->uid = sax->sax25_uid;
ax25_uid->call = sax->sax25_call;
write_lock(&ax25_uid_lock);
- ax25_uid->next = ax25_uid_list;
- ax25_uid_list = ax25_uid;
+ hlist_add_head(&ax25_uid->uid_node, &ax25_uid_list);
write_unlock(&ax25_uid_lock);
return 0;
@@ -106,34 +113,21 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
if (!capable(CAP_NET_ADMIN))
return -EPERM;
+ ax25_uid = NULL;
write_lock(&ax25_uid_lock);
- for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) {
- if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) {
+ ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
+ if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0)
break;
- }
}
if (ax25_uid == NULL) {
write_unlock(&ax25_uid_lock);
return -ENOENT;
}
- if ((s = ax25_uid_list) == ax25_uid) {
- ax25_uid_list = s->next;
- write_unlock(&ax25_uid_lock);
- kfree(ax25_uid);
- return 0;
- }
- while (s != NULL && s->next != NULL) {
- if (s->next == ax25_uid) {
- s->next = ax25_uid->next;
- write_unlock(&ax25_uid_lock);
- kfree(ax25_uid);
- return 0;
- }
- s = s->next;
- }
+ hlist_del_init(&ax25_uid->uid_node);
+ ax25_uid_put(ax25_uid);
write_unlock(&ax25_uid_lock);
- return -ENOENT;
+ return 0;
default:
return -EINVAL;
@@ -147,13 +141,11 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos)
{
struct ax25_uid_assoc *pt;
- int i = 1;
+ struct hlist_node *node;
+ int i = 0;
read_lock(&ax25_uid_lock);
- if (*pos == 0)
- return SEQ_START_TOKEN;
-
- for (pt = ax25_uid_list; pt != NULL; pt = pt->next) {
+ ax25_uid_for_each(pt, node, &ax25_uid_list) {
if (i == *pos)
return pt;
++i;
@@ -164,8 +156,9 @@ static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos)
static void *ax25_uid_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
++*pos;
- return (v == SEQ_START_TOKEN) ? ax25_uid_list :
- ((struct ax25_uid_assoc *) v)->next;
+
+ return hlist_entry(((ax25_uid_assoc *)v)->uid_node.next,
+ ax25_uid_assoc, uid_node);
}
static void ax25_uid_seq_stop(struct seq_file *seq, void *v)
@@ -175,13 +168,14 @@ static void ax25_uid_seq_stop(struct seq_file *seq, void *v)
static int ax25_uid_seq_show(struct seq_file *seq, void *v)
{
+ char buf[11];
+
if (v == SEQ_START_TOKEN)
seq_printf(seq, "Policy: %d\n", ax25_uid_policy);
else {
struct ax25_uid_assoc *pt = v;
-
- seq_printf(seq, "%6d %s\n", pt->uid, ax2asc(&pt->call));
+ seq_printf(seq, "%6d %s\n", pt->uid, ax2asc(buf, &pt->call));
}
return 0;
}
@@ -213,16 +207,13 @@ struct file_operations ax25_uid_fops = {
*/
void __exit ax25_uid_free(void)
{
- ax25_uid_assoc *s, *ax25_uid;
+ ax25_uid_assoc *ax25_uid;
+ struct hlist_node *node;
write_lock(&ax25_uid_lock);
- ax25_uid = ax25_uid_list;
- while (ax25_uid != NULL) {
- s = ax25_uid;
- ax25_uid = ax25_uid->next;
-
- kfree(s);
+ ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
+ hlist_del_init(&ax25_uid->uid_node);
+ ax25_uid_put(ax25_uid);
}
- ax25_uid_list = NULL;
write_unlock(&ax25_uid_lock);
}
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 12b43345b54..03532062a46 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -308,12 +308,6 @@ static struct net_proto_family bt_sock_family_ops = {
.create = bt_sock_create,
};
-extern int hci_sock_init(void);
-extern int hci_sock_cleanup(void);
-
-extern int bt_sysfs_init(void);
-extern int bt_sysfs_cleanup(void);
-
static int __init bt_init(void)
{
BT_INFO("Core ver %s", VERSION);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index fb5524365bc..cf0df1c8c93 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -87,7 +87,7 @@ int hci_unregister_notifier(struct notifier_block *nb)
return notifier_chain_unregister(&hci_notifier, nb);
}
-void hci_notify(struct hci_dev *hdev, int event)
+static void hci_notify(struct hci_dev *hdev, int event)
{
notifier_call_chain(&hci_notifier, event, hdev);
}
@@ -191,7 +191,7 @@ static void hci_init_req(struct hci_dev *hdev, unsigned long opt)
/* Special commands */
while ((skb = skb_dequeue(&hdev->driver_init))) {
- skb->pkt_type = HCI_COMMAND_PKT;
+ bt_cb(skb)->pkt_type = HCI_COMMAND_PKT;
skb->dev = (void *) hdev;
skb_queue_tail(&hdev->cmd_q, skb);
hci_sched_cmd(hdev);
@@ -299,7 +299,6 @@ struct hci_dev *hci_dev_get(int index)
read_unlock(&hci_dev_list_lock);
return hdev;
}
-EXPORT_SYMBOL(hci_dev_get);
/* ---- Inquiry support ---- */
static void inquiry_cache_flush(struct hci_dev *hdev)
@@ -996,11 +995,11 @@ static int hci_send_frame(struct sk_buff *skb)
return -ENODEV;
}
- BT_DBG("%s type %d len %d", hdev->name, skb->pkt_type, skb->len);
+ BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len);
if (atomic_read(&hdev->promisc)) {
/* Time stamp */
- do_gettimeofday(&skb->stamp);
+ __net_timestamp(skb);
hci_send_to_sock(hdev, skb);
}
@@ -1035,14 +1034,13 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 ogf, __u16 ocf, __u32 plen, void *p
BT_DBG("skb len %d", skb->len);
- skb->pkt_type = HCI_COMMAND_PKT;
+ bt_cb(skb)->pkt_type = HCI_COMMAND_PKT;
skb->dev = (void *) hdev;
skb_queue_tail(&hdev->cmd_q, skb);
hci_sched_cmd(hdev);
return 0;
}
-EXPORT_SYMBOL(hci_send_cmd);
/* Get data from the previously sent command */
void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 ogf, __u16 ocf)
@@ -1083,7 +1081,7 @@ int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags)
BT_DBG("%s conn %p flags 0x%x", hdev->name, conn, flags);
skb->dev = (void *) hdev;
- skb->pkt_type = HCI_ACLDATA_PKT;
+ bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
hci_add_acl_hdr(skb, conn->handle, flags | ACL_START);
if (!(list = skb_shinfo(skb)->frag_list)) {
@@ -1105,7 +1103,7 @@ int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags)
skb = list; list = list->next;
skb->dev = (void *) hdev;
- skb->pkt_type = HCI_ACLDATA_PKT;
+ bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
hci_add_acl_hdr(skb, conn->handle, flags | ACL_CONT);
BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
@@ -1141,7 +1139,7 @@ int hci_send_sco(struct hci_conn *conn, struct sk_buff *skb)
memcpy(skb->h.raw, &hdr, HCI_SCO_HDR_SIZE);
skb->dev = (void *) hdev;
- skb->pkt_type = HCI_SCODATA_PKT;
+ bt_cb(skb)->pkt_type = HCI_SCODATA_PKT;
skb_queue_tail(&conn->data_q, skb);
hci_sched_tx(hdev);
return 0;
@@ -1349,7 +1347,7 @@ static inline void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
kfree_skb(skb);
}
-void hci_rx_task(unsigned long arg)
+static void hci_rx_task(unsigned long arg)
{
struct hci_dev *hdev = (struct hci_dev *) arg;
struct sk_buff *skb;
@@ -1371,7 +1369,7 @@ void hci_rx_task(unsigned long arg)
if (test_bit(HCI_INIT, &hdev->flags)) {
/* Don't process data packets in this states. */
- switch (skb->pkt_type) {
+ switch (bt_cb(skb)->pkt_type) {
case HCI_ACLDATA_PKT:
case HCI_SCODATA_PKT:
kfree_skb(skb);
@@ -1380,7 +1378,7 @@ void hci_rx_task(unsigned long arg)
}
/* Process frame */
- switch (skb->pkt_type) {
+ switch (bt_cb(skb)->pkt_type) {
case HCI_EVENT_PKT:
hci_event_packet(hdev, skb);
break;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index c4b592b4ef1..b61b4e8e36f 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -484,14 +484,18 @@ static inline void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff
/* Inquiry Result */
static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
{
+ struct inquiry_data data;
struct inquiry_info *info = (struct inquiry_info *) (skb->data + 1);
int num_rsp = *((__u8 *) skb->data);
BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+ if (!num_rsp)
+ return;
+
hci_dev_lock(hdev);
+
for (; num_rsp; num_rsp--) {
- struct inquiry_data data;
bacpy(&data.bdaddr, &info->bdaddr);
data.pscan_rep_mode = info->pscan_rep_mode;
data.pscan_period_mode = info->pscan_period_mode;
@@ -502,30 +506,84 @@ static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *
info++;
hci_inquiry_cache_update(hdev, &data);
}
+
hci_dev_unlock(hdev);
}
/* Inquiry Result With RSSI */
static inline void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct sk_buff *skb)
{
- struct inquiry_info_with_rssi *info = (struct inquiry_info_with_rssi *) (skb->data + 1);
+ struct inquiry_data data;
+ int num_rsp = *((__u8 *) skb->data);
+
+ BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+
+ if (!num_rsp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if ((skb->len - 1) / num_rsp != sizeof(struct inquiry_info_with_rssi)) {
+ struct inquiry_info_with_rssi_and_pscan_mode *info =
+ (struct inquiry_info_with_rssi_and_pscan_mode *) (skb->data + 1);
+
+ for (; num_rsp; num_rsp--) {
+ bacpy(&data.bdaddr, &info->bdaddr);
+ data.pscan_rep_mode = info->pscan_rep_mode;
+ data.pscan_period_mode = info->pscan_period_mode;
+ data.pscan_mode = info->pscan_mode;
+ memcpy(data.dev_class, info->dev_class, 3);
+ data.clock_offset = info->clock_offset;
+ data.rssi = info->rssi;
+ info++;
+ hci_inquiry_cache_update(hdev, &data);
+ }
+ } else {
+ struct inquiry_info_with_rssi *info =
+ (struct inquiry_info_with_rssi *) (skb->data + 1);
+
+ for (; num_rsp; num_rsp--) {
+ bacpy(&data.bdaddr, &info->bdaddr);
+ data.pscan_rep_mode = info->pscan_rep_mode;
+ data.pscan_period_mode = info->pscan_period_mode;
+ data.pscan_mode = 0x00;
+ memcpy(data.dev_class, info->dev_class, 3);
+ data.clock_offset = info->clock_offset;
+ data.rssi = info->rssi;
+ info++;
+ hci_inquiry_cache_update(hdev, &data);
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+/* Extended Inquiry Result */
+static inline void hci_extended_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct inquiry_data data;
+ struct extended_inquiry_info *info = (struct extended_inquiry_info *) (skb->data + 1);
int num_rsp = *((__u8 *) skb->data);
BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+ if (!num_rsp)
+ return;
+
hci_dev_lock(hdev);
+
for (; num_rsp; num_rsp--) {
- struct inquiry_data data;
bacpy(&data.bdaddr, &info->bdaddr);
- data.pscan_rep_mode = info->pscan_rep_mode;
- data.pscan_period_mode = info->pscan_period_mode;
- data.pscan_mode = 0x00;
+ data.pscan_rep_mode = info->pscan_rep_mode;
+ data.pscan_period_mode = info->pscan_period_mode;
+ data.pscan_mode = 0x00;
memcpy(data.dev_class, info->dev_class, 3);
- data.clock_offset = info->clock_offset;
- data.rssi = info->rssi;
+ data.clock_offset = info->clock_offset;
+ data.rssi = info->rssi;
info++;
hci_inquiry_cache_update(hdev, &data);
}
+
hci_dev_unlock(hdev);
}
@@ -865,6 +923,24 @@ static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *sk
hci_dev_unlock(hdev);
}
+/* Page Scan Repetition Mode */
+static inline void hci_pscan_rep_mode_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_pscan_rep_mode *ev = (struct hci_ev_pscan_rep_mode *) skb->data;
+ struct inquiry_entry *ie;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ if ((ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr))) {
+ ie->data.pscan_rep_mode = ev->pscan_rep_mode;
+ ie->timestamp = jiffies;
+ }
+
+ hci_dev_unlock(hdev);
+}
+
void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_event_hdr *hdr = (struct hci_event_hdr *) skb->data;
@@ -893,6 +969,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
hci_inquiry_result_with_rssi_evt(hdev, skb);
break;
+ case HCI_EV_EXTENDED_INQUIRY_RESULT:
+ hci_extended_inquiry_result_evt(hdev, skb);
+ break;
+
case HCI_EV_CONN_REQUEST:
hci_conn_request_evt(hdev, skb);
break;
@@ -937,6 +1017,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
hci_clock_offset_evt(hdev, skb);
break;
+ case HCI_EV_PSCAN_REP_MODE:
+ hci_pscan_rep_mode_evt(hdev, skb);
+ break;
+
case HCI_EV_CMD_STATUS:
cs = (struct hci_ev_cmd_status *) skb->data;
skb_pull(skb, sizeof(cs));
@@ -1035,9 +1119,11 @@ void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
ev->type = type;
memcpy(ev->data, data, dlen);
- skb->pkt_type = HCI_EVENT_PKT;
+ bt_cb(skb)->incoming = 1;
+ __net_timestamp(skb);
+
+ bt_cb(skb)->pkt_type = HCI_EVENT_PKT;
skb->dev = (void *) hdev;
hci_send_to_sock(hdev, skb);
kfree_skb(skb);
}
-EXPORT_SYMBOL(hci_si_event);
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index ebdcce5e7ca..799e448750a 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -66,20 +66,20 @@ static struct hci_sec_filter hci_sec_filter = {
/* Packet types */
0x10,
/* Events */
- { 0x1000d9fe, 0x0000300c },
+ { 0x1000d9fe, 0x0000b00c },
/* Commands */
{
{ 0x0 },
/* OGF_LINK_CTL */
- { 0xbe000006, 0x00000001, 0x0000, 0x00 },
+ { 0xbe000006, 0x00000001, 0x000000, 0x00 },
/* OGF_LINK_POLICY */
- { 0x00005200, 0x00000000, 0x0000, 0x00 },
+ { 0x00005200, 0x00000000, 0x000000, 0x00 },
/* OGF_HOST_CTL */
- { 0xaab00200, 0x2b402aaa, 0x0154, 0x00 },
+ { 0xaab00200, 0x2b402aaa, 0x020154, 0x00 },
/* OGF_INFO_PARAM */
- { 0x000002be, 0x00000000, 0x0000, 0x00 },
+ { 0x000002be, 0x00000000, 0x000000, 0x00 },
/* OGF_STATUS_PARAM */
- { 0x000000ea, 0x00000000, 0x0000, 0x00 }
+ { 0x000000ea, 0x00000000, 0x000000, 0x00 }
}
};
@@ -110,11 +110,11 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
/* Apply filter */
flt = &hci_pi(sk)->filter;
- if (!test_bit((skb->pkt_type == HCI_VENDOR_PKT) ?
- 0 : (skb->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask))
+ if (!test_bit((bt_cb(skb)->pkt_type == HCI_VENDOR_PKT) ?
+ 0 : (bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask))
continue;
- if (skb->pkt_type == HCI_EVENT_PKT) {
+ if (bt_cb(skb)->pkt_type == HCI_EVENT_PKT) {
register int evt = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS);
if (!hci_test_bit(evt, &flt->event_mask))
@@ -131,7 +131,7 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
continue;
/* Put type byte before the data */
- memcpy(skb_push(nskb, 1), &nskb->pkt_type, 1);
+ memcpy(skb_push(nskb, 1), &bt_cb(nskb)->pkt_type, 1);
if (sock_queue_rcv_skb(sk, nskb))
kfree_skb(nskb);
@@ -327,11 +327,17 @@ static inline void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_
{
__u32 mask = hci_pi(sk)->cmsg_mask;
- if (mask & HCI_CMSG_DIR)
- put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(int), &bt_cb(skb)->incoming);
+ if (mask & HCI_CMSG_DIR) {
+ int incoming = bt_cb(skb)->incoming;
+ put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming), &incoming);
+ }
+
+ if (mask & HCI_CMSG_TSTAMP) {
+ struct timeval tv;
- if (mask & HCI_CMSG_TSTAMP)
- put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(skb->stamp), &skb->stamp);
+ skb_get_timestamp(skb, &tv);
+ put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(tv), &tv);
+ }
}
static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
@@ -405,11 +411,11 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
goto drop;
}
- skb->pkt_type = *((unsigned char *) skb->data);
+ bt_cb(skb)->pkt_type = *((unsigned char *) skb->data);
skb_pull(skb, 1);
skb->dev = (void *) hdev;
- if (skb->pkt_type == HCI_COMMAND_PKT) {
+ if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) {
u16 opcode = __le16_to_cpu(get_unaligned((u16 *)skb->data));
u16 ogf = hci_opcode_ogf(opcode);
u16 ocf = hci_opcode_ocf(opcode);
diff --git a/net/bluetooth/hidp/Kconfig b/net/bluetooth/hidp/Kconfig
index 4e958f7d941..edfea772fb6 100644
--- a/net/bluetooth/hidp/Kconfig
+++ b/net/bluetooth/hidp/Kconfig
@@ -1,6 +1,6 @@
config BT_HIDP
tristate "HIDP protocol support"
- depends on BT && BT_L2CAP
+ depends on BT && BT_L2CAP && (BROKEN || !S390)
select INPUT
help
HIDP (Human Interface Device Protocol) is a transport layer
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index de8af5f4239..860444a7fc0 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -520,7 +520,7 @@ static int hidp_session(void *arg)
if (session->input) {
input_unregister_device(session->input);
- kfree(session->input);
+ session->input = NULL;
}
up_write(&hidp_session_sem);
@@ -536,6 +536,8 @@ static inline void hidp_setup_input(struct hidp_session *session, struct hidp_co
input->private = session;
+ input->name = "Bluetooth HID Boot Protocol Device";
+
input->id.bustype = BUS_BLUETOOTH;
input->id.vendor = req->vendor;
input->id.product = req->product;
@@ -582,16 +584,15 @@ int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock,
return -ENOTUNIQ;
session = kmalloc(sizeof(struct hidp_session), GFP_KERNEL);
- if (!session)
+ if (!session)
return -ENOMEM;
memset(session, 0, sizeof(struct hidp_session));
- session->input = kmalloc(sizeof(struct input_dev), GFP_KERNEL);
+ session->input = input_allocate_device();
if (!session->input) {
kfree(session);
return -ENOMEM;
}
- memset(session->input, 0, sizeof(struct input_dev));
down_write(&hidp_session_sem);
@@ -651,8 +652,10 @@ unlink:
__hidp_unlink_session(session);
- if (session->input)
+ if (session->input) {
input_unregister_device(session->input);
+ session->input = NULL; /* don't try to free it here */
+ }
failed:
up_write(&hidp_session_sem);
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index 32fccfb5bfa..59b2dd36baa 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -372,7 +372,7 @@ static struct proto l2cap_proto = {
.obj_size = sizeof(struct l2cap_pinfo)
};
-static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, int prio)
+static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, gfp_t prio)
{
struct sock *sk;
diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c
index 9efb0a09361..ee6a6697991 100644
--- a/net/bluetooth/lib.c
+++ b/net/bluetooth/lib.c
@@ -34,31 +34,6 @@
#include <net/bluetooth/bluetooth.h>
-void bt_dump(char *pref, __u8 *buf, int count)
-{
- char *ptr;
- char line[100];
- unsigned int i;
-
- printk(KERN_INFO "%s: dump, len %d\n", pref, count);
-
- ptr = line;
- *ptr = 0;
- for (i = 0; i < count; i++) {
- ptr += sprintf(ptr, " %2.2X", buf[i]);
-
- if (i && !((i + 1) % 20)) {
- printk(KERN_INFO "%s:%s\n", pref, line);
- ptr = line;
- *ptr = 0;
- }
- }
-
- if (line[0])
- printk(KERN_INFO "%s:%s\n", pref, line);
-}
-EXPORT_SYMBOL(bt_dump);
-
void baswap(bdaddr_t *dst, bdaddr_t *src)
{
unsigned char *d = (unsigned char *) dst;
diff --git a/net/bluetooth/rfcomm/Makefile b/net/bluetooth/rfcomm/Makefile
index aecec45ec68..fe07988a370 100644
--- a/net/bluetooth/rfcomm/Makefile
+++ b/net/bluetooth/rfcomm/Makefile
@@ -4,5 +4,5 @@
obj-$(CONFIG_BT_RFCOMM) += rfcomm.o
-rfcomm-y := core.o sock.o crc.o
+rfcomm-y := core.o sock.o
rfcomm-$(CONFIG_BT_RFCOMM_TTY) += tty.o
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index e9e6fda66f1..c3d56ead840 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -21,10 +21,6 @@
SOFTWARE IS DISCLAIMED.
*/
-/*
- RPN support - Dirk Husemann <hud@zurich.ibm.com>
-*/
-
/*
* Bluetooth RFCOMM core.
*
@@ -115,10 +111,10 @@ static void rfcomm_session_del(struct rfcomm_session *s);
#define __get_mcc_len(b) ((b & 0xfe) >> 1)
/* RPN macros */
-#define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x3) << 3))
+#define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x7) << 3))
#define __get_rpn_data_bits(line) ((line) & 0x3)
#define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1)
-#define __get_rpn_parity(line) (((line) >> 3) & 0x3)
+#define __get_rpn_parity(line) (((line) >> 3) & 0x7)
static inline void rfcomm_schedule(uint event)
{
@@ -137,6 +133,49 @@ static inline void rfcomm_session_put(struct rfcomm_session *s)
/* ---- RFCOMM FCS computation ---- */
+/* reversed, 8-bit, poly=0x07 */
+static unsigned char rfcomm_crc_table[256] = {
+ 0x00, 0x91, 0xe3, 0x72, 0x07, 0x96, 0xe4, 0x75,
+ 0x0e, 0x9f, 0xed, 0x7c, 0x09, 0x98, 0xea, 0x7b,
+ 0x1c, 0x8d, 0xff, 0x6e, 0x1b, 0x8a, 0xf8, 0x69,
+ 0x12, 0x83, 0xf1, 0x60, 0x15, 0x84, 0xf6, 0x67,
+
+ 0x38, 0xa9, 0xdb, 0x4a, 0x3f, 0xae, 0xdc, 0x4d,
+ 0x36, 0xa7, 0xd5, 0x44, 0x31, 0xa0, 0xd2, 0x43,
+ 0x24, 0xb5, 0xc7, 0x56, 0x23, 0xb2, 0xc0, 0x51,
+ 0x2a, 0xbb, 0xc9, 0x58, 0x2d, 0xbc, 0xce, 0x5f,
+
+ 0x70, 0xe1, 0x93, 0x02, 0x77, 0xe6, 0x94, 0x05,
+ 0x7e, 0xef, 0x9d, 0x0c, 0x79, 0xe8, 0x9a, 0x0b,
+ 0x6c, 0xfd, 0x8f, 0x1e, 0x6b, 0xfa, 0x88, 0x19,
+ 0x62, 0xf3, 0x81, 0x10, 0x65, 0xf4, 0x86, 0x17,
+
+ 0x48, 0xd9, 0xab, 0x3a, 0x4f, 0xde, 0xac, 0x3d,
+ 0x46, 0xd7, 0xa5, 0x34, 0x41, 0xd0, 0xa2, 0x33,
+ 0x54, 0xc5, 0xb7, 0x26, 0x53, 0xc2, 0xb0, 0x21,
+ 0x5a, 0xcb, 0xb9, 0x28, 0x5d, 0xcc, 0xbe, 0x2f,
+
+ 0xe0, 0x71, 0x03, 0x92, 0xe7, 0x76, 0x04, 0x95,
+ 0xee, 0x7f, 0x0d, 0x9c, 0xe9, 0x78, 0x0a, 0x9b,
+ 0xfc, 0x6d, 0x1f, 0x8e, 0xfb, 0x6a, 0x18, 0x89,
+ 0xf2, 0x63, 0x11, 0x80, 0xf5, 0x64, 0x16, 0x87,
+
+ 0xd8, 0x49, 0x3b, 0xaa, 0xdf, 0x4e, 0x3c, 0xad,
+ 0xd6, 0x47, 0x35, 0xa4, 0xd1, 0x40, 0x32, 0xa3,
+ 0xc4, 0x55, 0x27, 0xb6, 0xc3, 0x52, 0x20, 0xb1,
+ 0xca, 0x5b, 0x29, 0xb8, 0xcd, 0x5c, 0x2e, 0xbf,
+
+ 0x90, 0x01, 0x73, 0xe2, 0x97, 0x06, 0x74, 0xe5,
+ 0x9e, 0x0f, 0x7d, 0xec, 0x99, 0x08, 0x7a, 0xeb,
+ 0x8c, 0x1d, 0x6f, 0xfe, 0x8b, 0x1a, 0x68, 0xf9,
+ 0x82, 0x13, 0x61, 0xf0, 0x85, 0x14, 0x66, 0xf7,
+
+ 0xa8, 0x39, 0x4b, 0xda, 0xaf, 0x3e, 0x4c, 0xdd,
+ 0xa6, 0x37, 0x45, 0xd4, 0xa1, 0x30, 0x42, 0xd3,
+ 0xb4, 0x25, 0x57, 0xc6, 0xb3, 0x22, 0x50, 0xc1,
+ 0xba, 0x2b, 0x59, 0xc8, 0xbd, 0x2c, 0x5e, 0xcf
+};
+
/* CRC on 2 bytes */
#define __crc(data) (rfcomm_crc_table[rfcomm_crc_table[0xff ^ data[0]] ^ data[1]])
@@ -233,7 +272,7 @@ static void rfcomm_dlc_clear_state(struct rfcomm_dlc *d)
d->rx_credits = RFCOMM_DEFAULT_CREDITS;
}
-struct rfcomm_dlc *rfcomm_dlc_alloc(int prio)
+struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio)
{
struct rfcomm_dlc *d = kmalloc(sizeof(*d), prio);
if (!d)
@@ -389,8 +428,6 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
rfcomm_dlc_unlock(d);
skb_queue_purge(&d->tx_queue);
- rfcomm_session_put(s);
-
rfcomm_dlc_unlink(d);
}
@@ -600,8 +637,6 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst
goto failed;
}
- rfcomm_session_hold(s);
-
s->initiator = 1;
bacpy(&addr.l2_bdaddr, dst);
@@ -784,10 +819,10 @@ static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d
return rfcomm_send_frame(s, buf, ptr - buf);
}
-static int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
- u8 bit_rate, u8 data_bits, u8 stop_bits,
- u8 parity, u8 flow_ctrl_settings,
- u8 xon_char, u8 xoff_char, u16 param_mask)
+int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
+ u8 bit_rate, u8 data_bits, u8 stop_bits,
+ u8 parity, u8 flow_ctrl_settings,
+ u8 xon_char, u8 xoff_char, u16 param_mask)
{
struct rfcomm_hdr *hdr;
struct rfcomm_mcc *mcc;
@@ -795,9 +830,9 @@ static int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
u8 buf[16], *ptr = buf;
BT_DBG("%p cr %d dlci %d bit_r 0x%x data_b 0x%x stop_b 0x%x parity 0x%x"
- "flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x",
- s, cr, dlci, bit_rate, data_bits, stop_bits, parity,
- flow_ctrl_settings, xon_char, xoff_char, param_mask);
+ " flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x",
+ s, cr, dlci, bit_rate, data_bits, stop_bits, parity,
+ flow_ctrl_settings, xon_char, xoff_char, param_mask);
hdr = (void *) ptr; ptr += sizeof(*hdr);
hdr->addr = __addr(s->initiator, 0);
@@ -1269,16 +1304,16 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
u8 xon_char = 0;
u8 xoff_char = 0;
u16 rpn_mask = RFCOMM_RPN_PM_ALL;
-
- BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x",
- dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl,
- rpn->xon_char, rpn->xoff_char, rpn->param_mask);
-
- if (!cr)
+
+ BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x",
+ dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl,
+ rpn->xon_char, rpn->xoff_char, rpn->param_mask);
+
+ if (!cr)
return 0;
-
+
if (len == 1) {
- /* request: return default setting */
+ /* This is a request, return default settings */
bit_rate = RFCOMM_RPN_BR_115200;
data_bits = RFCOMM_RPN_DATA_8;
stop_bits = RFCOMM_RPN_STOP_1;
@@ -1286,11 +1321,12 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
flow_ctrl = RFCOMM_RPN_FLOW_NONE;
xon_char = RFCOMM_RPN_XON_CHAR;
xoff_char = RFCOMM_RPN_XOFF_CHAR;
-
goto rpn_out;
}
- /* check for sane values: ignore/accept bit_rate, 8 bits, 1 stop bit, no parity,
- no flow control lines, normal XON/XOFF chars */
+
+ /* Check for sane values, ignore/accept bit_rate, 8 bits, 1 stop bit,
+ * no parity, no flow control lines, normal XON/XOFF chars */
+
if (rpn->param_mask & RFCOMM_RPN_PM_BITRATE) {
bit_rate = rpn->bit_rate;
if (bit_rate != RFCOMM_RPN_BR_115200) {
@@ -1299,6 +1335,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
rpn_mask ^= RFCOMM_RPN_PM_BITRATE;
}
}
+
if (rpn->param_mask & RFCOMM_RPN_PM_DATA) {
data_bits = __get_rpn_data_bits(rpn->line_settings);
if (data_bits != RFCOMM_RPN_DATA_8) {
@@ -1307,6 +1344,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
rpn_mask ^= RFCOMM_RPN_PM_DATA;
}
}
+
if (rpn->param_mask & RFCOMM_RPN_PM_STOP) {
stop_bits = __get_rpn_stop_bits(rpn->line_settings);
if (stop_bits != RFCOMM_RPN_STOP_1) {
@@ -1315,6 +1353,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
rpn_mask ^= RFCOMM_RPN_PM_STOP;
}
}
+
if (rpn->param_mask & RFCOMM_RPN_PM_PARITY) {
parity = __get_rpn_parity(rpn->line_settings);
if (parity != RFCOMM_RPN_PARITY_NONE) {
@@ -1323,6 +1362,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
rpn_mask ^= RFCOMM_RPN_PM_PARITY;
}
}
+
if (rpn->param_mask & RFCOMM_RPN_PM_FLOW) {
flow_ctrl = rpn->flow_ctrl;
if (flow_ctrl != RFCOMM_RPN_FLOW_NONE) {
@@ -1331,6 +1371,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
rpn_mask ^= RFCOMM_RPN_PM_FLOW;
}
}
+
if (rpn->param_mask & RFCOMM_RPN_PM_XON) {
xon_char = rpn->xon_char;
if (xon_char != RFCOMM_RPN_XON_CHAR) {
@@ -1339,6 +1380,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
rpn_mask ^= RFCOMM_RPN_PM_XON;
}
}
+
if (rpn->param_mask & RFCOMM_RPN_PM_XOFF) {
xoff_char = rpn->xoff_char;
if (xoff_char != RFCOMM_RPN_XOFF_CHAR) {
@@ -1349,9 +1391,8 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
}
rpn_out:
- rfcomm_send_rpn(s, 0, dlci,
- bit_rate, data_bits, stop_bits, parity, flow_ctrl,
- xon_char, xoff_char, rpn_mask);
+ rfcomm_send_rpn(s, 0, dlci, bit_rate, data_bits, stop_bits,
+ parity, flow_ctrl, xon_char, xoff_char, rpn_mask);
return 0;
}
@@ -1362,14 +1403,13 @@ static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb
u8 dlci = __get_dlci(rls->dlci);
BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status);
-
+
if (!cr)
return 0;
- /* FIXME: We should probably do something with this
- information here. But for now it's sufficient just
- to reply -- Bluetooth 1.1 says it's mandatory to
- recognise and respond to RLS */
+ /* We should probably do something with this information here. But
+ * for now it's sufficient just to reply -- Bluetooth 1.1 says it's
+ * mandatory to recognise and respond to RLS */
rfcomm_send_rls(s, 0, dlci, rls->status);
@@ -1385,7 +1425,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb
BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig);
d = rfcomm_dlc_get(s, dlci);
- if (!d)
+ if (!d)
return 0;
if (cr) {
@@ -1393,7 +1433,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb
set_bit(RFCOMM_TX_THROTTLED, &d->flags);
else
clear_bit(RFCOMM_TX_THROTTLED, &d->flags);
-
+
rfcomm_dlc_lock(d);
if (d->modem_status)
d->modem_status(d, msc->v24_sig);
@@ -1402,7 +1442,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb
rfcomm_send_msc(s, 0, dlci, msc->v24_sig);
d->mscex |= RFCOMM_MSCEX_RX;
- } else
+ } else
d->mscex |= RFCOMM_MSCEX_TX;
return 0;
diff --git a/net/bluetooth/rfcomm/crc.c b/net/bluetooth/rfcomm/crc.c
deleted file mode 100644
index 1011bc4a869..00000000000
--- a/net/bluetooth/rfcomm/crc.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- RFCOMM implementation for Linux Bluetooth stack (BlueZ).
- Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
- Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org>
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 as
- published by the Free Software Foundation;
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
- IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
- SOFTWARE IS DISCLAIMED.
-*/
-
-/*
- * RFCOMM FCS calculation.
- *
- * $Id: crc.c,v 1.2 2002/09/21 09:54:32 holtmann Exp $
- */
-
-/* reversed, 8-bit, poly=0x07 */
-unsigned char rfcomm_crc_table[256] = {
- 0x00, 0x91, 0xe3, 0x72, 0x07, 0x96, 0xe4, 0x75,
- 0x0e, 0x9f, 0xed, 0x7c, 0x09, 0x98, 0xea, 0x7b,
- 0x1c, 0x8d, 0xff, 0x6e, 0x1b, 0x8a, 0xf8, 0x69,
- 0x12, 0x83, 0xf1, 0x60, 0x15, 0x84, 0xf6, 0x67,
-
- 0x38, 0xa9, 0xdb, 0x4a, 0x3f, 0xae, 0xdc, 0x4d,
- 0x36, 0xa7, 0xd5, 0x44, 0x31, 0xa0, 0xd2, 0x43,
- 0x24, 0xb5, 0xc7, 0x56, 0x23, 0xb2, 0xc0, 0x51,
- 0x2a, 0xbb, 0xc9, 0x58, 0x2d, 0xbc, 0xce, 0x5f,
-
- 0x70, 0xe1, 0x93, 0x02, 0x77, 0xe6, 0x94, 0x05,
- 0x7e, 0xef, 0x9d, 0x0c, 0x79, 0xe8, 0x9a, 0x0b,
- 0x6c, 0xfd, 0x8f, 0x1e, 0x6b, 0xfa, 0x88, 0x19,
- 0x62, 0xf3, 0x81, 0x10, 0x65, 0xf4, 0x86, 0x17,
-
- 0x48, 0xd9, 0xab, 0x3a, 0x4f, 0xde, 0xac, 0x3d,
- 0x46, 0xd7, 0xa5, 0x34, 0x41, 0xd0, 0xa2, 0x33,
- 0x54, 0xc5, 0xb7, 0x26, 0x53, 0xc2, 0xb0, 0x21,
- 0x5a, 0xcb, 0xb9, 0x28, 0x5d, 0xcc, 0xbe, 0x2f,
-
- 0xe0, 0x71, 0x03, 0x92, 0xe7, 0x76, 0x04, 0x95,
- 0xee, 0x7f, 0x0d, 0x9c, 0xe9, 0x78, 0x0a, 0x9b,
- 0xfc, 0x6d, 0x1f, 0x8e, 0xfb, 0x6a, 0x18, 0x89,
- 0xf2, 0x63, 0x11, 0x80, 0xf5, 0x64, 0x16, 0x87,
-
- 0xd8, 0x49, 0x3b, 0xaa, 0xdf, 0x4e, 0x3c, 0xad,
- 0xd6, 0x47, 0x35, 0xa4, 0xd1, 0x40, 0x32, 0xa3,
- 0xc4, 0x55, 0x27, 0xb6, 0xc3, 0x52, 0x20, 0xb1,
- 0xca, 0x5b, 0x29, 0xb8, 0xcd, 0x5c, 0x2e, 0xbf,
-
- 0x90, 0x01, 0x73, 0xe2, 0x97, 0x06, 0x74, 0xe5,
- 0x9e, 0x0f, 0x7d, 0xec, 0x99, 0x08, 0x7a, 0xeb,
- 0x8c, 0x1d, 0x6f, 0xfe, 0x8b, 0x1a, 0x68, 0xf9,
- 0x82, 0x13, 0x61, 0xf0, 0x85, 0x14, 0x66, 0xf7,
-
- 0xa8, 0x39, 0x4b, 0xda, 0xaf, 0x3e, 0x4c, 0xdd,
- 0xa6, 0x37, 0x45, 0xd4, 0xa1, 0x30, 0x42, 0xd3,
- 0xb4, 0x25, 0x57, 0xc6, 0xb3, 0x22, 0x50, 0xc1,
- 0xba, 0x2b, 0x59, 0xc8, 0xbd, 0x2c, 0x5e, 0xcf
-};
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 63a123c5c41..a2b30f0aedb 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -284,7 +284,7 @@ static struct proto rfcomm_proto = {
.obj_size = sizeof(struct rfcomm_pinfo)
};
-static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, int prio)
+static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, gfp_t prio)
{
struct rfcomm_dlc *d;
struct sock *sk;
@@ -363,6 +363,11 @@ static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr
goto done;
}
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ goto done;
+ }
+
write_lock_bh(&rfcomm_sk_list.lock);
if (sa->rc_channel && __rfcomm_get_sock_by_addr(sa->rc_channel, &sa->rc_bdaddr)) {
@@ -393,13 +398,17 @@ static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int a
if (addr->sa_family != AF_BLUETOOTH || alen < sizeof(struct sockaddr_rc))
return -EINVAL;
- if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND)
- return -EBADFD;
+ lock_sock(sk);
- if (sk->sk_type != SOCK_STREAM)
- return -EINVAL;
+ if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) {
+ err = -EBADFD;
+ goto done;
+ }
- lock_sock(sk);
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ goto done;
+ }
sk->sk_state = BT_CONNECT;
bacpy(&bt_sk(sk)->dst, &sa->rc_bdaddr);
@@ -410,6 +419,7 @@ static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int a
err = bt_sock_wait_state(sk, BT_CONNECTED,
sock_sndtimeo(sk, flags & O_NONBLOCK));
+done:
release_sock(sk);
return err;
}
@@ -428,6 +438,11 @@ static int rfcomm_sock_listen(struct socket *sock, int backlog)
goto done;
}
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ goto done;
+ }
+
if (!rfcomm_pi(sk)->channel) {
bdaddr_t *src = &bt_sk(sk)->src;
u8 channel;
@@ -472,6 +487,11 @@ static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int f
goto done;
}
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ goto done;
+ }
+
timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
BT_DBG("sk %p timeo %ld", sk, timeo);
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 6304590fd36..158a9c46d86 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -286,7 +286,7 @@ static inline void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *de
skb->destructor = rfcomm_wfree;
}
-static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, int priority)
+static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, gfp_t priority)
{
if (atomic_read(&dev->wmem_alloc) < rfcomm_room(dev->dlc)) {
struct sk_buff *skb = alloc_skb(size, priority);
@@ -528,9 +528,14 @@ static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig)
struct rfcomm_dev *dev = dlc->owner;
if (!dev)
return;
-
+
BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig);
+ if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV)) {
+ if (dev->tty && !C_CLOCAL(dev->tty))
+ tty_hangup(dev->tty);
+ }
+
dev->modem_status =
((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) |
((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) |
@@ -740,20 +745,143 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, struct file *filp, unsigned
return -ENOIOCTLCMD;
}
-#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK))
-
static void rfcomm_tty_set_termios(struct tty_struct *tty, struct termios *old)
{
- BT_DBG("tty %p", tty);
+ struct termios *new = (struct termios *) tty->termios;
+ int old_baud_rate = tty_termios_baud_rate(old);
+ int new_baud_rate = tty_termios_baud_rate(new);
- if ((tty->termios->c_cflag == old->c_cflag) &&
- (RELEVANT_IFLAG(tty->termios->c_iflag) == RELEVANT_IFLAG(old->c_iflag)))
- return;
+ u8 baud, data_bits, stop_bits, parity, x_on, x_off;
+ u16 changes = 0;
+
+ struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+ BT_DBG("tty %p termios %p", tty, old);
+
+ /* Handle turning off CRTSCTS */
+ if ((old->c_cflag & CRTSCTS) && !(new->c_cflag & CRTSCTS))
+ BT_DBG("Turning off CRTSCTS unsupported");
+
+ /* Parity on/off and when on, odd/even */
+ if (((old->c_cflag & PARENB) != (new->c_cflag & PARENB)) ||
+ ((old->c_cflag & PARODD) != (new->c_cflag & PARODD)) ) {
+ changes |= RFCOMM_RPN_PM_PARITY;
+ BT_DBG("Parity change detected.");
+ }
+
+ /* Mark and space parity are not supported! */
+ if (new->c_cflag & PARENB) {
+ if (new->c_cflag & PARODD) {
+ BT_DBG("Parity is ODD");
+ parity = RFCOMM_RPN_PARITY_ODD;
+ } else {
+ BT_DBG("Parity is EVEN");
+ parity = RFCOMM_RPN_PARITY_EVEN;
+ }
+ } else {
+ BT_DBG("Parity is OFF");
+ parity = RFCOMM_RPN_PARITY_NONE;
+ }
+
+ /* Setting the x_on / x_off characters */
+ if (old->c_cc[VSTOP] != new->c_cc[VSTOP]) {
+ BT_DBG("XOFF custom");
+ x_on = new->c_cc[VSTOP];
+ changes |= RFCOMM_RPN_PM_XON;
+ } else {
+ BT_DBG("XOFF default");
+ x_on = RFCOMM_RPN_XON_CHAR;
+ }
+
+ if (old->c_cc[VSTART] != new->c_cc[VSTART]) {
+ BT_DBG("XON custom");
+ x_off = new->c_cc[VSTART];
+ changes |= RFCOMM_RPN_PM_XOFF;
+ } else {
+ BT_DBG("XON default");
+ x_off = RFCOMM_RPN_XOFF_CHAR;
+ }
+
+ /* Handle setting of stop bits */
+ if ((old->c_cflag & CSTOPB) != (new->c_cflag & CSTOPB))
+ changes |= RFCOMM_RPN_PM_STOP;
+
+ /* POSIX does not support 1.5 stop bits and RFCOMM does not
+ * support 2 stop bits. So a request for 2 stop bits gets
+ * translated to 1.5 stop bits */
+ if (new->c_cflag & CSTOPB) {
+ stop_bits = RFCOMM_RPN_STOP_15;
+ } else {
+ stop_bits = RFCOMM_RPN_STOP_1;
+ }
+
+ /* Handle number of data bits [5-8] */
+ if ((old->c_cflag & CSIZE) != (new->c_cflag & CSIZE))
+ changes |= RFCOMM_RPN_PM_DATA;
+
+ switch (new->c_cflag & CSIZE) {
+ case CS5:
+ data_bits = RFCOMM_RPN_DATA_5;
+ break;
+ case CS6:
+ data_bits = RFCOMM_RPN_DATA_6;
+ break;
+ case CS7:
+ data_bits = RFCOMM_RPN_DATA_7;
+ break;
+ case CS8:
+ data_bits = RFCOMM_RPN_DATA_8;
+ break;
+ default:
+ data_bits = RFCOMM_RPN_DATA_8;
+ break;
+ }
+
+ /* Handle baudrate settings */
+ if (old_baud_rate != new_baud_rate)
+ changes |= RFCOMM_RPN_PM_BITRATE;
- /* handle turning off CRTSCTS */
- if ((old->c_cflag & CRTSCTS) && !(tty->termios->c_cflag & CRTSCTS)) {
- BT_DBG("turning off CRTSCTS");
+ switch (new_baud_rate) {
+ case 2400:
+ baud = RFCOMM_RPN_BR_2400;
+ break;
+ case 4800:
+ baud = RFCOMM_RPN_BR_4800;
+ break;
+ case 7200:
+ baud = RFCOMM_RPN_BR_7200;
+ break;
+ case 9600:
+ baud = RFCOMM_RPN_BR_9600;
+ break;
+ case 19200:
+ baud = RFCOMM_RPN_BR_19200;
+ break;
+ case 38400:
+ baud = RFCOMM_RPN_BR_38400;
+ break;
+ case 57600:
+ baud = RFCOMM_RPN_BR_57600;
+ break;
+ case 115200:
+ baud = RFCOMM_RPN_BR_115200;
+ break;
+ case 230400:
+ baud = RFCOMM_RPN_BR_230400;
+ break;
+ default:
+ /* 9600 is standard according to the RFCOMM specification */
+ baud = RFCOMM_RPN_BR_9600;
+ break;
+
}
+
+ if (changes)
+ rfcomm_send_rpn(dev->dlc->session, 1, dev->dlc->dlci, baud,
+ data_bits, stop_bits, parity,
+ RFCOMM_RPN_FLOW_NONE, x_on, x_off, changes);
+
+ return;
}
static void rfcomm_tty_throttle(struct tty_struct *tty)
@@ -761,7 +889,7 @@ static void rfcomm_tty_throttle(struct tty_struct *tty)
struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
BT_DBG("tty %p dev %p", tty, dev);
-
+
rfcomm_dlc_throttle(dev->dlc);
}
@@ -770,7 +898,7 @@ static void rfcomm_tty_unthrottle(struct tty_struct *tty)
struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
BT_DBG("tty %p dev %p", tty, dev);
-
+
rfcomm_dlc_unthrottle(dev->dlc);
}
@@ -841,35 +969,35 @@ static int rfcomm_tty_tiocmget(struct tty_struct *tty, struct file *filp)
static int rfcomm_tty_tiocmset(struct tty_struct *tty, struct file *filp, unsigned int set, unsigned int clear)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
- struct rfcomm_dlc *dlc = dev->dlc;
- u8 v24_sig;
+ struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dlc *dlc = dev->dlc;
+ u8 v24_sig;
BT_DBG("tty %p dev %p set 0x%02x clear 0x%02x", tty, dev, set, clear);
- rfcomm_dlc_get_modem_status(dlc, &v24_sig);
-
- if (set & TIOCM_DSR || set & TIOCM_DTR)
- v24_sig |= RFCOMM_V24_RTC;
- if (set & TIOCM_RTS || set & TIOCM_CTS)
- v24_sig |= RFCOMM_V24_RTR;
- if (set & TIOCM_RI)
- v24_sig |= RFCOMM_V24_IC;
- if (set & TIOCM_CD)
- v24_sig |= RFCOMM_V24_DV;
-
- if (clear & TIOCM_DSR || clear & TIOCM_DTR)
- v24_sig &= ~RFCOMM_V24_RTC;
- if (clear & TIOCM_RTS || clear & TIOCM_CTS)
- v24_sig &= ~RFCOMM_V24_RTR;
- if (clear & TIOCM_RI)
- v24_sig &= ~RFCOMM_V24_IC;
- if (clear & TIOCM_CD)
- v24_sig &= ~RFCOMM_V24_DV;
-
- rfcomm_dlc_set_modem_status(dlc, v24_sig);
-
- return 0;
+ rfcomm_dlc_get_modem_status(dlc, &v24_sig);
+
+ if (set & TIOCM_DSR || set & TIOCM_DTR)
+ v24_sig |= RFCOMM_V24_RTC;
+ if (set & TIOCM_RTS || set & TIOCM_CTS)
+ v24_sig |= RFCOMM_V24_RTR;
+ if (set & TIOCM_RI)
+ v24_sig |= RFCOMM_V24_IC;
+ if (set & TIOCM_CD)
+ v24_sig |= RFCOMM_V24_DV;
+
+ if (clear & TIOCM_DSR || clear & TIOCM_DTR)
+ v24_sig &= ~RFCOMM_V24_RTC;
+ if (clear & TIOCM_RTS || clear & TIOCM_CTS)
+ v24_sig &= ~RFCOMM_V24_RTR;
+ if (clear & TIOCM_RI)
+ v24_sig &= ~RFCOMM_V24_IC;
+ if (clear & TIOCM_CD)
+ v24_sig &= ~RFCOMM_V24_DV;
+
+ rfcomm_dlc_set_modem_status(dlc, v24_sig);
+
+ return 0;
}
/* ---- TTY structure ---- */
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 746c11fc017..997e42df115 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -418,7 +418,7 @@ static struct proto sco_proto = {
.obj_size = sizeof(struct sco_pinfo)
};
-static struct sock *sco_sock_alloc(struct socket *sock, int proto, int prio)
+static struct sock *sco_sock_alloc(struct socket *sock, int proto, gfp_t prio)
{
struct sock *sk;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index e6c2200b7ca..1f08a59b51e 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -23,7 +23,7 @@
#include <asm/atomic.h>
#include "br_private.h"
-static kmem_cache_t *br_fdb_cache;
+static kmem_cache_t *br_fdb_cache __read_mostly;
static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr);
@@ -86,8 +86,8 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
struct net_bridge_port *op;
list_for_each_entry(op, &br->port_list, list) {
if (op != p &&
- !memcmp(op->dev->dev_addr,
- f->addr.addr, ETH_ALEN)) {
+ !compare_ether_addr(op->dev->dev_addr,
+ f->addr.addr)) {
f->dst = op;
goto insert;
}
@@ -151,8 +151,8 @@ void br_fdb_delete_by_port(struct net_bridge *br, struct net_bridge_port *p)
struct net_bridge_port *op;
list_for_each_entry(op, &br->port_list, list) {
if (op != p &&
- !memcmp(op->dev->dev_addr,
- f->addr.addr, ETH_ALEN)) {
+ !compare_ether_addr(op->dev->dev_addr,
+ f->addr.addr)) {
f->dst = op;
goto skip_delete;
}
@@ -174,7 +174,7 @@ struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
struct net_bridge_fdb_entry *fdb;
hlist_for_each_entry_rcu(fdb, h, &br->hash[br_mac_hash(addr)], hlist) {
- if (!memcmp(fdb->addr.addr, addr, ETH_ALEN)) {
+ if (!compare_ether_addr(fdb->addr.addr, addr)) {
if (unlikely(has_expired(br, fdb)))
break;
return fdb;
@@ -264,7 +264,7 @@ static inline struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
struct net_bridge_fdb_entry *fdb;
hlist_for_each_entry_rcu(fdb, h, head, hlist) {
- if (!memcmp(fdb->addr.addr, addr, ETH_ALEN))
+ if (!compare_ether_addr(fdb->addr.addr, addr))
return fdb;
}
return NULL;
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 069253f830c..2d24fb400e0 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -31,7 +31,8 @@ static inline int should_deliver(const struct net_bridge_port *p,
int br_dev_queue_push_xmit(struct sk_buff *skb)
{
- if (skb->len > skb->dev->mtu)
+ /* drop mtu oversized packets except tso */
+ if (skb->len > skb->dev->mtu && !skb_shinfo(skb)->tso_size)
kfree_skb(skb);
else {
#ifdef CONFIG_BRIDGE_NETFILTER
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 91bb895375f..defcf6a8607 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -79,7 +79,6 @@ static void destroy_nbp(struct net_bridge_port *p)
{
struct net_device *dev = p->dev;
- dev->br_port = NULL;
p->br = NULL;
p->dev = NULL;
dev_put(dev);
@@ -100,6 +99,7 @@ static void del_nbp(struct net_bridge_port *p)
struct net_bridge *br = p->br;
struct net_device *dev = p->dev;
+ dev->br_port = NULL;
dev_set_promiscuity(dev, -1);
spin_lock_bh(&br->lock);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 9a45e6279c5..b88220a64cd 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -128,7 +128,7 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb)
dest = eth_hdr(skb)->h_dest;
}
- if (!memcmp(p->br->dev->dev_addr, dest, ETH_ALEN))
+ if (!compare_ether_addr(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 2d52fee63a8..d8e36b77512 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -214,9 +214,11 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb)
.tos = RT_TOS(iph->tos)} }, .proto = 0};
if (!ip_route_output_key(&rt, &fl)) {
- /* Bridged-and-DNAT'ed traffic doesn't
- * require ip_forwarding. */
- if (((struct dst_entry *)rt)->dev == dev) {
+ /* - Bridged-and-DNAT'ed traffic doesn't
+ * require ip_forwarding.
+ * - Deal with redirected traffic. */
+ if (((struct dst_entry *)rt)->dev == dev ||
+ rt->rt_type == RTN_LOCAL) {
skb->dst = (struct dst_entry *)rt;
goto bridged_dnat;
}
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 0da11ff05fa..ac09b6a2352 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -15,6 +15,7 @@
#include <linux/kernel.h>
#include <linux/smp_lock.h>
+#include <linux/etherdevice.h>
#include "br_private.h"
#include "br_private_stp.h"
@@ -133,10 +134,10 @@ static void br_stp_change_bridge_id(struct net_bridge *br,
memcpy(br->dev->dev_addr, addr, ETH_ALEN);
list_for_each_entry(p, &br->port_list, list) {
- if (!memcmp(p->designated_bridge.addr, oldaddr, ETH_ALEN))
+ if (!compare_ether_addr(p->designated_bridge.addr, oldaddr))
memcpy(p->designated_bridge.addr, addr, ETH_ALEN);
- if (!memcmp(p->designated_root.addr, oldaddr, ETH_ALEN))
+ if (!compare_ether_addr(p->designated_root.addr, oldaddr))
memcpy(p->designated_root.addr, addr, ETH_ALEN);
}
@@ -157,12 +158,12 @@ void br_stp_recalculate_bridge_id(struct net_bridge *br)
list_for_each_entry(p, &br->port_list, list) {
if (addr == br_mac_zero ||
- memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0)
+ compare_ether_addr(p->dev->dev_addr, addr) < 0)
addr = p->dev->dev_addr;
}
- if (memcmp(br->bridge_id.addr, addr, ETH_ALEN))
+ if (compare_ether_addr(br->bridge_id.addr, addr))
br_stp_change_bridge_id(br, addr);
}
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
index 02c632b4d32..c93d35ab95c 100644
--- a/net/bridge/netfilter/ebt_mark.c
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -23,10 +23,9 @@ static int ebt_target_mark(struct sk_buff **pskb, unsigned int hooknr,
{
struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data;
- if ((*pskb)->nfmark != info->mark) {
+ if ((*pskb)->nfmark != info->mark)
(*pskb)->nfmark = info->mark;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
+
return info->target;
}
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 01af4fcef26..aae26ae2e61 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -78,8 +78,8 @@ static void ulog_send(unsigned int nlgroup)
if (ub->qlen > 1)
ub->lastnlh->nlmsg_type = NLMSG_DONE;
- NETLINK_CB(ub->skb).dst_groups = 1 << nlgroup;
- netlink_broadcast(ebtulognl, ub->skb, 0, 1 << nlgroup, GFP_ATOMIC);
+ NETLINK_CB(ub->skb).dst_group = nlgroup + 1;
+ netlink_broadcast(ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC);
ub->qlen = 0;
ub->skb = NULL;
@@ -162,7 +162,7 @@ static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
pm->version = EBT_ULOG_VERSION;
do_gettimeofday(&pm->stamp);
if (ub->qlen == 1)
- ub->skb->stamp = pm->stamp;
+ skb_set_timestamp(ub->skb, &pm->stamp);
pm->data_len = copy_len;
pm->mark = skb->nfmark;
pm->hook = hooknr;
@@ -258,7 +258,8 @@ static int __init init(void)
spin_lock_init(&ulog_buffers[i].lock);
}
- ebtulognl = netlink_kernel_create(NETLINK_NFLOG, NULL);
+ ebtulognl = netlink_kernel_create(NETLINK_NFLOG, EBT_ULOG_MAXNLGROUPS,
+ NULL, THIS_MODULE);
if (!ebtulognl)
ret = -ENOMEM;
else if ((ret = ebt_register_watcher(&ulog)))
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index c4540144f0f..f8ffbf6e233 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -26,6 +26,7 @@
#include <linux/spinlock.h>
#include <asm/uaccess.h>
#include <linux/smp.h>
+#include <linux/cpumask.h>
#include <net/sock.h>
/* needed for logical [in,out]-dev filtering */
#include "../br_private.h"
@@ -823,10 +824,11 @@ static int translate_table(struct ebt_replace *repl,
/* this will get free'd in do_replace()/ebt_register_table()
if an error occurs */
newinfo->chainstack = (struct ebt_chainstack **)
- vmalloc(num_possible_cpus() * sizeof(struct ebt_chainstack));
+ vmalloc((highest_possible_processor_id()+1)
+ * sizeof(struct ebt_chainstack));
if (!newinfo->chainstack)
return -ENOMEM;
- for (i = 0; i < num_possible_cpus(); i++) {
+ for_each_cpu(i) {
newinfo->chainstack[i] =
vmalloc(udc_cnt * sizeof(struct ebt_chainstack));
if (!newinfo->chainstack[i]) {
@@ -895,9 +897,12 @@ static void get_counters(struct ebt_counter *oldcounters,
/* counters of cpu 0 */
memcpy(counters, oldcounters,
- sizeof(struct ebt_counter) * nentries);
+ sizeof(struct ebt_counter) * nentries);
+
/* add other counters to those of cpu 0 */
- for (cpu = 1; cpu < num_possible_cpus(); cpu++) {
+ for_each_cpu(cpu) {
+ if (cpu == 0)
+ continue;
counter_base = COUNTER_BASE(oldcounters, nentries, cpu);
for (i = 0; i < nentries; i++) {
counters[i].pcnt += counter_base[i].pcnt;
@@ -929,7 +934,8 @@ static int do_replace(void __user *user, unsigned int len)
BUGPRINT("Entries_size never zero\n");
return -EINVAL;
}
- countersize = COUNTER_OFFSET(tmp.nentries) * num_possible_cpus();
+ countersize = COUNTER_OFFSET(tmp.nentries) *
+ (highest_possible_processor_id()+1);
newinfo = (struct ebt_table_info *)
vmalloc(sizeof(struct ebt_table_info) + countersize);
if (!newinfo)
@@ -1022,7 +1028,7 @@ static int do_replace(void __user *user, unsigned int len)
vfree(table->entries);
if (table->chainstack) {
- for (i = 0; i < num_possible_cpus(); i++)
+ for_each_cpu(i)
vfree(table->chainstack[i]);
vfree(table->chainstack);
}
@@ -1040,7 +1046,7 @@ free_counterstmp:
vfree(counterstmp);
/* can be initialized in translate_table() */
if (newinfo->chainstack) {
- for (i = 0; i < num_possible_cpus(); i++)
+ for_each_cpu(i)
vfree(newinfo->chainstack[i]);
vfree(newinfo->chainstack);
}
@@ -1132,7 +1138,8 @@ int ebt_register_table(struct ebt_table *table)
return -EINVAL;
}
- countersize = COUNTER_OFFSET(table->table->nentries) * num_possible_cpus();
+ countersize = COUNTER_OFFSET(table->table->nentries) *
+ (highest_possible_processor_id()+1);
newinfo = (struct ebt_table_info *)
vmalloc(sizeof(struct ebt_table_info) + countersize);
ret = -ENOMEM;
@@ -1186,7 +1193,7 @@ free_unlock:
up(&ebt_mutex);
free_chainstack:
if (newinfo->chainstack) {
- for (i = 0; i < num_possible_cpus(); i++)
+ for_each_cpu(i)
vfree(newinfo->chainstack[i]);
vfree(newinfo->chainstack);
}
@@ -1209,7 +1216,7 @@ void ebt_unregister_table(struct ebt_table *table)
up(&ebt_mutex);
vfree(table->private->entries);
if (table->private->chainstack) {
- for (i = 0; i < num_possible_cpus(); i++)
+ for_each_cpu(i)
vfree(table->private->chainstack[i]);
vfree(table->private->chainstack);
}
diff --git a/net/compat.c b/net/compat.c
index be5d936dc42..e593dace2fd 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -91,20 +91,11 @@ int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov,
} else
kern_msg->msg_name = NULL;
- if(kern_msg->msg_iovlen > UIO_FASTIOV) {
- kern_iov = kmalloc(kern_msg->msg_iovlen * sizeof(struct iovec),
- GFP_KERNEL);
- if(!kern_iov)
- return -ENOMEM;
- }
-
tot_len = iov_from_user_compat_to_kern(kern_iov,
(struct compat_iovec __user *)kern_msg->msg_iov,
kern_msg->msg_iovlen);
if(tot_len >= 0)
kern_msg->msg_iov = kern_iov;
- else if(kern_msg->msg_iovlen > UIO_FASTIOV)
- kfree(kern_iov);
return tot_len;
}
@@ -144,13 +135,14 @@ static inline struct compat_cmsghdr __user *cmsg_compat_nxthdr(struct msghdr *ms
* thus placement) of cmsg headers and length are different for
* 32-bit apps. -DaveM
*/
-int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg,
+int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
unsigned char *stackbuf, int stackbuf_size)
{
struct compat_cmsghdr __user *ucmsg;
struct cmsghdr *kcmsg, *kcmsg_base;
compat_size_t ucmlen;
__kernel_size_t kcmlen, tmp;
+ int err = -EFAULT;
kcmlen = 0;
kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf;
@@ -165,6 +157,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg,
tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
CMSG_ALIGN(sizeof(struct cmsghdr)));
+ tmp = CMSG_ALIGN(tmp);
kcmlen += tmp;
ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
}
@@ -176,30 +169,34 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg,
* until we have successfully copied over all of the data
* from the user.
*/
- if(kcmlen > stackbuf_size)
- kcmsg_base = kcmsg = kmalloc(kcmlen, GFP_KERNEL);
- if(kcmsg == NULL)
+ if (kcmlen > stackbuf_size)
+ kcmsg_base = kcmsg = sock_kmalloc(sk, kcmlen, GFP_KERNEL);
+ if (kcmsg == NULL)
return -ENOBUFS;
/* Now copy them over neatly. */
memset(kcmsg, 0, kcmlen);
ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
while(ucmsg != NULL) {
- __get_user(ucmlen, &ucmsg->cmsg_len);
+ if (__get_user(ucmlen, &ucmsg->cmsg_len))
+ goto Efault;
+ if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
+ goto Einval;
tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
CMSG_ALIGN(sizeof(struct cmsghdr)));
+ if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp))
+ goto Einval;
kcmsg->cmsg_len = tmp;
- __get_user(kcmsg->cmsg_level, &ucmsg->cmsg_level);
- __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type);
-
- /* Copy over the data. */
- if(copy_from_user(CMSG_DATA(kcmsg),
- CMSG_COMPAT_DATA(ucmsg),
- (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg)))))
- goto out_free_efault;
+ tmp = CMSG_ALIGN(tmp);
+ if (__get_user(kcmsg->cmsg_level, &ucmsg->cmsg_level) ||
+ __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) ||
+ copy_from_user(CMSG_DATA(kcmsg),
+ CMSG_COMPAT_DATA(ucmsg),
+ (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg)))))
+ goto Efault;
/* Advance. */
- kcmsg = (struct cmsghdr *)((char *)kcmsg + CMSG_ALIGN(tmp));
+ kcmsg = (struct cmsghdr *)((char *)kcmsg + tmp);
ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
}
@@ -208,10 +205,12 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg,
kmsg->msg_controllen = kcmlen;
return 0;
-out_free_efault:
- if(kcmsg_base != (struct cmsghdr *)stackbuf)
- kfree(kcmsg_base);
- return -EFAULT;
+Einval:
+ err = -EINVAL;
+Efault:
+ if (kcmsg_base != (struct cmsghdr *)stackbuf)
+ sock_kfree_s(sk, kcmsg_base, kcmlen);
+ return err;
}
int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *data)
diff --git a/net/core/Makefile b/net/core/Makefile
index f5f5e58943e..630da0f0579 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -12,7 +12,6 @@ obj-y += dev.o ethtool.o dev_mcast.o dst.o \
obj-$(CONFIG_XFRM) += flow.o
obj-$(CONFIG_SYSFS) += net-sysfs.o
-obj-$(CONFIG_NETFILTER) += netfilter.o
obj-$(CONFIG_NET_DIVERT) += dv.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NET_RADIO) += wireless.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fcee054b6f7..d219435d086 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -43,7 +43,6 @@
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
-#include <linux/tcp.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
@@ -51,9 +50,10 @@
#include <net/protocol.h>
#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/checksum.h>
+#include <net/checksum.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
/*
* Is a socket 'connection oriented' ?
@@ -211,74 +211,49 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
struct iovec *to, int len)
{
- int start = skb_headlen(skb);
- int i, copy = start - offset;
+ int i, err, fraglen, end = 0;
+ struct sk_buff *next = skb_shinfo(skb)->frag_list;
- /* Copy header. */
- if (copy > 0) {
- if (copy > len)
- copy = len;
- if (memcpy_toiovec(to, skb->data + offset, copy))
- goto fault;
- if ((len -= copy) == 0)
- return 0;
- offset += copy;
- }
+ if (!len)
+ return 0;
- /* Copy paged appendix. Hmm... why does this look so complicated? */
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- int end;
+next_skb:
+ fraglen = skb_headlen(skb);
+ i = -1;
- BUG_TRAP(start <= offset + len);
+ while (1) {
+ int start = end;
- end = start + skb_shinfo(skb)->frags[i].size;
- if ((copy = end - offset) > 0) {
- int err;
- u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
+ if ((end += fraglen) > offset) {
+ int copy = end - offset, o = offset - start;
if (copy > len)
copy = len;
- vaddr = kmap(page);
- err = memcpy_toiovec(to, vaddr + frag->page_offset +
- offset - start, copy);
- kunmap(page);
+ if (i == -1)
+ err = memcpy_toiovec(to, skb->data + o, copy);
+ else {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page *page = frag->page;
+ void *p = kmap(page) + frag->page_offset + o;
+ err = memcpy_toiovec(to, p, copy);
+ kunmap(page);
+ }
if (err)
goto fault;
if (!(len -= copy))
return 0;
offset += copy;
}
- start = end;
+ if (++i >= skb_shinfo(skb)->nr_frags)
+ break;
+ fraglen = skb_shinfo(skb)->frags[i].size;
}
-
- if (skb_shinfo(skb)->frag_list) {
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
-
- for (; list; list = list->next) {
- int end;
-
- BUG_TRAP(start <= offset + len);
-
- end = start + list->len;
- if ((copy = end - offset) > 0) {
- if (copy > len)
- copy = len;
- if (skb_copy_datagram_iovec(list,
- offset - start,
- to, copy))
- goto fault;
- if ((len -= copy) == 0)
- return 0;
- offset += copy;
- }
- start = end;
- }
+ if (next) {
+ skb = next;
+ BUG_ON(skb_shinfo(skb)->frag_list);
+ next = skb->next;
+ goto next_skb;
}
- if (!len)
- return 0;
-
fault:
return -EFAULT;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 52a3bf7ae17..8d154159527 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -267,10 +267,6 @@ void dev_add_pack(struct packet_type *pt)
spin_unlock_bh(&ptype_lock);
}
-extern void linkwatch_run_queue(void);
-
-
-
/**
* __dev_remove_pack - remove packet handler
* @pt: packet type declaration
@@ -578,6 +574,8 @@ struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
return dev;
}
+EXPORT_SYMBOL(dev_getbyhwaddr);
+
struct net_device *dev_getfirstbyhwtype(unsigned short type)
{
struct net_device *dev;
@@ -1009,13 +1007,22 @@ void net_disable_timestamp(void)
atomic_dec(&netstamp_needed);
}
-static inline void net_timestamp(struct timeval *stamp)
+/*
+ * Stamp @skb with the current time: read the wall clock with
+ * do_gettimeofday() and store it through skb_set_timestamp().
+ * Unconditional; net_timestamp() is the caller that first checks
+ * netstamp_needed.
+ */
+void __net_timestamp(struct sk_buff *skb)
+{
+ struct timeval tv;
+
+ do_gettimeofday(&tv);
+ skb_set_timestamp(skb, &tv);
+}
+EXPORT_SYMBOL(__net_timestamp);
+
+static inline void net_timestamp(struct sk_buff *skb)
{
if (atomic_read(&netstamp_needed))
- do_gettimeofday(stamp);
+ __net_timestamp(skb);
else {
- stamp->tv_sec = 0;
- stamp->tv_usec = 0;
+ skb->tstamp.off_sec = 0;
+ skb->tstamp.off_usec = 0;
}
}
@@ -1027,7 +1034,8 @@ static inline void net_timestamp(struct timeval *stamp)
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
struct packet_type *ptype;
- net_timestamp(&skb->stamp);
+
+ net_timestamp(skb);
rcu_read_lock();
list_for_each_entry_rcu(ptype, &ptype_all, list) {
@@ -1058,7 +1066,7 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
skb2->h.raw = skb2->nh.raw;
skb2->pkt_type = PACKET_OUTGOING;
- ptype->func(skb2, skb->dev, ptype);
+ ptype->func(skb2, skb->dev, ptype, skb->dev);
}
}
rcu_read_unlock();
@@ -1123,10 +1131,8 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
#define illegal_highdma(dev, skb) (0)
#endif
-extern void skb_release_data(struct sk_buff *);
-
/* Keep head the same: replace data */
-int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask)
+int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask)
{
unsigned int size;
u8 *data;
@@ -1253,6 +1259,8 @@ int dev_queue_xmit(struct sk_buff *skb)
if (skb_checksum_help(skb, 0))
goto out_kfree_skb;
+ spin_lock_prefetch(&dev->queue_lock);
+
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
@@ -1379,8 +1387,8 @@ int netif_rx(struct sk_buff *skb)
if (netpoll_rx(skb))
return NET_RX_DROP;
- if (!skb->stamp.tv_sec)
- net_timestamp(&skb->stamp);
+ if (!skb->tstamp.off_sec)
+ net_timestamp(skb);
/*
* The code is rearranged so that the path is the most
@@ -1425,14 +1433,14 @@ int netif_rx_ni(struct sk_buff *skb)
EXPORT_SYMBOL(netif_rx_ni);
-static __inline__ void skb_bond(struct sk_buff *skb)
+static inline struct net_device *skb_bond(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
- if (dev->master) {
- skb->real_dev = skb->dev;
+ if (dev->master)
skb->dev = dev->master;
- }
+
+ return dev;
}
static void net_tx_action(struct softirq_action *h)
@@ -1482,10 +1490,11 @@ static void net_tx_action(struct softirq_action *h)
}
static __inline__ int deliver_skb(struct sk_buff *skb,
- struct packet_type *pt_prev)
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
{
atomic_inc(&skb->users);
- return pt_prev->func(skb, skb->dev, pt_prev);
+ return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
@@ -1496,7 +1505,8 @@ struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);
static __inline__ int handle_bridge(struct sk_buff **pskb,
- struct packet_type **pt_prev, int *ret)
+ struct packet_type **pt_prev, int *ret,
+ struct net_device *orig_dev)
{
struct net_bridge_port *port;
@@ -1505,14 +1515,14 @@ static __inline__ int handle_bridge(struct sk_buff **pskb,
return 0;
if (*pt_prev) {
- *ret = deliver_skb(*pskb, *pt_prev);
+ *ret = deliver_skb(*pskb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
return br_handle_frame_hook(port, pskb);
}
#else
-#define handle_bridge(skb, pt_prev, ret) (0)
+#define handle_bridge(skb, pt_prev, ret, orig_dev) (0)
#endif
#ifdef CONFIG_NET_CLS_ACT
@@ -1534,17 +1544,14 @@ static int ing_filter(struct sk_buff *skb)
__u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
if (MAX_RED_LOOP < ttl++) {
printk("Redir loop detected Dropping packet (%s->%s)\n",
- skb->input_dev?skb->input_dev->name:"??",skb->dev->name);
+ skb->input_dev->name, skb->dev->name);
return TC_ACT_SHOT;
}
skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
- if (NULL == skb->input_dev) {
- skb->input_dev = skb->dev;
- printk("ing_filter: fixed %s out %s\n",skb->input_dev->name,skb->dev->name);
- }
+
spin_lock(&dev->ingress_lock);
if ((q = dev->qdisc_ingress) != NULL)
result = q->enqueue(skb, q);
@@ -1559,6 +1566,7 @@ static int ing_filter(struct sk_buff *skb)
int netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
+ struct net_device *orig_dev;
int ret = NET_RX_DROP;
unsigned short type;
@@ -1566,10 +1574,13 @@ int netif_receive_skb(struct sk_buff *skb)
if (skb->dev->poll && netpoll_rx(skb))
return NET_RX_DROP;
- if (!skb->stamp.tv_sec)
- net_timestamp(&skb->stamp);
+ if (!skb->tstamp.off_sec)
+ net_timestamp(skb);
+
+ if (!skb->input_dev)
+ skb->input_dev = skb->dev;
- skb_bond(skb);
+ orig_dev = skb_bond(skb);
__get_cpu_var(netdev_rx_stat).total++;
@@ -1590,14 +1601,14 @@ int netif_receive_skb(struct sk_buff *skb)
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (!ptype->dev || ptype->dev == skb->dev) {
if (pt_prev)
- ret = deliver_skb(skb, pt_prev);
+ ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
#ifdef CONFIG_NET_CLS_ACT
if (pt_prev) {
- ret = deliver_skb(skb, pt_prev);
+ ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL; /* noone else should process this after*/
} else {
skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
@@ -1616,7 +1627,7 @@ ncls:
handle_diverter(skb);
- if (handle_bridge(&skb, &pt_prev, &ret))
+ if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
goto out;
type = skb->protocol;
@@ -1624,13 +1635,13 @@ ncls:
if (ptype->type == type &&
(!ptype->dev || ptype->dev == skb->dev)) {
if (pt_prev)
- ret = deliver_skb(skb, pt_prev);
+ ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
if (pt_prev) {
- ret = pt_prev->func(skb, skb->dev, pt_prev);
+ ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
@@ -1696,7 +1707,8 @@ static void net_rx_action(struct softirq_action *h)
struct softnet_data *queue = &__get_cpu_var(softnet_data);
unsigned long start_time = jiffies;
int budget = netdev_budget;
-
+ void *have;
+
local_irq_disable();
while (!list_empty(&queue->poll_list)) {
@@ -1709,10 +1721,10 @@ static void net_rx_action(struct softirq_action *h)
dev = list_entry(queue->poll_list.next,
struct net_device, poll_list);
- netpoll_poll_lock(dev);
+ have = netpoll_poll_lock(dev);
if (dev->quota <= 0 || dev->poll(dev, &budget)) {
- netpoll_poll_unlock(dev);
+ netpoll_poll_unlock(have);
local_irq_disable();
list_del(&dev->poll_list);
list_add_tail(&dev->poll_list, &queue->poll_list);
@@ -1721,7 +1733,7 @@ static void net_rx_action(struct softirq_action *h)
else
dev->quota = dev->weight;
} else {
- netpoll_poll_unlock(dev);
+ netpoll_poll_unlock(have);
dev_put(dev);
local_irq_disable();
}
@@ -2705,6 +2717,20 @@ int register_netdevice(struct net_device *dev)
dev->name);
dev->features &= ~NETIF_F_TSO;
}
+ if (dev->features & NETIF_F_UFO) {
+ if (!(dev->features & NETIF_F_HW_CSUM)) {
+ printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
+ "NETIF_F_HW_CSUM feature.\n",
+ dev->name);
+ dev->features &= ~NETIF_F_UFO;
+ }
+ if (!(dev->features & NETIF_F_SG)) {
+ printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
+ "NETIF_F_SG feature.\n",
+ dev->name);
+ dev->features &= ~NETIF_F_UFO;
+ }
+ }
/*
* nil rebuild_header routine,
diff --git a/net/core/dst.c b/net/core/dst.c
index 334790da9f1..470c05bc4cb 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -39,8 +39,7 @@ static unsigned long dst_gc_timer_inc = DST_GC_MAX;
static void dst_run_gc(unsigned long);
static void ___dst_free(struct dst_entry * dst);
-static struct timer_list dst_gc_timer =
- TIMER_INITIALIZER(dst_run_gc, DST_GC_MIN, 0);
+static DEFINE_TIMER(dst_gc_timer, dst_run_gc, DST_GC_MIN, 0);
static void dst_run_gc(unsigned long dummy)
{
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index a3eeb88e1c8..0350586e919 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -81,6 +81,32 @@ int ethtool_op_set_tso(struct net_device *dev, u32 data)
return 0;
}
+/*
+ * Copy dev->perm_addr into @data.
+ *
+ * addr->size is the capacity offered by the caller. If it is smaller than
+ * dev->addr_len nothing is copied and -ETOOSMALL is returned. Otherwise
+ * addr->size is overwritten with dev->addr_len, that many bytes are copied
+ * to @data, and 0 is returned.
+ */
+int ethtool_op_get_perm_addr(struct net_device *dev, struct ethtool_perm_addr *addr, u8 *data)
+{
+ unsigned char len = dev->addr_len;
+ if ( addr->size < len )
+ return -ETOOSMALL;
+
+ addr->size = len;
+ memcpy(data, dev->perm_addr, len);
+ return 0;
+}
+
+
+/* Return 1 if NETIF_F_UFO is set in dev->features, 0 otherwise. */
+u32 ethtool_op_get_ufo(struct net_device *dev)
+{
+ return (dev->features & NETIF_F_UFO) != 0;
+}
+
+/*
+ * Set (non-zero @data) or clear (zero @data) NETIF_F_UFO in dev->features.
+ * Performs no capability checks of its own; always returns 0.
+ */
+int ethtool_op_set_ufo(struct net_device *dev, u32 data)
+{
+ if (data)
+ dev->features |= NETIF_F_UFO;
+ else
+ dev->features &= ~NETIF_F_UFO;
+ return 0;
+}
+
/* Handlers for each ethtool command */
static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
@@ -471,6 +497,11 @@ static int __ethtool_set_sg(struct net_device *dev, u32 data)
return err;
}
+ if (!data && dev->ethtool_ops->set_ufo) {
+ err = dev->ethtool_ops->set_ufo(dev, 0);
+ if (err)
+ return err;
+ }
return dev->ethtool_ops->set_sg(dev, data);
}
@@ -557,6 +588,32 @@ static int ethtool_set_tso(struct net_device *dev, char __user *useraddr)
return dev->ethtool_ops->set_tso(dev, edata.data);
}
+/*
+ * ETHTOOL_GUFO handler: pass the value of the driver's get_ufo operation
+ * back to user space in a struct ethtool_value.
+ *
+ * Returns -EOPNOTSUPP when the driver has no get_ufo operation, -EFAULT
+ * when the reply cannot be copied to @useraddr, and 0 on success.
+ */
+static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr)
+{
+	/*
+	 * The first member of the reply carries the command being answered;
+	 * it must be ETHTOOL_GUFO (the command dev_ethtool() dispatches
+	 * here), not the TSO getter's ETHTOOL_GTSO.
+	 */
+	struct ethtool_value edata = { ETHTOOL_GUFO };
+
+	if (!dev->ethtool_ops->get_ufo)
+		return -EOPNOTSUPP;
+	edata.data = dev->ethtool_ops->get_ufo(dev);
+	if (copy_to_user(useraddr, &edata, sizeof(edata)))
+		return -EFAULT;
+	return 0;
+}
+/*
+ * ETHTOOL_SUFO handler: read a struct ethtool_value from @useraddr and pass
+ * its data field to the driver's set_ufo operation.
+ *
+ * Returns -EOPNOTSUPP when the driver has no set_ufo operation, -EFAULT
+ * when the request cannot be copied in, and -EINVAL when enabling is
+ * requested while NETIF_F_SG or NETIF_F_HW_CSUM is missing from
+ * dev->features. Otherwise returns whatever set_ufo returns.
+ */
+static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_value edata;
+
+ if (!dev->ethtool_ops->set_ufo)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+ /* Enabling needs both SG and HW_CSUM; disabling is always allowed. */
+ if (edata.data && !(dev->features & NETIF_F_SG))
+ return -EINVAL;
+ if (edata.data && !(dev->features & NETIF_F_HW_CSUM))
+ return -EINVAL;
+ return dev->ethtool_ops->set_ufo(dev, edata.data);
+}
+
static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
{
struct ethtool_test test;
@@ -683,6 +740,39 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
return ret;
}
+/*
+ * ETHTOOL_GPERMADDR handler: copy the address produced by the driver's
+ * get_perm_addr operation to user space.
+ *
+ * The caller supplies a struct ethtool_perm_addr whose size field gives the
+ * room available after the header. On success the updated header is written
+ * back to @useraddr, followed by epaddr.size bytes of address data.
+ *
+ * Returns -EOPNOTSUPP when the driver has no get_perm_addr operation,
+ * -EFAULT on a failed user copy, -ENOMEM when the bounce buffer cannot be
+ * allocated, the driver's error code if get_perm_addr fails, and 0 on
+ * success.
+ */
+static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_perm_addr epaddr;
+	u8 *data;
+	int ret;
+
+	if (!dev->ethtool_ops->get_perm_addr)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&epaddr,useraddr,sizeof(epaddr)))
+		return -EFAULT;
+
+	data = kmalloc(epaddr.size, GFP_USER);
+	if (!data)
+		return -ENOMEM;
+
+	ret = dev->ethtool_ops->get_perm_addr(dev,&epaddr,data);
+	/* Leave through the common exit so the buffer is freed on error too. */
+	if (ret)
+		goto out;
+
+	ret = -EFAULT;
+	if (copy_to_user(useraddr, &epaddr, sizeof(epaddr)))
+		goto out;
+	/* The address bytes go immediately after the header. */
+	useraddr += sizeof(epaddr);
+	if (copy_to_user(useraddr, data, epaddr.size))
+		goto out;
+	ret = 0;
+
+ out:
+	kfree(data);
+	return ret;
+}
+
/* The main entry point in this file. Called from net/core/dev.c */
int dev_ethtool(struct ifreq *ifr)
@@ -806,6 +896,15 @@ int dev_ethtool(struct ifreq *ifr)
case ETHTOOL_GSTATS:
rc = ethtool_get_stats(dev, useraddr);
break;
+ case ETHTOOL_GPERMADDR:
+ rc = ethtool_get_perm_addr(dev, useraddr);
+ break;
+ case ETHTOOL_GUFO:
+ rc = ethtool_get_ufo(dev, useraddr);
+ break;
+ case ETHTOOL_SUFO:
+ rc = ethtool_set_ufo(dev, useraddr);
+ break;
default:
rc = -EOPNOTSUPP;
}
@@ -826,6 +925,7 @@ int dev_ethtool(struct ifreq *ifr)
EXPORT_SYMBOL(dev_ethtool);
EXPORT_SYMBOL(ethtool_op_get_link);
+EXPORT_SYMBOL_GPL(ethtool_op_get_perm_addr);
EXPORT_SYMBOL(ethtool_op_get_sg);
EXPORT_SYMBOL(ethtool_op_get_tso);
EXPORT_SYMBOL(ethtool_op_get_tx_csum);
@@ -833,3 +933,5 @@ EXPORT_SYMBOL(ethtool_op_set_sg);
EXPORT_SYMBOL(ethtool_op_set_tso);
EXPORT_SYMBOL(ethtool_op_set_tx_csum);
EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
+EXPORT_SYMBOL(ethtool_op_set_ufo);
+EXPORT_SYMBOL(ethtool_op_get_ufo);
diff --git a/net/core/filter.c b/net/core/filter.c
index cd91a24f972..079c2edff78 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -182,7 +182,7 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
A = ntohl(*(u32 *)ptr);
continue;
}
- return 0;
+ break;
case BPF_LD|BPF_H|BPF_ABS:
k = fentry->k;
load_h:
@@ -191,7 +191,7 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
A = ntohs(*(u16 *)ptr);
continue;
}
- return 0;
+ break;
case BPF_LD|BPF_B|BPF_ABS:
k = fentry->k;
load_b:
@@ -200,7 +200,7 @@ load_b:
A = *(u8 *)ptr;
continue;
}
- return 0;
+ break;
case BPF_LD|BPF_W|BPF_LEN:
A = skb->len;
continue;
diff --git a/net/core/flow.c b/net/core/flow.c
index f289570b15a..7e95b39de9f 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -42,7 +42,7 @@ static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
#define flow_table(cpu) (per_cpu(flow_tables, cpu))
-static kmem_cache_t *flow_cachep;
+static kmem_cache_t *flow_cachep __read_mostly;
static int flow_lwm, flow_hwm;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 1beb782ac41..e68700f950a 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -61,7 +61,9 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev);
static struct neigh_table *neigh_tables;
+#ifdef CONFIG_PROC_FS
static struct file_operations neigh_stat_seq_fops;
+#endif
/*
Neighbour hash table buckets are protected with rwlock tbl->lock.
@@ -173,39 +175,10 @@ static void pneigh_queue_purge(struct sk_buff_head *list)
}
}
-void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
+static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
{
int i;
- write_lock_bh(&tbl->lock);
-
- for (i=0; i <= tbl->hash_mask; i++) {
- struct neighbour *n, **np;
-
- np = &tbl->hash_buckets[i];
- while ((n = *np) != NULL) {
- if (dev && n->dev != dev) {
- np = &n->next;
- continue;
- }
- *np = n->next;
- write_lock_bh(&n->lock);
- n->dead = 1;
- neigh_del_timer(n);
- write_unlock_bh(&n->lock);
- neigh_release(n);
- }
- }
-
- write_unlock_bh(&tbl->lock);
-}
-
-int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
-{
- int i;
-
- write_lock_bh(&tbl->lock);
-
for (i = 0; i <= tbl->hash_mask; i++) {
struct neighbour *n, **np = &tbl->hash_buckets[i];
@@ -241,7 +214,19 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
neigh_release(n);
}
}
+}
+
+void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) /* flush tbl's entries for dev via neigh_flush_dev() */
+{
+ write_lock_bh(&tbl->lock); /* neigh_flush_dev() does no locking itself; hold the table write lock around it */
+ neigh_flush_dev(tbl, dev);
+ write_unlock_bh(&tbl->lock);
+}
+int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+{
+ write_lock_bh(&tbl->lock);
+ neigh_flush_dev(tbl, dev);
pneigh_ifdown(tbl, dev);
write_unlock_bh(&tbl->lock);
@@ -725,6 +710,14 @@ static __inline__ int neigh_max_probes(struct neighbour *n)
p->ucast_probes + p->app_probes + p->mcast_probes);
}
+/*
+ * Arm n->timer to expire at @when through mod_timer().  A nonzero
+ * result from mod_timer() is reported as a double timer add, together
+ * with the current NUD state and a stack trace.
+ */
+static inline void neigh_add_timer(struct neighbour *n, unsigned long when)
+{
+ int ret = mod_timer(&n->timer, when);
+
+ if (unlikely(ret != 0)) {
+ printk("NEIGH: BUG, double timer add, state is %x\n",
+ n->nud_state);
+ dump_stack();
+ }
+}
/* Called when a timer expires for a neighbour entry. */
@@ -806,11 +799,10 @@ static void neigh_timer_handler(unsigned long arg)
}
if (neigh->nud_state & NUD_IN_TIMER) {
- neigh_hold(neigh);
if (time_before(next, jiffies + HZ/2))
next = jiffies + HZ/2;
- neigh->timer.expires = next;
- add_timer(&neigh->timer);
+ if (!mod_timer(&neigh->timer, next))
+ neigh_hold(neigh);
}
if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
struct sk_buff *skb = skb_peek(&neigh->arp_queue);
@@ -852,8 +844,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
atomic_set(&neigh->probes, neigh->parms->ucast_probes);
neigh->nud_state = NUD_INCOMPLETE;
neigh_hold(neigh);
- neigh->timer.expires = now + 1;
- add_timer(&neigh->timer);
+ neigh_add_timer(neigh, now + 1);
} else {
neigh->nud_state = NUD_FAILED;
write_unlock_bh(&neigh->lock);
@@ -866,8 +857,8 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
neigh_hold(neigh);
neigh->nud_state = NUD_DELAY;
- neigh->timer.expires = jiffies + neigh->parms->delay_probe_time;
- add_timer(&neigh->timer);
+ neigh_add_timer(neigh,
+ jiffies + neigh->parms->delay_probe_time);
}
if (neigh->nud_state == NUD_INCOMPLETE) {
@@ -1013,10 +1004,10 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
neigh_del_timer(neigh);
if (new & NUD_IN_TIMER) {
neigh_hold(neigh);
- neigh->timer.expires = jiffies +
+ neigh_add_timer(neigh, (jiffies +
((new & NUD_REACHABLE) ?
- neigh->parms->reachable_time : 0);
- add_timer(&neigh->timer);
+ neigh->parms->reachable_time :
+ 0)));
}
neigh->nud_state = new;
}
@@ -1217,7 +1208,7 @@ static void neigh_proxy_process(unsigned long arg)
while (skb != (struct sk_buff *)&tbl->proxy_queue) {
struct sk_buff *back = skb;
- long tdif = back->stamp.tv_usec - now;
+ long tdif = NEIGH_CB(back)->sched_next - now;
skb = skb->next;
if (tdif <= 0) {
@@ -1248,8 +1239,9 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
kfree_skb(skb);
return;
}
- skb->stamp.tv_sec = LOCALLY_ENQUEUED;
- skb->stamp.tv_usec = sched_next;
+
+ NEIGH_CB(skb)->sched_next = sched_next;
+ NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED;
spin_lock(&tbl->proxy_queue.lock);
if (del_timer(&tbl->proxy_timer)) {
@@ -1633,12 +1625,9 @@ static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb,
memset(&ndst, 0, sizeof(ndst));
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ for_each_cpu(cpu) {
struct neigh_statistics *st;
- if (!cpu_possible(cpu))
- continue;
-
st = per_cpu_ptr(tbl->stats, cpu);
ndst.ndts_allocs += st->allocs;
ndst.ndts_destroys += st->destroys;
@@ -2342,8 +2331,8 @@ void neigh_app_ns(struct neighbour *n)
}
nlh = (struct nlmsghdr *)skb->data;
nlh->nlmsg_flags = NLM_F_REQUEST;
- NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC);
}
static void neigh_app_notify(struct neighbour *n)
@@ -2360,8 +2349,8 @@ static void neigh_app_notify(struct neighbour *n)
return;
}
nlh = (struct nlmsghdr *)skb->data;
- NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC);
}
#endif /* CONFIG_ARPD */
diff --git a/net/core/netfilter.c b/net/core/netfilter.c
deleted file mode 100644
index 076c156d5ed..00000000000
--- a/net/core/netfilter.c
+++ /dev/null
@@ -1,648 +0,0 @@
-/* netfilter.c: look after the filters for various protocols.
- * Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
- *
- * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
- * way.
- *
- * Rusty Russell (C)2000 -- This code is GPL.
- *
- * February 2000: Modified by James Morris to have 1 queue per protocol.
- * 15-Mar-2000: Added NF_REPEAT --RR.
- * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik.
- */
-#include <linux/config.h>
-#include <linux/kernel.h>
-#include <linux/netfilter.h>
-#include <net/protocol.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/wait.h>
-#include <linux/module.h>
-#include <linux/interrupt.h>
-#include <linux/if.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/tcp.h>
-#include <linux/udp.h>
-#include <linux/icmp.h>
-#include <net/sock.h>
-#include <net/route.h>
-#include <linux/ip.h>
-
-/* In this code, we can be waiting indefinitely for userspace to
- * service a packet if a hook returns NF_QUEUE. We could keep a count
- * of skbuffs queued for userspace, and not deregister a hook unless
- * this is zero, but that sucks. Now, we simply check when the
- * packets come back: if the hook is gone, the packet is discarded. */
-#ifdef CONFIG_NETFILTER_DEBUG
-#define NFDEBUG(format, args...) printk(format , ## args)
-#else
-#define NFDEBUG(format, args...)
-#endif
-
-/* Sockopts only registered and called from user context, so
- net locking would be overkill. Also, [gs]etsockopt calls may
- sleep. */
-static DECLARE_MUTEX(nf_sockopt_mutex);
-
-struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
-static LIST_HEAD(nf_sockopts);
-static DEFINE_SPINLOCK(nf_hook_lock);
-
-/*
- * A queue handler may be registered for each protocol. Each is protected by
- * long term mutex. The handler must provide an an outfn() to accept packets
- * for queueing and must reinject all packets it receives, no matter what.
- */
-static struct nf_queue_handler_t {
- nf_queue_outfn_t outfn;
- void *data;
-} queue_handler[NPROTO];
-static DEFINE_RWLOCK(queue_handler_lock);
-
-int nf_register_hook(struct nf_hook_ops *reg)
-{
- struct list_head *i;
-
- spin_lock_bh(&nf_hook_lock);
- list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
- if (reg->priority < ((struct nf_hook_ops *)i)->priority)
- break;
- }
- list_add_rcu(&reg->list, i->prev);
- spin_unlock_bh(&nf_hook_lock);
-
- synchronize_net();
- return 0;
-}
-
-void nf_unregister_hook(struct nf_hook_ops *reg)
-{
- spin_lock_bh(&nf_hook_lock);
- list_del_rcu(&reg->list);
- spin_unlock_bh(&nf_hook_lock);
-
- synchronize_net();
-}
-
-/* Do exclusive ranges overlap? */
-static inline int overlap(int min1, int max1, int min2, int max2)
-{
- return max1 > min2 && min1 < max2;
-}
-
-/* Functions to register sockopt ranges (exclusive). */
-int nf_register_sockopt(struct nf_sockopt_ops *reg)
-{
- struct list_head *i;
- int ret = 0;
-
- if (down_interruptible(&nf_sockopt_mutex) != 0)
- return -EINTR;
-
- list_for_each(i, &nf_sockopts) {
- struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i;
- if (ops->pf == reg->pf
- && (overlap(ops->set_optmin, ops->set_optmax,
- reg->set_optmin, reg->set_optmax)
- || overlap(ops->get_optmin, ops->get_optmax,
- reg->get_optmin, reg->get_optmax))) {
- NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n",
- ops->set_optmin, ops->set_optmax,
- ops->get_optmin, ops->get_optmax,
- reg->set_optmin, reg->set_optmax,
- reg->get_optmin, reg->get_optmax);
- ret = -EBUSY;
- goto out;
- }
- }
-
- list_add(&reg->list, &nf_sockopts);
-out:
- up(&nf_sockopt_mutex);
- return ret;
-}
-
-void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
-{
- /* No point being interruptible: we're probably in cleanup_module() */
- restart:
- down(&nf_sockopt_mutex);
- if (reg->use != 0) {
- /* To be woken by nf_sockopt call... */
- /* FIXME: Stuart Young's name appears gratuitously. */
- set_current_state(TASK_UNINTERRUPTIBLE);
- reg->cleanup_task = current;
- up(&nf_sockopt_mutex);
- schedule();
- goto restart;
- }
- list_del(&reg->list);
- up(&nf_sockopt_mutex);
-}
-
-/* Call get/setsockopt() */
-static int nf_sockopt(struct sock *sk, int pf, int val,
- char __user *opt, int *len, int get)
-{
- struct list_head *i;
- struct nf_sockopt_ops *ops;
- int ret;
-
- if (down_interruptible(&nf_sockopt_mutex) != 0)
- return -EINTR;
-
- list_for_each(i, &nf_sockopts) {
- ops = (struct nf_sockopt_ops *)i;
- if (ops->pf == pf) {
- if (get) {
- if (val >= ops->get_optmin
- && val < ops->get_optmax) {
- ops->use++;
- up(&nf_sockopt_mutex);
- ret = ops->get(sk, val, opt, len);
- goto out;
- }
- } else {
- if (val >= ops->set_optmin
- && val < ops->set_optmax) {
- ops->use++;
- up(&nf_sockopt_mutex);
- ret = ops->set(sk, val, opt, *len);
- goto out;
- }
- }
- }
- }
- up(&nf_sockopt_mutex);
- return -ENOPROTOOPT;
-
- out:
- down(&nf_sockopt_mutex);
- ops->use--;
- if (ops->cleanup_task)
- wake_up_process(ops->cleanup_task);
- up(&nf_sockopt_mutex);
- return ret;
-}
-
-int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt,
- int len)
-{
- return nf_sockopt(sk, pf, val, opt, &len, 0);
-}
-
-int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len)
-{
- return nf_sockopt(sk, pf, val, opt, len, 1);
-}
-
-static unsigned int nf_iterate(struct list_head *head,
- struct sk_buff **skb,
- int hook,
- const struct net_device *indev,
- const struct net_device *outdev,
- struct list_head **i,
- int (*okfn)(struct sk_buff *),
- int hook_thresh)
-{
- unsigned int verdict;
-
- /*
- * The caller must not block between calls to this
- * function because of risk of continuing from deleted element.
- */
- list_for_each_continue_rcu(*i, head) {
- struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
-
- if (hook_thresh > elem->priority)
- continue;
-
- /* Optimization: we don't need to hold module
- reference here, since function can't sleep. --RR */
- verdict = elem->hook(hook, skb, indev, outdev, okfn);
- if (verdict != NF_ACCEPT) {
-#ifdef CONFIG_NETFILTER_DEBUG
- if (unlikely(verdict > NF_MAX_VERDICT)) {
- NFDEBUG("Evil return from %p(%u).\n",
- elem->hook, hook);
- continue;
- }
-#endif
- if (verdict != NF_REPEAT)
- return verdict;
- *i = (*i)->prev;
- }
- }
- return NF_ACCEPT;
-}
-
-int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data)
-{
- int ret;
-
- write_lock_bh(&queue_handler_lock);
- if (queue_handler[pf].outfn)
- ret = -EBUSY;
- else {
- queue_handler[pf].outfn = outfn;
- queue_handler[pf].data = data;
- ret = 0;
- }
- write_unlock_bh(&queue_handler_lock);
-
- return ret;
-}
-
-/* The caller must flush their queue before this */
-int nf_unregister_queue_handler(int pf)
-{
- write_lock_bh(&queue_handler_lock);
- queue_handler[pf].outfn = NULL;
- queue_handler[pf].data = NULL;
- write_unlock_bh(&queue_handler_lock);
-
- return 0;
-}
-
-/*
- * Any packet that leaves via this function must come back
- * through nf_reinject().
- */
-static int nf_queue(struct sk_buff *skb,
- struct list_head *elem,
- int pf, unsigned int hook,
- struct net_device *indev,
- struct net_device *outdev,
- int (*okfn)(struct sk_buff *))
-{
- int status;
- struct nf_info *info;
-#ifdef CONFIG_BRIDGE_NETFILTER
- struct net_device *physindev = NULL;
- struct net_device *physoutdev = NULL;
-#endif
-
- /* QUEUE == DROP if noone is waiting, to be safe. */
- read_lock(&queue_handler_lock);
- if (!queue_handler[pf].outfn) {
- read_unlock(&queue_handler_lock);
- kfree_skb(skb);
- return 1;
- }
-
- info = kmalloc(sizeof(*info), GFP_ATOMIC);
- if (!info) {
- if (net_ratelimit())
- printk(KERN_ERR "OOM queueing packet %p\n",
- skb);
- read_unlock(&queue_handler_lock);
- kfree_skb(skb);
- return 1;
- }
-
- *info = (struct nf_info) {
- (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn };
-
- /* If it's going away, ignore hook. */
- if (!try_module_get(info->elem->owner)) {
- read_unlock(&queue_handler_lock);
- kfree(info);
- return 0;
- }
-
- /* Bump dev refs so they don't vanish while packet is out */
- if (indev) dev_hold(indev);
- if (outdev) dev_hold(outdev);
-
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (skb->nf_bridge) {
- physindev = skb->nf_bridge->physindev;
- if (physindev) dev_hold(physindev);
- physoutdev = skb->nf_bridge->physoutdev;
- if (physoutdev) dev_hold(physoutdev);
- }
-#endif
-
- status = queue_handler[pf].outfn(skb, info, queue_handler[pf].data);
- read_unlock(&queue_handler_lock);
-
- if (status < 0) {
- /* James M doesn't say fuck enough. */
- if (indev) dev_put(indev);
- if (outdev) dev_put(outdev);
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (physindev) dev_put(physindev);
- if (physoutdev) dev_put(physoutdev);
-#endif
- module_put(info->elem->owner);
- kfree(info);
- kfree_skb(skb);
- return 1;
- }
- return 1;
-}
-
-/* Returns 1 if okfn() needs to be executed by the caller,
- * -EPERM for NF_DROP, 0 otherwise. */
-int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
- struct net_device *indev,
- struct net_device *outdev,
- int (*okfn)(struct sk_buff *),
- int hook_thresh)
-{
- struct list_head *elem;
- unsigned int verdict;
- int ret = 0;
-
- /* We may already have this, but read-locks nest anyway */
- rcu_read_lock();
-
- elem = &nf_hooks[pf][hook];
-next_hook:
- verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
- outdev, &elem, okfn, hook_thresh);
- if (verdict == NF_ACCEPT || verdict == NF_STOP) {
- ret = 1;
- goto unlock;
- } else if (verdict == NF_DROP) {
- kfree_skb(*pskb);
- ret = -EPERM;
- } else if (verdict == NF_QUEUE) {
- NFDEBUG("nf_hook: Verdict = QUEUE.\n");
- if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn))
- goto next_hook;
- }
-unlock:
- rcu_read_unlock();
- return ret;
-}
-
-void nf_reinject(struct sk_buff *skb, struct nf_info *info,
- unsigned int verdict)
-{
- struct list_head *elem = &info->elem->list;
- struct list_head *i;
-
- rcu_read_lock();
-
- /* Release those devices we held, or Alexey will kill me. */
- if (info->indev) dev_put(info->indev);
- if (info->outdev) dev_put(info->outdev);
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (skb->nf_bridge) {
- if (skb->nf_bridge->physindev)
- dev_put(skb->nf_bridge->physindev);
- if (skb->nf_bridge->physoutdev)
- dev_put(skb->nf_bridge->physoutdev);
- }
-#endif
-
- /* Drop reference to owner of hook which queued us. */
- module_put(info->elem->owner);
-
- list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) {
- if (i == elem)
- break;
- }
-
- if (elem == &nf_hooks[info->pf][info->hook]) {
- /* The module which sent it to userspace is gone. */
- NFDEBUG("%s: module disappeared, dropping packet.\n",
- __FUNCTION__);
- verdict = NF_DROP;
- }
-
- /* Continue traversal iff userspace said ok... */
- if (verdict == NF_REPEAT) {
- elem = elem->prev;
- verdict = NF_ACCEPT;
- }
-
- if (verdict == NF_ACCEPT) {
- next_hook:
- verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
- &skb, info->hook,
- info->indev, info->outdev, &elem,
- info->okfn, INT_MIN);
- }
-
- switch (verdict) {
- case NF_ACCEPT:
- info->okfn(skb);
- break;
-
- case NF_QUEUE:
- if (!nf_queue(skb, elem, info->pf, info->hook,
- info->indev, info->outdev, info->okfn))
- goto next_hook;
- break;
- }
- rcu_read_unlock();
-
- if (verdict == NF_DROP)
- kfree_skb(skb);
-
- kfree(info);
- return;
-}
-
-#ifdef CONFIG_INET
-/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
-int ip_route_me_harder(struct sk_buff **pskb)
-{
- struct iphdr *iph = (*pskb)->nh.iph;
- struct rtable *rt;
- struct flowi fl = {};
- struct dst_entry *odst;
- unsigned int hh_len;
-
- /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
- * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
- */
- if (inet_addr_type(iph->saddr) == RTN_LOCAL) {
- fl.nl_u.ip4_u.daddr = iph->daddr;
- fl.nl_u.ip4_u.saddr = iph->saddr;
- fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
- fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0;
-#ifdef CONFIG_IP_ROUTE_FWMARK
- fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
-#endif
- fl.proto = iph->protocol;
- if (ip_route_output_key(&rt, &fl) != 0)
- return -1;
-
- /* Drop old route. */
- dst_release((*pskb)->dst);
- (*pskb)->dst = &rt->u.dst;
- } else {
- /* non-local src, find valid iif to satisfy
- * rp-filter when calling ip_route_input. */
- fl.nl_u.ip4_u.daddr = iph->saddr;
- if (ip_route_output_key(&rt, &fl) != 0)
- return -1;
-
- odst = (*pskb)->dst;
- if (ip_route_input(*pskb, iph->daddr, iph->saddr,
- RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
- dst_release(&rt->u.dst);
- return -1;
- }
- dst_release(&rt->u.dst);
- dst_release(odst);
- }
-
- if ((*pskb)->dst->error)
- return -1;
-
- /* Change in oif may mean change in hh_len. */
- hh_len = (*pskb)->dst->dev->hard_header_len;
- if (skb_headroom(*pskb) < hh_len) {
- struct sk_buff *nskb;
-
- nskb = skb_realloc_headroom(*pskb, hh_len);
- if (!nskb)
- return -1;
- if ((*pskb)->sk)
- skb_set_owner_w(nskb, (*pskb)->sk);
- kfree_skb(*pskb);
- *pskb = nskb;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(ip_route_me_harder);
-
-int skb_ip_make_writable(struct sk_buff **pskb, unsigned int writable_len)
-{
- struct sk_buff *nskb;
-
- if (writable_len > (*pskb)->len)
- return 0;
-
- /* Not exclusive use of packet? Must copy. */
- if (skb_shared(*pskb) || skb_cloned(*pskb))
- goto copy_skb;
-
- return pskb_may_pull(*pskb, writable_len);
-
-copy_skb:
- nskb = skb_copy(*pskb, GFP_ATOMIC);
- if (!nskb)
- return 0;
- BUG_ON(skb_is_nonlinear(nskb));
-
- /* Rest of kernel will get very unhappy if we pass it a
- suddenly-orphaned skbuff */
- if ((*pskb)->sk)
- skb_set_owner_w(nskb, (*pskb)->sk);
- kfree_skb(*pskb);
- *pskb = nskb;
- return 1;
-}
-EXPORT_SYMBOL(skb_ip_make_writable);
-#endif /*CONFIG_INET*/
-
-/* Internal logging interface, which relies on the real
- LOG target modules */
-
-#define NF_LOG_PREFIXLEN 128
-
-static nf_logfn *nf_logging[NPROTO]; /* = NULL */
-static int reported = 0;
-static DEFINE_SPINLOCK(nf_log_lock);
-
-int nf_log_register(int pf, nf_logfn *logfn)
-{
- int ret = -EBUSY;
-
- /* Any setup of logging members must be done before
- * substituting pointer. */
- spin_lock(&nf_log_lock);
- if (!nf_logging[pf]) {
- rcu_assign_pointer(nf_logging[pf], logfn);
- ret = 0;
- }
- spin_unlock(&nf_log_lock);
- return ret;
-}
-
-void nf_log_unregister(int pf, nf_logfn *logfn)
-{
- spin_lock(&nf_log_lock);
- if (nf_logging[pf] == logfn)
- nf_logging[pf] = NULL;
- spin_unlock(&nf_log_lock);
-
- /* Give time to concurrent readers. */
- synchronize_net();
-}
-
-void nf_log_packet(int pf,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const char *fmt, ...)
-{
- va_list args;
- char prefix[NF_LOG_PREFIXLEN];
- nf_logfn *logfn;
-
- rcu_read_lock();
- logfn = rcu_dereference(nf_logging[pf]);
- if (logfn) {
- va_start(args, fmt);
- vsnprintf(prefix, sizeof(prefix), fmt, args);
- va_end(args);
- /* We must read logging before nf_logfn[pf] */
- logfn(hooknum, skb, in, out, prefix);
- } else if (!reported) {
- printk(KERN_WARNING "nf_log_packet: can\'t log yet, "
- "no backend logging module loaded in!\n");
- reported++;
- }
- rcu_read_unlock();
-}
-EXPORT_SYMBOL(nf_log_register);
-EXPORT_SYMBOL(nf_log_unregister);
-EXPORT_SYMBOL(nf_log_packet);
-
-/* This does not belong here, but locally generated errors need it if connection
- tracking in use: without this, connection may not be in hash table, and hence
- manufactured ICMP or RST packets will not be associated with it. */
-void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
-
-void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
-{
- void (*attach)(struct sk_buff *, struct sk_buff *);
-
- if (skb->nfct && (attach = ip_ct_attach) != NULL) {
- mb(); /* Just to be sure: must be read before executing this */
- attach(new, skb);
- }
-}
-
-void __init netfilter_init(void)
-{
- int i, h;
-
- for (i = 0; i < NPROTO; i++) {
- for (h = 0; h < NF_MAX_HOOKS; h++)
- INIT_LIST_HEAD(&nf_hooks[i][h]);
- }
-}
-
-EXPORT_SYMBOL(ip_ct_attach);
-EXPORT_SYMBOL(nf_ct_attach);
-EXPORT_SYMBOL(nf_getsockopt);
-EXPORT_SYMBOL(nf_hook_slow);
-EXPORT_SYMBOL(nf_hooks);
-EXPORT_SYMBOL(nf_register_hook);
-EXPORT_SYMBOL(nf_register_queue_handler);
-EXPORT_SYMBOL(nf_register_sockopt);
-EXPORT_SYMBOL(nf_reinject);
-EXPORT_SYMBOL(nf_setsockopt);
-EXPORT_SYMBOL(nf_unregister_hook);
-EXPORT_SYMBOL(nf_unregister_queue_handler);
-EXPORT_SYMBOL(nf_unregister_sockopt);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index c327c9edadc..802fe11efad 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -33,6 +33,7 @@
#define MAX_UDP_CHUNK 1460
#define MAX_SKBS 32
#define MAX_QUEUE_DEPTH (MAX_SKBS / 2)
+#define MAX_RETRIES 20000
static DEFINE_SPINLOCK(skb_list_lock);
static int nr_skbs;
@@ -248,14 +249,14 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
int status;
struct netpoll_info *npinfo;
-repeat:
- if(!np || !np->dev || !netif_running(np->dev)) {
+ if (!np || !np->dev || !netif_running(np->dev)) {
__kfree_skb(skb);
return;
}
- /* avoid recursion */
npinfo = np->dev->npinfo;
+
+ /* avoid recursion */
if (npinfo->poll_owner == smp_processor_id() ||
np->dev->xmit_lock_owner == smp_processor_id()) {
if (np->drop)
@@ -265,30 +266,37 @@ repeat:
return;
}
- spin_lock(&np->dev->xmit_lock);
- np->dev->xmit_lock_owner = smp_processor_id();
+ do {
+ npinfo->tries--;
+ spin_lock(&np->dev->xmit_lock);
+ np->dev->xmit_lock_owner = smp_processor_id();
- /*
- * network drivers do not expect to be called if the queue is
- * stopped.
- */
- if (netif_queue_stopped(np->dev)) {
+ /*
+ * network drivers do not expect to be called if the queue is
+ * stopped.
+ */
+ if (netif_queue_stopped(np->dev)) {
+ np->dev->xmit_lock_owner = -1;
+ spin_unlock(&np->dev->xmit_lock);
+ netpoll_poll(np);
+ udelay(50);
+ continue;
+ }
+
+ status = np->dev->hard_start_xmit(skb, np->dev);
np->dev->xmit_lock_owner = -1;
spin_unlock(&np->dev->xmit_lock);
- netpoll_poll(np);
- goto repeat;
- }
-
- status = np->dev->hard_start_xmit(skb, np->dev);
- np->dev->xmit_lock_owner = -1;
- spin_unlock(&np->dev->xmit_lock);
+ /* success */
+ if(!status) {
+ npinfo->tries = MAX_RETRIES; /* reset */
+ return;
+ }
- /* transmit busy */
- if(status) {
+ /* transmit busy */
netpoll_poll(np);
- goto repeat;
- }
+ udelay(50);
+ } while (npinfo->tries > 0);
}
void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
@@ -349,15 +357,11 @@ static void arp_reply(struct sk_buff *skb)
unsigned char *arp_ptr;
int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
u32 sip, tip;
- unsigned long flags;
struct sk_buff *send_skb;
struct netpoll *np = NULL;
- spin_lock_irqsave(&npinfo->rx_lock, flags);
if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev)
np = npinfo->rx_np;
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-
if (!np)
return;
@@ -639,10 +643,12 @@ int netpoll_setup(struct netpoll *np)
if (!npinfo)
goto release;
+ npinfo->rx_flags = 0;
npinfo->rx_np = NULL;
- npinfo->poll_lock = SPIN_LOCK_UNLOCKED;
+ spin_lock_init(&npinfo->poll_lock);
npinfo->poll_owner = -1;
- npinfo->rx_lock = SPIN_LOCK_UNLOCKED;
+ npinfo->tries = MAX_RETRIES;
+ spin_lock_init(&npinfo->rx_lock);
} else
npinfo = ndev->npinfo;
@@ -697,7 +703,7 @@ int netpoll_setup(struct netpoll *np)
if (!np->local_ip) {
rcu_read_lock();
- in_dev = __in_dev_get(ndev);
+ in_dev = __in_dev_get_rcu(ndev);
if (!in_dev || !in_dev->ifa_list) {
rcu_read_unlock();
@@ -718,9 +724,16 @@ int netpoll_setup(struct netpoll *np)
npinfo->rx_np = np;
spin_unlock_irqrestore(&npinfo->rx_lock, flags);
}
+
+ /* fill up the skb queue */
+ refill_skbs();
+
/* last thing to do is link it to the net device structure */
ndev->npinfo = npinfo;
+ /* avoid racing with NAPI reading npinfo */
+ synchronize_rcu();
+
return 0;
release:
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 8eb083b6041..7fc3e9e28c3 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -75,7 +75,7 @@
* By design there should only be *one* "controlling" process. In practice
* multiple write accesses gives unpredictable result. Understood by "write"
* to /proc gives result code thats should be read be the "writer".
- * For pratical use this should be no problem.
+ * For practical use this should be no problem.
*
* Note when adding devices to a specific CPU there good idea to also assign
* /proc/irq/XX/smp_affinity so TX-interrupts gets bound to the same CPU.
@@ -96,7 +96,7 @@
* New xmit() return, do_div and misc clean up by Stephen Hemminger
* <shemminger@osdl.org> 040923
*
- * Rany Dunlap fixed u64 printk compiler waring
+ * Randy Dunlap fixed u64 printk compiler warning
*
* Remove FCS from BW calculation. Lennert Buytenhek <buytenh@wantstofly.org>
* New time handling. Lennert Buytenhek <buytenh@wantstofly.org> 041213
@@ -137,6 +137,7 @@
#include <linux/ipv6.h>
#include <linux/udp.h>
#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include <linux/wait.h>
#include <net/checksum.h>
#include <net/ipv6.h>
@@ -151,7 +152,7 @@
#include <asm/timex.h>
-#define VERSION "pktgen v2.62: Packet Generator for packet performance testing.\n"
+#define VERSION "pktgen v2.63: Packet Generator for packet performance testing.\n"
/* #define PG_DEBUG(a) a */
#define PG_DEBUG(a)
@@ -177,8 +178,8 @@
#define T_REMDEV (1<<3) /* Remove all devs */
/* Locks */
-#define thread_lock() spin_lock(&_thread_lock)
-#define thread_unlock() spin_unlock(&_thread_lock)
+#define thread_lock() down(&pktgen_sem)
+#define thread_unlock() up(&pktgen_sem)
/* If lock -- can be removed after some work */
#define if_lock(t) spin_lock(&(t->if_lock));
@@ -187,6 +188,8 @@
/* Used to help with determining the pkts on receive */
#define PKTGEN_MAGIC 0xbe9be955
#define PG_PROC_DIR "pktgen"
+#define PGCTRL "pgctrl"
+static struct proc_dir_entry *pg_proc_dir = NULL;
#define MAX_CFLOWS 65536
@@ -202,11 +205,8 @@ struct pktgen_dev {
* Try to keep frequent/infrequent used vars. separated.
*/
- char ifname[32];
- struct proc_dir_entry *proc_ent;
+ char ifname[IFNAMSIZ];
char result[512];
- /* proc file names */
- char fname[80];
struct pktgen_thread* pg_thread; /* the owner */
struct pktgen_dev *next; /* Used for chaining in the thread's run-queue */
@@ -244,7 +244,7 @@ struct pktgen_dev {
__u32 seq_num;
int clone_skb; /* Use multiple SKBs during packet gen. If this number
- * is greater than 1, then that many coppies of the same
+ * is greater than 1, then that many copies of the same
* packet will be sent before a new packet is allocated.
* For instance, if you want to send 1024 identical packets
* before creating a new packet, set clone_skb to 1024.
@@ -330,8 +330,6 @@ struct pktgen_thread {
struct pktgen_dev *if_list; /* All device here */
struct pktgen_thread* next;
char name[32];
- char fname[128]; /* name of proc file */
- struct proc_dir_entry *proc_ent;
char result[512];
u32 max_before_softirq; /* We'll call do_softirq to prevent starvation. */
@@ -396,7 +394,7 @@ static inline s64 divremdi3(s64 x, s64 y, int type)
/* End of hacks to deal with 64-bit math on x86 */
-/** Convert to miliseconds */
+/** Convert to milliseconds */
static inline __u64 tv_to_ms(const struct timeval* tv)
{
__u64 ms = tv->tv_usec / 1000;
@@ -425,7 +423,7 @@ static inline __u64 pg_div64(__u64 n, __u64 base)
{
__u64 tmp = n;
/*
- * How do we know if the architectrure we are running on
+ * How do we know if the architecture we are running on
* supports division with 64 bit base?
*
*/
@@ -473,16 +471,6 @@ static inline __u64 tv_diff(const struct timeval* a, const struct timeval* b)
static char version[] __initdata = VERSION;
-static ssize_t proc_pgctrl_read(struct file* file, char __user * buf, size_t count, loff_t *ppos);
-static ssize_t proc_pgctrl_write(struct file* file, const char __user * buf, size_t count, loff_t *ppos);
-static int proc_if_read(char *buf , char **start, off_t offset, int len, int *eof, void *data);
-
-static int proc_thread_read(char *buf , char **start, off_t offset, int len, int *eof, void *data);
-static int proc_if_write(struct file *file, const char __user *user_buffer, unsigned long count, void *data);
-static int proc_thread_write(struct file *file, const char __user *user_buffer, unsigned long count, void *data);
-static int create_proc_dir(void);
-static int remove_proc_dir(void);
-
static int pktgen_remove_device(struct pktgen_thread* t, struct pktgen_dev *i);
static int pktgen_add_device(struct pktgen_thread* t, const char* ifname);
static struct pktgen_thread* pktgen_find_thread(const char* name);
@@ -503,83 +491,41 @@ static int pg_delay_d = 0;
static int pg_clone_skb_d = 0;
static int debug = 0;
-static spinlock_t _thread_lock = SPIN_LOCK_UNLOCKED;
+static DECLARE_MUTEX(pktgen_sem);
static struct pktgen_thread *pktgen_threads = NULL;
-static char module_fname[128];
-static struct proc_dir_entry *module_proc_ent = NULL;
-
static struct notifier_block pktgen_notifier_block = {
.notifier_call = pktgen_device_event,
};
-static struct file_operations pktgen_fops = {
- .read = proc_pgctrl_read,
- .write = proc_pgctrl_write,
- /* .ioctl = pktgen_ioctl, later maybe */
-};
-
/*
* /proc handling functions
*
*/
-static struct proc_dir_entry *pg_proc_dir = NULL;
-static int proc_pgctrl_read_eof=0;
-
-static ssize_t proc_pgctrl_read(struct file* file, char __user * buf,
- size_t count, loff_t *ppos)
+static int pgctrl_show(struct seq_file *seq, void *v)
{
- char data[200];
- int len = 0;
-
- if(proc_pgctrl_read_eof) {
- proc_pgctrl_read_eof=0;
- len = 0;
- goto out;
- }
-
- sprintf(data, "%s", VERSION);
-
- len = strlen(data);
-
- if(len > count) {
- len =-EFAULT;
- goto out;
- }
-
- if (copy_to_user(buf, data, len)) {
- len =-EFAULT;
- goto out;
- }
-
- *ppos += len;
- proc_pgctrl_read_eof=1; /* EOF next call */
-
- out:
- return len;
+ seq_puts(seq, VERSION);
+ return 0;
}
-static ssize_t proc_pgctrl_write(struct file* file,const char __user * buf,
- size_t count, loff_t *ppos)
+static ssize_t pgctrl_write(struct file* file,const char __user * buf,
+ size_t count, loff_t *ppos)
{
- char *data = NULL;
int err = 0;
+ char data[128];
if (!capable(CAP_NET_ADMIN)){
err = -EPERM;
goto out;
}
- data = (void*)vmalloc ((unsigned int)count);
+ if (count > sizeof(data))
+ count = sizeof(data);
- if(!data) {
- err = -ENOMEM;
- goto out;
- }
if (copy_from_user(data, buf, count)) {
- err =-EFAULT;
- goto out_free;
+ err = -EFAULT;
+ goto out;
}
data[count-1] = 0; /* Make string */
@@ -594,31 +540,40 @@ static ssize_t proc_pgctrl_write(struct file* file,const char __user * buf,
err = count;
- out_free:
- vfree (data);
out:
return err;
}
-static int proc_if_read(char *buf , char **start, off_t offset,
- int len, int *eof, void *data)
+static int pgctrl_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pgctrl_show, PDE(inode)->data);
+}
+
+static struct file_operations pktgen_fops = {
+ .owner = THIS_MODULE,
+ .open = pgctrl_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = pgctrl_write,
+ .release = single_release,
+};
+
+static int pktgen_if_show(struct seq_file *seq, void *v)
{
- char *p;
int i;
- struct pktgen_dev *pkt_dev = (struct pktgen_dev*)(data);
+ struct pktgen_dev *pkt_dev = seq->private;
__u64 sa;
__u64 stopped;
__u64 now = getCurUs();
- p = buf;
- p += sprintf(p, "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n",
- (unsigned long long) pkt_dev->count,
- pkt_dev->min_pkt_size, pkt_dev->max_pkt_size);
+ seq_printf(seq, "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n",
+ (unsigned long long) pkt_dev->count,
+ pkt_dev->min_pkt_size, pkt_dev->max_pkt_size);
- p += sprintf(p, " frags: %d delay: %u clone_skb: %d ifname: %s\n",
- pkt_dev->nfrags, 1000*pkt_dev->delay_us+pkt_dev->delay_ns, pkt_dev->clone_skb, pkt_dev->ifname);
+ seq_printf(seq, " frags: %d delay: %u clone_skb: %d ifname: %s\n",
+ pkt_dev->nfrags, 1000*pkt_dev->delay_us+pkt_dev->delay_ns, pkt_dev->clone_skb, pkt_dev->ifname);
- p += sprintf(p, " flows: %u flowlen: %u\n", pkt_dev->cflows, pkt_dev->lflow);
+ seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows, pkt_dev->lflow);
if(pkt_dev->flags & F_IPV6) {
@@ -626,19 +581,19 @@ static int proc_if_read(char *buf , char **start, off_t offset,
fmt_ip6(b1, pkt_dev->in6_saddr.s6_addr);
fmt_ip6(b2, pkt_dev->min_in6_saddr.s6_addr);
fmt_ip6(b3, pkt_dev->max_in6_saddr.s6_addr);
- p += sprintf(p, " saddr: %s min_saddr: %s max_saddr: %s\n", b1, b2, b3);
+ seq_printf(seq, " saddr: %s min_saddr: %s max_saddr: %s\n", b1, b2, b3);
fmt_ip6(b1, pkt_dev->in6_daddr.s6_addr);
fmt_ip6(b2, pkt_dev->min_in6_daddr.s6_addr);
fmt_ip6(b3, pkt_dev->max_in6_daddr.s6_addr);
- p += sprintf(p, " daddr: %s min_daddr: %s max_daddr: %s\n", b1, b2, b3);
+ seq_printf(seq, " daddr: %s min_daddr: %s max_daddr: %s\n", b1, b2, b3);
}
else
- p += sprintf(p, " dst_min: %s dst_max: %s\n src_min: %s src_max: %s\n",
- pkt_dev->dst_min, pkt_dev->dst_max, pkt_dev->src_min, pkt_dev->src_max);
+ seq_printf(seq," dst_min: %s dst_max: %s\n src_min: %s src_max: %s\n",
+ pkt_dev->dst_min, pkt_dev->dst_max, pkt_dev->src_min, pkt_dev->src_max);
- p += sprintf(p, " src_mac: ");
+ seq_puts(seq, " src_mac: ");
if ((pkt_dev->src_mac[0] == 0) &&
(pkt_dev->src_mac[1] == 0) &&
@@ -648,89 +603,89 @@ static int proc_if_read(char *buf , char **start, off_t offset,
(pkt_dev->src_mac[5] == 0))
for (i = 0; i < 6; i++)
- p += sprintf(p, "%02X%s", pkt_dev->odev->dev_addr[i], i == 5 ? " " : ":");
+ seq_printf(seq, "%02X%s", pkt_dev->odev->dev_addr[i], i == 5 ? " " : ":");
else
for (i = 0; i < 6; i++)
- p += sprintf(p, "%02X%s", pkt_dev->src_mac[i], i == 5 ? " " : ":");
+ seq_printf(seq, "%02X%s", pkt_dev->src_mac[i], i == 5 ? " " : ":");
- p += sprintf(p, "dst_mac: ");
+ seq_printf(seq, "dst_mac: ");
for (i = 0; i < 6; i++)
- p += sprintf(p, "%02X%s", pkt_dev->dst_mac[i], i == 5 ? "\n" : ":");
+ seq_printf(seq, "%02X%s", pkt_dev->dst_mac[i], i == 5 ? "\n" : ":");
- p += sprintf(p, " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n",
- pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min,
- pkt_dev->udp_dst_max);
+ seq_printf(seq, " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n",
+ pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min,
+ pkt_dev->udp_dst_max);
- p += sprintf(p, " src_mac_count: %d dst_mac_count: %d \n Flags: ",
- pkt_dev->src_mac_count, pkt_dev->dst_mac_count);
+ seq_printf(seq, " src_mac_count: %d dst_mac_count: %d \n Flags: ",
+ pkt_dev->src_mac_count, pkt_dev->dst_mac_count);
if (pkt_dev->flags & F_IPV6)
- p += sprintf(p, "IPV6 ");
+ seq_printf(seq, "IPV6 ");
if (pkt_dev->flags & F_IPSRC_RND)
- p += sprintf(p, "IPSRC_RND ");
+ seq_printf(seq, "IPSRC_RND ");
if (pkt_dev->flags & F_IPDST_RND)
- p += sprintf(p, "IPDST_RND ");
+ seq_printf(seq, "IPDST_RND ");
if (pkt_dev->flags & F_TXSIZE_RND)
- p += sprintf(p, "TXSIZE_RND ");
+ seq_printf(seq, "TXSIZE_RND ");
if (pkt_dev->flags & F_UDPSRC_RND)
- p += sprintf(p, "UDPSRC_RND ");
+ seq_printf(seq, "UDPSRC_RND ");
if (pkt_dev->flags & F_UDPDST_RND)
- p += sprintf(p, "UDPDST_RND ");
+ seq_printf(seq, "UDPDST_RND ");
if (pkt_dev->flags & F_MACSRC_RND)
- p += sprintf(p, "MACSRC_RND ");
+ seq_printf(seq, "MACSRC_RND ");
if (pkt_dev->flags & F_MACDST_RND)
- p += sprintf(p, "MACDST_RND ");
+ seq_printf(seq, "MACDST_RND ");
- p += sprintf(p, "\n");
+ seq_puts(seq, "\n");
sa = pkt_dev->started_at;
stopped = pkt_dev->stopped_at;
if (pkt_dev->running)
stopped = now; /* not really stopped, more like last-running-at */
- p += sprintf(p, "Current:\n pkts-sofar: %llu errors: %llu\n started: %lluus stopped: %lluus idle: %lluus\n",
- (unsigned long long) pkt_dev->sofar,
- (unsigned long long) pkt_dev->errors,
- (unsigned long long) sa,
- (unsigned long long) stopped,
- (unsigned long long) pkt_dev->idle_acc);
+ seq_printf(seq, "Current:\n pkts-sofar: %llu errors: %llu\n started: %lluus stopped: %lluus idle: %lluus\n",
+ (unsigned long long) pkt_dev->sofar,
+ (unsigned long long) pkt_dev->errors,
+ (unsigned long long) sa,
+ (unsigned long long) stopped,
+ (unsigned long long) pkt_dev->idle_acc);
- p += sprintf(p, " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n",
- pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset, pkt_dev->cur_src_mac_offset);
+ seq_printf(seq, " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n",
+ pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset,
+ pkt_dev->cur_src_mac_offset);
if(pkt_dev->flags & F_IPV6) {
char b1[128], b2[128];
fmt_ip6(b1, pkt_dev->cur_in6_daddr.s6_addr);
fmt_ip6(b2, pkt_dev->cur_in6_saddr.s6_addr);
- p += sprintf(p, " cur_saddr: %s cur_daddr: %s\n", b2, b1);
+ seq_printf(seq, " cur_saddr: %s cur_daddr: %s\n", b2, b1);
}
else
- p += sprintf(p, " cur_saddr: 0x%x cur_daddr: 0x%x\n",
- pkt_dev->cur_saddr, pkt_dev->cur_daddr);
+ seq_printf(seq, " cur_saddr: 0x%x cur_daddr: 0x%x\n",
+ pkt_dev->cur_saddr, pkt_dev->cur_daddr);
- p += sprintf(p, " cur_udp_dst: %d cur_udp_src: %d\n",
- pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src);
+ seq_printf(seq, " cur_udp_dst: %d cur_udp_src: %d\n",
+ pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src);
- p += sprintf(p, " flows: %u\n", pkt_dev->nflows);
+ seq_printf(seq, " flows: %u\n", pkt_dev->nflows);
if (pkt_dev->result[0])
- p += sprintf(p, "Result: %s\n", pkt_dev->result);
+ seq_printf(seq, "Result: %s\n", pkt_dev->result);
else
- p += sprintf(p, "Result: Idle\n");
- *eof = 1;
+ seq_printf(seq, "Result: Idle\n");
- return p - buf;
+ return 0;
}
@@ -802,13 +757,14 @@ done_str:
return i;
}
-static int proc_if_write(struct file *file, const char __user *user_buffer,
- unsigned long count, void *data)
+static ssize_t pktgen_if_write(struct file *file, const char __user *user_buffer,
+ size_t count, loff_t *offset)
{
+ struct seq_file *seq = (struct seq_file *) file->private_data;
+ struct pktgen_dev *pkt_dev = seq->private;
int i = 0, max, len;
char name[16], valstr[32];
unsigned long value = 0;
- struct pktgen_dev *pkt_dev = (struct pktgen_dev*)(data);
char* pg_result = NULL;
int tmp = 0;
char buf[128];
@@ -849,7 +805,8 @@ static int proc_if_write(struct file *file, const char __user *user_buffer,
if (copy_from_user(tb, user_buffer, count))
return -EFAULT;
tb[count] = 0;
- printk("pktgen: %s,%lu buffer -:%s:-\n", name, count, tb);
+ printk("pktgen: %s,%lu buffer -:%s:-\n", name,
+ (unsigned long) count, tb);
}
if (!strcmp(name, "min_pkt_size")) {
@@ -1335,92 +1292,98 @@ static int proc_if_write(struct file *file, const char __user *user_buffer,
return -EINVAL;
}
-static int proc_thread_read(char *buf , char **start, off_t offset,
- int len, int *eof, void *data)
+static int pktgen_if_open(struct inode *inode, struct file *file)
{
- char *p;
- struct pktgen_thread *t = (struct pktgen_thread*)(data);
- struct pktgen_dev *pkt_dev = NULL;
+ return single_open(file, pktgen_if_show, PDE(inode)->data);
+}
+static struct file_operations pktgen_if_fops = {
+ .owner = THIS_MODULE,
+ .open = pktgen_if_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = pktgen_if_write,
+ .release = single_release,
+};
- if (!t) {
- printk("pktgen: ERROR: could not find thread in proc_thread_read\n");
- return -EINVAL;
- }
+static int pktgen_thread_show(struct seq_file *seq, void *v)
+{
+ struct pktgen_thread *t = seq->private;
+ struct pktgen_dev *pkt_dev = NULL;
- p = buf;
- p += sprintf(p, "Name: %s max_before_softirq: %d\n",
+ BUG_ON(!t);
+
+ seq_printf(seq, "Name: %s max_before_softirq: %d\n",
t->name, t->max_before_softirq);
- p += sprintf(p, "Running: ");
+ seq_printf(seq, "Running: ");
if_lock(t);
for(pkt_dev = t->if_list;pkt_dev; pkt_dev = pkt_dev->next)
if(pkt_dev->running)
- p += sprintf(p, "%s ", pkt_dev->ifname);
+ seq_printf(seq, "%s ", pkt_dev->ifname);
- p += sprintf(p, "\nStopped: ");
+ seq_printf(seq, "\nStopped: ");
for(pkt_dev = t->if_list;pkt_dev; pkt_dev = pkt_dev->next)
if(!pkt_dev->running)
- p += sprintf(p, "%s ", pkt_dev->ifname);
+ seq_printf(seq, "%s ", pkt_dev->ifname);
if (t->result[0])
- p += sprintf(p, "\nResult: %s\n", t->result);
+ seq_printf(seq, "\nResult: %s\n", t->result);
else
- p += sprintf(p, "\nResult: NA\n");
-
- *eof = 1;
+ seq_printf(seq, "\nResult: NA\n");
if_unlock(t);
- return p - buf;
+ return 0;
}
-static int proc_thread_write(struct file *file, const char __user *user_buffer,
- unsigned long count, void *data)
+static ssize_t pktgen_thread_write(struct file *file,
+ const char __user *user_buffer,
+ size_t count, loff_t *offset)
{
+ struct seq_file *seq = (struct seq_file *) file->private_data;
+ struct pktgen_thread *t = seq->private;
int i = 0, max, len, ret;
char name[40];
- struct pktgen_thread *t;
char *pg_result;
unsigned long value = 0;
-
+
if (count < 1) {
// sprintf(pg_result, "Wrong command format");
return -EINVAL;
}
-
+
max = count - i;
len = count_trail_chars(&user_buffer[i], max);
- if (len < 0)
- return len;
-
+ if (len < 0)
+ return len;
+
i += len;
-
+
/* Read variable name */
len = strn_len(&user_buffer[i], sizeof(name) - 1);
- if (len < 0)
- return len;
+ if (len < 0)
+ return len;
memset(name, 0, sizeof(name));
if (copy_from_user(name, &user_buffer[i], len))
return -EFAULT;
i += len;
-
+
max = count -i;
len = count_trail_chars(&user_buffer[i], max);
- if (len < 0)
- return len;
-
+ if (len < 0)
+ return len;
+
i += len;
- if (debug)
- printk("pktgen: t=%s, count=%lu\n", name, count);
-
+ if (debug)
+ printk("pktgen: t=%s, count=%lu\n", name,
+ (unsigned long) count);
- t = (struct pktgen_thread*)(data);
if(!t) {
printk("pktgen: ERROR: No thread\n");
ret = -EINVAL;
@@ -1452,8 +1415,7 @@ static int proc_thread_write(struct file *file, const char __user *user_buffer,
thread_lock();
t->control |= T_REMDEV;
thread_unlock();
- current->state = TASK_INTERRUPTIBLE;
- schedule_timeout(HZ/8); /* Propagate thread->control */
+ schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */
ret = count;
sprintf(pg_result, "OK: rem_device_all");
goto out;
@@ -1475,32 +1437,19 @@ static int proc_thread_write(struct file *file, const char __user *user_buffer,
return ret;
}
-static int create_proc_dir(void)
+static int pktgen_thread_open(struct inode *inode, struct file *file)
{
- int len;
- /* does proc_dir already exists */
- len = strlen(PG_PROC_DIR);
-
- for (pg_proc_dir = proc_net->subdir; pg_proc_dir; pg_proc_dir=pg_proc_dir->next) {
- if ((pg_proc_dir->namelen == len) &&
- (! memcmp(pg_proc_dir->name, PG_PROC_DIR, len)))
- break;
- }
-
- if (!pg_proc_dir)
- pg_proc_dir = create_proc_entry(PG_PROC_DIR, S_IFDIR, proc_net);
-
- if (!pg_proc_dir)
- return -ENODEV;
-
- return 0;
+ return single_open(file, pktgen_thread_show, PDE(inode)->data);
}
-static int remove_proc_dir(void)
-{
- remove_proc_entry(PG_PROC_DIR, proc_net);
- return 0;
-}
+static struct file_operations pktgen_thread_fops = {
+ .owner = THIS_MODULE,
+ .open = pktgen_thread_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = pktgen_thread_write,
+ .release = single_release,
+};
/* Think find or remove for NN */
static struct pktgen_dev *__pktgen_NN_threads(const char* ifname, int remove)
@@ -1679,13 +1628,12 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
struct in_device *in_dev;
rcu_read_lock();
- in_dev = __in_dev_get(pkt_dev->odev);
+ in_dev = __in_dev_get_rcu(pkt_dev->odev);
if (in_dev) {
if (in_dev->ifa_list) {
pkt_dev->saddr_min = in_dev->ifa_list->ifa_address;
pkt_dev->saddr_max = pkt_dev->saddr_min;
}
- __in_dev_put(in_dev);
}
rcu_read_unlock();
}
@@ -1715,11 +1663,10 @@ static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us)
start = now = getCurUs();
printk(KERN_INFO "sleeping for %d\n", (int)(spin_until_us - now));
while (now < spin_until_us) {
- /* TODO: optimise sleeping behavior */
- if (spin_until_us - now > (1000000/HZ)+1) {
- current->state = TASK_INTERRUPTIBLE;
- schedule_timeout(1);
- } else if (spin_until_us - now > 100) {
+ /* TODO: optimize sleeping behavior */
+ if (spin_until_us - now > jiffies_to_usecs(1)+1)
+ schedule_timeout_interruptible(1);
+ else if (spin_until_us - now > 100) {
do_softirq();
if (!pkt_dev->running)
return;
@@ -2375,7 +2322,7 @@ static void pktgen_stop_all_threads_ifs(void)
pktgen_stop(t);
t = t->next;
}
- thread_unlock();
+ thread_unlock();
}
static int thread_is_running(struct pktgen_thread *t )
@@ -2449,8 +2396,7 @@ static void pktgen_run_all_threads(void)
}
thread_unlock();
- current->state = TASK_INTERRUPTIBLE;
- schedule_timeout(HZ/8); /* Propagate thread->control */
+ schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */
pktgen_wait_all_threads_run();
}
@@ -2567,10 +2513,9 @@ static void pktgen_rem_thread(struct pktgen_thread *t)
struct pktgen_thread *tmp = pktgen_threads;
- if (strlen(t->fname))
- remove_proc_entry(t->fname, NULL);
+ remove_proc_entry(t->name, pg_proc_dir);
- thread_lock();
+ thread_lock();
if (tmp == t)
pktgen_threads = tmp->next;
@@ -2840,7 +2785,7 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, const char* i
if_lock(t);
for(pkt_dev=t->if_list; pkt_dev; pkt_dev = pkt_dev->next ) {
- if (strcmp(pkt_dev->ifname, ifname) == 0) {
+ if (strncmp(pkt_dev->ifname, ifname, IFNAMSIZ) == 0) {
break;
}
}
@@ -2879,74 +2824,70 @@ static int add_dev_to_thread(struct pktgen_thread *t, struct pktgen_dev *pkt_dev
static int pktgen_add_device(struct pktgen_thread *t, const char* ifname)
{
struct pktgen_dev *pkt_dev;
+ struct proc_dir_entry *pe;
/* We don't allow a device to be on several threads */
- if( (pkt_dev = __pktgen_NN_threads(ifname, FIND)) == NULL) {
-
- pkt_dev = kmalloc(sizeof(struct pktgen_dev), GFP_KERNEL);
- if (!pkt_dev)
- return -ENOMEM;
+ pkt_dev = __pktgen_NN_threads(ifname, FIND);
+ if (pkt_dev) {
+ printk("pktgen: ERROR: interface already used.\n");
+ return -EBUSY;
+ }
- memset(pkt_dev, 0, sizeof(struct pktgen_dev));
+ pkt_dev = kzalloc(sizeof(struct pktgen_dev), GFP_KERNEL);
+ if (!pkt_dev)
+ return -ENOMEM;
- pkt_dev->flows = vmalloc(MAX_CFLOWS*sizeof(struct flow_state));
- if (pkt_dev->flows == NULL) {
- kfree(pkt_dev);
- return -ENOMEM;
- }
- memset(pkt_dev->flows, 0, MAX_CFLOWS*sizeof(struct flow_state));
-
- pkt_dev->min_pkt_size = ETH_ZLEN;
- pkt_dev->max_pkt_size = ETH_ZLEN;
- pkt_dev->nfrags = 0;
- pkt_dev->clone_skb = pg_clone_skb_d;
- pkt_dev->delay_us = pg_delay_d / 1000;
- pkt_dev->delay_ns = pg_delay_d % 1000;
- pkt_dev->count = pg_count_d;
- pkt_dev->sofar = 0;
- pkt_dev->udp_src_min = 9; /* sink port */
- pkt_dev->udp_src_max = 9;
- pkt_dev->udp_dst_min = 9;
- pkt_dev->udp_dst_max = 9;
-
- strncpy(pkt_dev->ifname, ifname, 31);
- sprintf(pkt_dev->fname, "net/%s/%s", PG_PROC_DIR, ifname);
-
- if (! pktgen_setup_dev(pkt_dev)) {
- printk("pktgen: ERROR: pktgen_setup_dev failed.\n");
- if (pkt_dev->flows)
- vfree(pkt_dev->flows);
- kfree(pkt_dev);
- return -ENODEV;
- }
+ pkt_dev->flows = vmalloc(MAX_CFLOWS*sizeof(struct flow_state));
+ if (pkt_dev->flows == NULL) {
+ kfree(pkt_dev);
+ return -ENOMEM;
+ }
+ memset(pkt_dev->flows, 0, MAX_CFLOWS*sizeof(struct flow_state));
- pkt_dev->proc_ent = create_proc_entry(pkt_dev->fname, 0600, NULL);
- if (!pkt_dev->proc_ent) {
- printk("pktgen: cannot create %s procfs entry.\n", pkt_dev->fname);
- if (pkt_dev->flows)
- vfree(pkt_dev->flows);
- kfree(pkt_dev);
- return -EINVAL;
- }
- pkt_dev->proc_ent->read_proc = proc_if_read;
- pkt_dev->proc_ent->write_proc = proc_if_write;
- pkt_dev->proc_ent->data = (void*)(pkt_dev);
- pkt_dev->proc_ent->owner = THIS_MODULE;
+ pkt_dev->min_pkt_size = ETH_ZLEN;
+ pkt_dev->max_pkt_size = ETH_ZLEN;
+ pkt_dev->nfrags = 0;
+ pkt_dev->clone_skb = pg_clone_skb_d;
+ pkt_dev->delay_us = pg_delay_d / 1000;
+ pkt_dev->delay_ns = pg_delay_d % 1000;
+ pkt_dev->count = pg_count_d;
+ pkt_dev->sofar = 0;
+ pkt_dev->udp_src_min = 9; /* sink port */
+ pkt_dev->udp_src_max = 9;
+ pkt_dev->udp_dst_min = 9;
+ pkt_dev->udp_dst_max = 9;
+
+ strncpy(pkt_dev->ifname, ifname, IFNAMSIZ);
+
+ if (! pktgen_setup_dev(pkt_dev)) {
+ printk("pktgen: ERROR: pktgen_setup_dev failed.\n");
+ if (pkt_dev->flows)
+ vfree(pkt_dev->flows);
+ kfree(pkt_dev);
+ return -ENODEV;
+ }
+
+ pe = create_proc_entry(ifname, 0600, pg_proc_dir);
+ if (!pe) {
+ printk("pktgen: cannot create %s/%s procfs entry.\n",
+ PG_PROC_DIR, ifname);
+ if (pkt_dev->flows)
+ vfree(pkt_dev->flows);
+ kfree(pkt_dev);
+ return -EINVAL;
+ }
+ pe->proc_fops = &pktgen_if_fops;
+ pe->data = pkt_dev;
- return add_dev_to_thread(t, pkt_dev);
- }
- else {
- printk("pktgen: ERROR: interface already used.\n");
- return -EBUSY;
- }
+ return add_dev_to_thread(t, pkt_dev);
}
static struct pktgen_thread *pktgen_find_thread(const char* name)
{
struct pktgen_thread *t = NULL;
- thread_lock();
+ thread_lock();
t = pktgen_threads;
while (t) {
@@ -2962,6 +2903,7 @@ static struct pktgen_thread *pktgen_find_thread(const char* name)
static int pktgen_create_thread(const char* name, int cpu)
{
struct pktgen_thread *t = NULL;
+ struct proc_dir_entry *pe;
if (strlen(name) > 31) {
printk("pktgen: ERROR: Thread name cannot be more than 31 characters.\n");
@@ -2973,28 +2915,26 @@ static int pktgen_create_thread(const char* name, int cpu)
return -EINVAL;
}
- t = (struct pktgen_thread*)(kmalloc(sizeof(struct pktgen_thread), GFP_KERNEL));
+ t = kzalloc(sizeof(struct pktgen_thread), GFP_KERNEL);
if (!t) {
printk("pktgen: ERROR: out of memory, can't create new thread.\n");
return -ENOMEM;
}
- memset(t, 0, sizeof(struct pktgen_thread));
strcpy(t->name, name);
spin_lock_init(&t->if_lock);
t->cpu = cpu;
- sprintf(t->fname, "net/%s/%s", PG_PROC_DIR, t->name);
- t->proc_ent = create_proc_entry(t->fname, 0600, NULL);
- if (!t->proc_ent) {
- printk("pktgen: cannot create %s procfs entry.\n", t->fname);
+ pe = create_proc_entry(t->name, 0600, pg_proc_dir);
+ if (!pe) {
+ printk("pktgen: cannot create %s/%s procfs entry.\n",
+ PG_PROC_DIR, t->name);
kfree(t);
return -EINVAL;
}
- t->proc_ent->read_proc = proc_thread_read;
- t->proc_ent->write_proc = proc_thread_write;
- t->proc_ent->data = (void*)(t);
- t->proc_ent->owner = THIS_MODULE;
+
+ pe->proc_fops = &pktgen_thread_fops;
+ pe->data = t;
t->next = pktgen_threads;
pktgen_threads = t;
@@ -3049,8 +2989,7 @@ static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *pkt_
/* Clean up proc file system */
- if (strlen(pkt_dev->fname))
- remove_proc_entry(pkt_dev->fname, NULL);
+ remove_proc_entry(pkt_dev->ifname, pg_proc_dir);
if (pkt_dev->flows)
vfree(pkt_dev->flows);
@@ -3061,31 +3000,31 @@ static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *pkt_
static int __init pg_init(void)
{
int cpu;
- printk(version);
+ struct proc_dir_entry *pe;
- module_fname[0] = 0;
+ printk(version);
- create_proc_dir();
+ pg_proc_dir = proc_mkdir(PG_PROC_DIR, proc_net);
+ if (!pg_proc_dir)
+ return -ENODEV;
+ pg_proc_dir->owner = THIS_MODULE;
- sprintf(module_fname, "net/%s/pgctrl", PG_PROC_DIR);
- module_proc_ent = create_proc_entry(module_fname, 0600, NULL);
- if (!module_proc_ent) {
- printk("pktgen: ERROR: cannot create %s procfs entry.\n", module_fname);
+ pe = create_proc_entry(PGCTRL, 0600, pg_proc_dir);
+ if (pe == NULL) {
+ printk("pktgen: ERROR: cannot create %s procfs entry.\n", PGCTRL);
+ proc_net_remove(PG_PROC_DIR);
return -EINVAL;
}
- module_proc_ent->proc_fops = &pktgen_fops;
- module_proc_ent->data = NULL;
+ pe->proc_fops = &pktgen_fops;
+ pe->data = NULL;
/* Register us to receive netdevice events */
register_netdevice_notifier(&pktgen_notifier_block);
- for (cpu = 0; cpu < NR_CPUS ; cpu++) {
+ for_each_online_cpu(cpu) {
char buf[30];
- if (!cpu_online(cpu))
- continue;
-
sprintf(buf, "kpktgend_%i", cpu);
pktgen_create_thread(buf, cpu);
}
@@ -3110,10 +3049,8 @@ static void __exit pg_cleanup(void)
unregister_netdevice_notifier(&pktgen_notifier_block);
/* Clean up proc file system */
-
- remove_proc_entry(module_fname, NULL);
-
- remove_proc_dir();
+ remove_proc_entry(PGCTRL, pg_proc_dir);
+ proc_net_remove(PG_PROC_DIR);
}
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index bb55675f068..b8203de5ff0 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -32,7 +32,6 @@
* Further increasing requires to change hash table size.
*/
int sysctl_max_syn_backlog = 256;
-EXPORT_SYMBOL(sysctl_max_syn_backlog);
int reqsk_queue_alloc(struct request_sock_queue *queue,
const int nr_table_entries)
@@ -53,6 +52,8 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
rwlock_init(&queue->syn_wait_lock);
queue->rskq_accept_head = queue->rskq_accept_head = NULL;
+ queue->rskq_defer_accept = 0;
+ lopt->nr_table_entries = nr_table_entries;
write_lock_bh(&queue->syn_wait_lock);
queue->listen_opt = lopt;
@@ -62,3 +63,28 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
}
EXPORT_SYMBOL(reqsk_queue_alloc);
+
+void reqsk_queue_destroy(struct request_sock_queue *queue)
+{
+ /* make all the listen_opt local to us */
+ struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
+
+ if (lopt->qlen != 0) {
+ int i;
+
+ for (i = 0; i < lopt->nr_table_entries; i++) {
+ struct request_sock *req;
+
+ while ((req = lopt->syn_table[i]) != NULL) {
+ lopt->syn_table[i] = req->dl_next;
+ lopt->qlen--;
+ reqsk_free(req);
+ }
+ }
+ }
+
+ BUG_TRAP(lopt->qlen == 0);
+ kfree(lopt);
+}
+
+EXPORT_SYMBOL(reqsk_queue_destroy);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 4b1bb30e638..9bed7569ce3 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -148,7 +148,7 @@ int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
{
int err = 0;
- NETLINK_CB(skb).dst_groups = group;
+ NETLINK_CB(skb).dst_group = group;
if (echo)
atomic_inc(&skb->users);
netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
@@ -458,8 +458,8 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
kfree_skb(skb);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_LINK;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_KERNEL);
+ NETLINK_CB(skb).dst_group = RTNLGRP_LINK;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_KERNEL);
}
static int rtnetlink_done(struct netlink_callback *cb)
@@ -708,7 +708,8 @@ void __init rtnetlink_init(void)
if (!rta_buf)
panic("rtnetlink_init: cannot allocate rta_buf\n");
- rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv);
+ rtnl = netlink_kernel_create(NETLINK_ROUTE, RTNLGRP_MAX, rtnetlink_rcv,
+ THIS_MODULE);
if (rtnl == NULL)
panic("rtnetlink_init: cannot initialize rtnetlink\n");
netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7eab867ede5..95501e40100 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -68,7 +68,8 @@
#include <asm/uaccess.h>
#include <asm/system.h>
-static kmem_cache_t *skbuff_head_cache;
+static kmem_cache_t *skbuff_head_cache __read_mostly;
+static kmem_cache_t *skbuff_fclone_cache __read_mostly;
/*
* Keep out-of-line to prevent kernel bloat.
@@ -118,9 +119,11 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
*/
/**
- * alloc_skb - allocate a network buffer
+ * __alloc_skb - allocate a network buffer
* @size: size to allocate
* @gfp_mask: allocation mask
+ * @fclone: allocate from fclone cache instead of head cache
+ * and allocate a cloned (child) skb
*
* Allocate a new &sk_buff. The returned buffer has no headroom and a
* tail room of size bytes. The object has a reference count of one.
@@ -129,14 +132,20 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
* Buffers may only be allocated from interrupts using a @gfp_mask of
* %GFP_ATOMIC.
*/
-struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask)
+struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
+ int fclone)
{
struct sk_buff *skb;
u8 *data;
/* Get the HEAD */
- skb = kmem_cache_alloc(skbuff_head_cache,
- gfp_mask & ~__GFP_DMA);
+ if (fclone)
+ skb = kmem_cache_alloc(skbuff_fclone_cache,
+ gfp_mask & ~__GFP_DMA);
+ else
+ skb = kmem_cache_alloc(skbuff_head_cache,
+ gfp_mask & ~__GFP_DMA);
+
if (!skb)
goto out;
@@ -153,12 +162,22 @@ struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask)
skb->data = data;
skb->tail = data;
skb->end = data + size;
+ if (fclone) {
+ struct sk_buff *child = skb + 1;
+ atomic_t *fclone_ref = (atomic_t *) (child + 1);
+ skb->fclone = SKB_FCLONE_ORIG;
+ atomic_set(fclone_ref, 1);
+
+ child->fclone = SKB_FCLONE_UNAVAILABLE;
+ }
atomic_set(&(skb_shinfo(skb)->dataref), 1);
skb_shinfo(skb)->nr_frags = 0;
skb_shinfo(skb)->tso_size = 0;
skb_shinfo(skb)->tso_segs = 0;
skb_shinfo(skb)->frag_list = NULL;
+ skb_shinfo(skb)->ufo_size = 0;
+ skb_shinfo(skb)->ip6_frag_id = 0;
out:
return skb;
nodata:
@@ -183,7 +202,7 @@ nodata:
*/
struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
unsigned int size,
- unsigned int __nocast gfp_mask)
+ gfp_t gfp_mask)
{
struct sk_buff *skb;
u8 *data;
@@ -266,8 +285,34 @@ void skb_release_data(struct sk_buff *skb)
*/
void kfree_skbmem(struct sk_buff *skb)
{
+ struct sk_buff *other;
+ atomic_t *fclone_ref;
+
skb_release_data(skb);
- kmem_cache_free(skbuff_head_cache, skb);
+ switch (skb->fclone) {
+ case SKB_FCLONE_UNAVAILABLE:
+ kmem_cache_free(skbuff_head_cache, skb);
+ break;
+
+ case SKB_FCLONE_ORIG:
+ fclone_ref = (atomic_t *) (skb + 2);
+ if (atomic_dec_and_test(fclone_ref))
+ kmem_cache_free(skbuff_fclone_cache, skb);
+ break;
+
+ case SKB_FCLONE_CLONE:
+ fclone_ref = (atomic_t *) (skb + 1);
+ other = skb - 1;
+
+ /* The clone portion is available for
+ * fast-cloning again.
+ */
+ skb->fclone = SKB_FCLONE_UNAVAILABLE;
+
+ if (atomic_dec_and_test(fclone_ref))
+ kmem_cache_free(skbuff_fclone_cache, other);
+ break;
+ };
}
/**
@@ -281,8 +326,6 @@ void kfree_skbmem(struct sk_buff *skb)
void __kfree_skb(struct sk_buff *skb)
{
- BUG_ON(skb->list != NULL);
-
dst_release(skb->dst);
#ifdef CONFIG_XFRM
secpath_put(skb->sp);
@@ -302,7 +345,6 @@ void __kfree_skb(struct sk_buff *skb)
skb->tc_index = 0;
#ifdef CONFIG_NET_CLS_ACT
skb->tc_verd = 0;
- skb->tc_classid = 0;
#endif
#endif
@@ -323,21 +365,29 @@ void __kfree_skb(struct sk_buff *skb)
* %GFP_ATOMIC.
*/
-struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
+struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
- struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
-
- if (!n)
- return NULL;
+ struct sk_buff *n;
+
+ n = skb + 1;
+ if (skb->fclone == SKB_FCLONE_ORIG &&
+ n->fclone == SKB_FCLONE_UNAVAILABLE) {
+ atomic_t *fclone_ref = (atomic_t *) (n + 1);
+ n->fclone = SKB_FCLONE_CLONE;
+ atomic_inc(fclone_ref);
+ } else {
+ n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+ if (!n)
+ return NULL;
+ n->fclone = SKB_FCLONE_UNAVAILABLE;
+ }
#define C(x) n->x = skb->x
n->next = n->prev = NULL;
- n->list = NULL;
n->sk = NULL;
- C(stamp);
+ C(tstamp);
C(dev);
- C(real_dev);
C(h);
C(nh);
C(mac);
@@ -361,18 +411,17 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
n->destructor = NULL;
#ifdef CONFIG_NETFILTER
C(nfmark);
- C(nfcache);
C(nfct);
nf_conntrack_get(skb->nfct);
C(nfctinfo);
+#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+ C(ipvs_property);
+#endif
#ifdef CONFIG_BRIDGE_NETFILTER
C(nf_bridge);
nf_bridge_get(skb->nf_bridge);
#endif
#endif /*CONFIG_NETFILTER*/
-#if defined(CONFIG_HIPPI)
- C(private);
-#endif
#ifdef CONFIG_NET_SCHED
C(tc_index);
#ifdef CONFIG_NET_CLS_ACT
@@ -380,7 +429,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
C(input_dev);
- C(tc_classid);
#endif
#endif
@@ -404,10 +452,8 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
*/
unsigned long offset = new->data - old->data;
- new->list = NULL;
new->sk = NULL;
new->dev = old->dev;
- new->real_dev = old->real_dev;
new->priority = old->priority;
new->protocol = old->protocol;
new->dst = dst_clone(old->dst);
@@ -419,15 +465,18 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->mac.raw = old->mac.raw + offset;
memcpy(new->cb, old->cb, sizeof(old->cb));
new->local_df = old->local_df;
+ new->fclone = SKB_FCLONE_UNAVAILABLE;
new->pkt_type = old->pkt_type;
- new->stamp = old->stamp;
+ new->tstamp = old->tstamp;
new->destructor = NULL;
#ifdef CONFIG_NETFILTER
new->nfmark = old->nfmark;
- new->nfcache = old->nfcache;
new->nfct = old->nfct;
nf_conntrack_get(old->nfct);
new->nfctinfo = old->nfctinfo;
+#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+ new->ipvs_property = old->ipvs_property;
+#endif
#ifdef CONFIG_BRIDGE_NETFILTER
new->nf_bridge = old->nf_bridge;
nf_bridge_get(old->nf_bridge);
@@ -461,7 +510,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
* header is going to be modified. Use pskb_copy() instead.
*/
-struct sk_buff *skb_copy(const struct sk_buff *skb, unsigned int __nocast gfp_mask)
+struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
int headerlen = skb->data - skb->head;
/*
@@ -500,7 +549,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, unsigned int __nocast gfp_ma
* The returned buffer has a reference count of 1.
*/
-struct sk_buff *pskb_copy(struct sk_buff *skb, unsigned int __nocast gfp_mask)
+struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
{
/*
* Allocate the copy buffer
@@ -559,7 +608,7 @@ out:
*/
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
- unsigned int __nocast gfp_mask)
+ gfp_t gfp_mask)
{
int i;
u8 *data;
@@ -650,7 +699,7 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
*/
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
int newheadroom, int newtailroom,
- unsigned int __nocast gfp_mask)
+ gfp_t gfp_mask)
{
/*
* Allocate the copy buffer
@@ -1344,50 +1393,43 @@ void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
__skb_queue_tail(list, newsk);
spin_unlock_irqrestore(&list->lock, flags);
}
+
/**
* skb_unlink - remove a buffer from a list
* @skb: buffer to remove
+ * @list: list to use
*
- * Place a packet after a given packet in a list. The list locks are taken
- * and this function is atomic with respect to other list locked calls
+ * Remove a packet from a list. The list locks are taken and this
+ * function is atomic with respect to other list locked calls
*
- * Works even without knowing the list it is sitting on, which can be
- * handy at times. It also means that THE LIST MUST EXIST when you
- * unlink. Thus a list must have its contents unlinked before it is
- * destroyed.
+ * You must know what list the SKB is on.
*/
-void skb_unlink(struct sk_buff *skb)
+void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
- struct sk_buff_head *list = skb->list;
-
- if (list) {
- unsigned long flags;
+ unsigned long flags;
- spin_lock_irqsave(&list->lock, flags);
- if (skb->list == list)
- __skb_unlink(skb, skb->list);
- spin_unlock_irqrestore(&list->lock, flags);
- }
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_unlink(skb, list);
+ spin_unlock_irqrestore(&list->lock, flags);
}
-
/**
* skb_append - append a buffer
* @old: buffer to insert after
* @newsk: buffer to insert
+ * @list: list to use
*
* Place a packet after a given packet in a list. The list locks are taken
* and this function is atomic with respect to other list locked calls.
* A buffer cannot be placed on two lists at the same time.
*/
-
-void skb_append(struct sk_buff *old, struct sk_buff *newsk)
+void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
{
unsigned long flags;
- spin_lock_irqsave(&old->list->lock, flags);
- __skb_append(old, newsk);
- spin_unlock_irqrestore(&old->list->lock, flags);
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_append(old, newsk, list);
+ spin_unlock_irqrestore(&list->lock, flags);
}
@@ -1395,19 +1437,21 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk)
* skb_insert - insert a buffer
* @old: buffer to insert before
* @newsk: buffer to insert
+ * @list: list to use
+ *
+ * Place a packet before a given packet in a list. The list locks are
+ * taken and this function is atomic with respect to other list locked
+ * calls.
*
- * Place a packet before a given packet in a list. The list locks are taken
- * and this function is atomic with respect to other list locked calls
* A buffer cannot be placed on two lists at the same time.
*/
-
-void skb_insert(struct sk_buff *old, struct sk_buff *newsk)
+void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
{
unsigned long flags;
- spin_lock_irqsave(&old->list->lock, flags);
- __skb_insert(newsk, old->prev, old, old->list);
- spin_unlock_irqrestore(&old->list->lock, flags);
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_insert(newsk, old->prev, old, list);
+ spin_unlock_irqrestore(&list->lock, flags);
}
#if 0
@@ -1654,6 +1698,78 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
return textsearch_find(config, state);
}
+/**
+ * skb_append_datato_frags: - append the user data to a skb
+ * @sk: sock structure
+ * @skb: skb structure to be appended with user data.
+ * @getfrag: call back function to be used for getting the user data
+ * @from: pointer to user message iov
+ * @length: length of the iov message
+ *
+ * Description: This procedure appends the user data to the fragment part
+ * of the skb. If any page allocation fails, this procedure returns -ENOMEM.
+ */
+int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
+ int getfrag(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length)
+{
+ int frg_cnt = 0;
+ skb_frag_t *frag = NULL;
+ struct page *page = NULL;
+ int copy, left;
+ int offset = 0;
+ int ret;
+
+ do {
+ /* Return error if we don't have space for new frag */
+ frg_cnt = skb_shinfo(skb)->nr_frags;
+ if (frg_cnt >= MAX_SKB_FRAGS)
+ return -EFAULT;
+
+ /* allocate a new page for next frag */
+ page = alloc_pages(sk->sk_allocation, 0);
+
+ /* If alloc_page fails just return failure and caller will
+ * free previously allocated pages by doing kfree_skb()
+ */
+ if (page == NULL)
+ return -ENOMEM;
+
+ /* initialize the next frag */
+ sk->sk_sndmsg_page = page;
+ sk->sk_sndmsg_off = 0;
+ skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
+ skb->truesize += PAGE_SIZE;
+ atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
+
+ /* get the new initialized frag */
+ frg_cnt = skb_shinfo(skb)->nr_frags;
+ frag = &skb_shinfo(skb)->frags[frg_cnt - 1];
+
+ /* copy the user data to page */
+ left = PAGE_SIZE - frag->page_offset;
+ copy = (length > left)? left : length;
+
+ ret = getfrag(from, (page_address(frag->page) +
+ frag->page_offset + frag->size),
+ offset, copy, 0, skb);
+ if (ret < 0)
+ return -EFAULT;
+
+ /* copy was successful so update the size parameters */
+ sk->sk_sndmsg_off += copy;
+ frag->size += copy;
+ skb->len += copy;
+ skb->data_len += copy;
+ offset += copy;
+ length -= copy;
+
+ } while (length > 0);
+
+ return 0;
+}
+
void __init skb_init(void)
{
skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
@@ -1663,12 +1779,21 @@ void __init skb_init(void)
NULL, NULL);
if (!skbuff_head_cache)
panic("cannot create skbuff cache");
+
+ skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+ (2*sizeof(struct sk_buff)) +
+ sizeof(atomic_t),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (!skbuff_fclone_cache)
+ panic("cannot create skbuff cache");
}
EXPORT_SYMBOL(___pskb_trim);
EXPORT_SYMBOL(__kfree_skb);
EXPORT_SYMBOL(__pskb_pull_tail);
-EXPORT_SYMBOL(alloc_skb);
+EXPORT_SYMBOL(__alloc_skb);
EXPORT_SYMBOL(pskb_copy);
EXPORT_SYMBOL(pskb_expand_head);
EXPORT_SYMBOL(skb_checksum);
@@ -1696,3 +1821,4 @@ EXPORT_SYMBOL(skb_prepare_seq_read);
EXPORT_SYMBOL(skb_seq_read);
EXPORT_SYMBOL(skb_abort_seq_read);
EXPORT_SYMBOL(skb_find_text);
+EXPORT_SYMBOL(skb_append_datato_frags);
diff --git a/net/core/sock.c b/net/core/sock.c
index 12f6d9a2a52..9602ceb3bac 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -260,7 +260,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
if (val > sysctl_wmem_max)
val = sysctl_wmem_max;
-
+set_sndbuf:
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
if ((val * 2) < SOCK_MIN_SNDBUF)
sk->sk_sndbuf = SOCK_MIN_SNDBUF;
@@ -274,6 +274,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sk->sk_write_space(sk);
break;
+ case SO_SNDBUFFORCE:
+ if (!capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+ goto set_sndbuf;
+
case SO_RCVBUF:
/* Don't error on this BSD doesn't and if you think
about it this is right. Otherwise apps have to
@@ -282,7 +289,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
if (val > sysctl_rmem_max)
val = sysctl_rmem_max;
-
+set_rcvbuf:
sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
/* FIXME: is this lower bound the right one? */
if ((val * 2) < SOCK_MIN_RCVBUF)
@@ -291,6 +298,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sk->sk_rcvbuf = val * 2;
break;
+ case SO_RCVBUFFORCE:
+ if (!capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+ goto set_rcvbuf;
+
case SO_KEEPALIVE:
#ifdef CONFIG_INET
if (sk->sk_protocol == IPPROTO_TCP)
@@ -327,11 +341,11 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sock_reset_flag(sk, SOCK_LINGER);
else {
#if (BITS_PER_LONG == 32)
- if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
+ if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
else
#endif
- sk->sk_lingertime = ling.l_linger * HZ;
+ sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
sock_set_flag(sk, SOCK_LINGER);
}
break;
@@ -623,7 +637,7 @@ lenout:
* @prot: struct proto associated with this new sock instance
* @zero_it: if we should zero the newly allocated sock
*/
-struct sock *sk_alloc(int family, unsigned int __nocast priority,
+struct sock *sk_alloc(int family, gfp_t priority,
struct proto *prot, int zero_it)
{
struct sock *sk = NULL;
@@ -646,16 +660,20 @@ struct sock *sk_alloc(int family, unsigned int __nocast priority,
sock_lock_init(sk);
}
- if (security_sk_alloc(sk, family, priority)) {
- if (slab != NULL)
- kmem_cache_free(slab, sk);
- else
- kfree(sk);
- sk = NULL;
- } else
- __module_get(prot->owner);
+ if (security_sk_alloc(sk, family, priority))
+ goto out_free;
+
+ if (!try_module_get(prot->owner))
+ goto out_free;
}
return sk;
+
+out_free:
+ if (slab != NULL)
+ kmem_cache_free(slab, sk);
+ else
+ kfree(sk);
+ return NULL;
}
void sk_free(struct sock *sk)
@@ -686,6 +704,80 @@ void sk_free(struct sock *sk)
module_put(owner);
}
+struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
+{
+ struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
+
+ if (newsk != NULL) {
+ struct sk_filter *filter;
+
+ memcpy(newsk, sk, sk->sk_prot->obj_size);
+
+ /* SANITY */
+ sk_node_init(&newsk->sk_node);
+ sock_lock_init(newsk);
+ bh_lock_sock(newsk);
+
+ atomic_set(&newsk->sk_rmem_alloc, 0);
+ atomic_set(&newsk->sk_wmem_alloc, 0);
+ atomic_set(&newsk->sk_omem_alloc, 0);
+ skb_queue_head_init(&newsk->sk_receive_queue);
+ skb_queue_head_init(&newsk->sk_write_queue);
+
+ rwlock_init(&newsk->sk_dst_lock);
+ rwlock_init(&newsk->sk_callback_lock);
+
+ newsk->sk_dst_cache = NULL;
+ newsk->sk_wmem_queued = 0;
+ newsk->sk_forward_alloc = 0;
+ newsk->sk_send_head = NULL;
+ newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
+ newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+
+ sock_reset_flag(newsk, SOCK_DONE);
+ skb_queue_head_init(&newsk->sk_error_queue);
+
+ filter = newsk->sk_filter;
+ if (filter != NULL)
+ sk_filter_charge(newsk, filter);
+
+ if (unlikely(xfrm_sk_clone_policy(newsk))) {
+ /* It is still raw copy of parent, so invalidate
+ * destructor and make plain sk_free() */
+ newsk->sk_destruct = NULL;
+ sk_free(newsk);
+ newsk = NULL;
+ goto out;
+ }
+
+ newsk->sk_err = 0;
+ newsk->sk_priority = 0;
+ atomic_set(&newsk->sk_refcnt, 2);
+
+ /*
+ * Increment the counter in the same struct proto as the master
+ * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
+ * is the same as sk->sk_prot->socks, as this field was copied
+ * with memcpy).
+ *
+ * This _changes_ the previous behaviour, where
+ * tcp_create_openreq_child always was incrementing the
+ * equivalent to tcp_prot->socks (inet_sock_nr), so this has
+ * to be taken into account in all callers. -acme
+ */
+ sk_refcnt_debug_inc(newsk);
+ newsk->sk_socket = NULL;
+ newsk->sk_sleep = NULL;
+
+ if (newsk->sk_prot->sockets_allocated)
+ atomic_inc(newsk->sk_prot->sockets_allocated);
+ }
+out:
+ return newsk;
+}
+
+EXPORT_SYMBOL_GPL(sk_clone);
+
void __init sk_init(void)
{
if (num_physpages <= 4096) {
@@ -753,7 +845,7 @@ unsigned long sock_i_ino(struct sock *sk)
* Allocate a skb from the socket's send buffer.
*/
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
- unsigned int __nocast priority)
+ gfp_t priority)
{
if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
struct sk_buff * skb = alloc_skb(size, priority);
@@ -769,7 +861,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
* Allocate a skb from the socket's receive buffer.
*/
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
- unsigned int __nocast priority)
+ gfp_t priority)
{
if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
struct sk_buff *skb = alloc_skb(size, priority);
@@ -784,7 +876,7 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
/*
* Allocate a memory block from the socket's option memory buffer.
*/
-void *sock_kmalloc(struct sock *sk, int size, unsigned int __nocast priority)
+void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
if ((unsigned)size <= sysctl_optmem_max &&
atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
@@ -848,7 +940,7 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
int noblock, int *errcode)
{
struct sk_buff *skb;
- unsigned int gfp_mask;
+ gfp_t gfp_mask;
long timeo;
int err;
@@ -1353,11 +1445,7 @@ void sk_common_release(struct sock *sk)
xfrm_sk_free_policy(sk);
-#ifdef INET_REFCNT_DEBUG
- if (atomic_read(&sk->sk_refcnt) != 1)
- printk(KERN_DEBUG "Destruction of the socket %p delayed, c=%d\n",
- sk, atomic_read(&sk->sk_refcnt));
-#endif
+ sk_refcnt_debug_release(sk);
sock_put(sk);
}
@@ -1368,7 +1456,8 @@ static LIST_HEAD(proto_list);
int proto_register(struct proto *prot, int alloc_slab)
{
- char *request_sock_slab_name;
+ char *request_sock_slab_name = NULL;
+ char *timewait_sock_slab_name;
int rc = -ENOBUFS;
if (alloc_slab) {
@@ -1399,6 +1488,23 @@ int proto_register(struct proto *prot, int alloc_slab)
goto out_free_request_sock_slab_name;
}
}
+
+ if (prot->twsk_obj_size) {
+ static const char mask[] = "tw_sock_%s";
+
+ timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+
+ if (timewait_sock_slab_name == NULL)
+ goto out_free_request_sock_slab;
+
+ sprintf(timewait_sock_slab_name, mask, prot->name);
+ prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name,
+ prot->twsk_obj_size,
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (prot->twsk_slab == NULL)
+ goto out_free_timewait_sock_slab_name;
+ }
}
write_lock(&proto_list_lock);
@@ -1407,6 +1513,13 @@ int proto_register(struct proto *prot, int alloc_slab)
rc = 0;
out:
return rc;
+out_free_timewait_sock_slab_name:
+ kfree(timewait_sock_slab_name);
+out_free_request_sock_slab:
+ if (prot->rsk_prot && prot->rsk_prot->slab) {
+ kmem_cache_destroy(prot->rsk_prot->slab);
+ prot->rsk_prot->slab = NULL;
+ }
out_free_request_sock_slab_name:
kfree(request_sock_slab_name);
out_free_sock_slab:
@@ -1420,6 +1533,8 @@ EXPORT_SYMBOL(proto_register);
void proto_unregister(struct proto *prot)
{
write_lock(&proto_list_lock);
+ list_del(&prot->node);
+ write_unlock(&proto_list_lock);
if (prot->slab != NULL) {
kmem_cache_destroy(prot->slab);
@@ -1434,8 +1549,13 @@ void proto_unregister(struct proto *prot)
prot->rsk_prot->slab = NULL;
}
- list_del(&prot->node);
- write_unlock(&proto_list_lock);
+ if (prot->twsk_slab != NULL) {
+ const char *name = kmem_cache_name(prot->twsk_slab);
+
+ kmem_cache_destroy(prot->twsk_slab);
+ kfree(name);
+ prot->twsk_slab = NULL;
+ }
}
EXPORT_SYMBOL(proto_unregister);
@@ -1602,8 +1722,8 @@ EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
-#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_optmem_max);
+#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 8f817ad9f54..2f278c8e474 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -9,23 +9,18 @@
#include <linux/sysctl.h>
#include <linux/config.h>
#include <linux/module.h>
+#include <linux/socket.h>
+#include <net/sock.h>
#ifdef CONFIG_SYSCTL
extern int netdev_max_backlog;
-extern int netdev_budget;
extern int weight_p;
-extern int net_msg_cost;
-extern int net_msg_burst;
extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;
-extern __u32 sysctl_wmem_default;
-extern __u32 sysctl_rmem_default;
extern int sysctl_core_destroy_delay;
-extern int sysctl_optmem_max;
-extern int sysctl_somaxconn;
#ifdef CONFIG_NET_DIVERT
extern char sysctl_divert_version[];
diff --git a/net/core/utils.c b/net/core/utils.c
index 88eb8b68e26..7b5970fc9e4 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -16,7 +16,9 @@
#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
+#include <linux/inet.h>
#include <linux/mm.h>
+#include <linux/net.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/random.h>
diff --git a/net/core/wireless.c b/net/core/wireless.c
index 3ff5639c0b7..271ddb35b0b 100644
--- a/net/core/wireless.c
+++ b/net/core/wireless.c
@@ -58,6 +58,13 @@
* o Add wmb() in iw_handler_set_spy() for non-coherent archs/cpus
* Based on patch from Pavel Roskin <proski@gnu.org> :
* o Fix kernel data leak to user space in private handler handling
+ *
+ * v7 - 18.3.05 - Jean II
+ * o Remove (struct iw_point *)->pointer from events and streams
+ * o Remove spy_offset from struct iw_handler_def
+ * o Start deprecating dev->get_wireless_stats, output a warning
+ * o If IW_QUAL_DBM is set, show dBm values in /proc/net/wireless
+ * o Don't lose INVALID/DBM flags when clearing UPDATED flags (iwstats)
*/
/***************************** INCLUDES *****************************/
@@ -446,10 +453,19 @@ static inline struct iw_statistics *get_wireless_stats(struct net_device *dev)
(dev->wireless_handlers->get_wireless_stats != NULL))
return dev->wireless_handlers->get_wireless_stats(dev);
- /* Old location, will be phased out in next WE */
- return (dev->get_wireless_stats ?
- dev->get_wireless_stats(dev) :
- (struct iw_statistics *) NULL);
+ /* Old location, field to be removed in next WE */
+ if(dev->get_wireless_stats) {
+ static int printed_message;
+
+ if (!printed_message++)
+ printk(KERN_DEBUG "%s (WE) : Driver using old /proc/net/wireless support, please fix driver !\n",
+ dev->name);
+
+ return dev->get_wireless_stats(dev);
+ }
+
+ /* Not found */
+ return (struct iw_statistics *) NULL;
}
/* ---------------------------------------------------------------- */
@@ -541,16 +557,18 @@ static __inline__ void wireless_seq_printf_stats(struct seq_file *seq,
dev->name, stats->status, stats->qual.qual,
stats->qual.updated & IW_QUAL_QUAL_UPDATED
? '.' : ' ',
- ((__u8) stats->qual.level),
+ ((__s32) stats->qual.level) -
+ ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0),
stats->qual.updated & IW_QUAL_LEVEL_UPDATED
? '.' : ' ',
- ((__u8) stats->qual.noise),
+ ((__s32) stats->qual.noise) -
+ ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0),
stats->qual.updated & IW_QUAL_NOISE_UPDATED
? '.' : ' ',
stats->discard.nwid, stats->discard.code,
stats->discard.fragment, stats->discard.retries,
stats->discard.misc, stats->miss.beacon);
- stats->qual.updated = 0;
+ stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
}
}
@@ -571,10 +589,6 @@ static int wireless_seq_show(struct seq_file *seq, void *v)
return 0;
}
-extern void *dev_seq_start(struct seq_file *seq, loff_t *pos);
-extern void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos);
-extern void dev_seq_stop(struct seq_file *seq, void *v);
-
static struct seq_operations wireless_seq_ops = {
.start = dev_seq_start,
.next = dev_seq_next,
@@ -597,6 +611,7 @@ static struct file_operations wireless_seq_fops = {
int __init wireless_proc_init(void)
{
+ /* Create /proc/net/wireless entry */
if (!proc_net_fops_create("wireless", S_IRUGO, &wireless_seq_fops))
return -ENOMEM;
@@ -631,9 +646,9 @@ static inline int dev_iwstats(struct net_device *dev, struct ifreq *ifr)
sizeof(struct iw_statistics)))
return -EFAULT;
- /* Check if we need to clear the update flag */
+ /* Check if we need to clear the updated flag */
if(wrq->u.data.flags != 0)
- stats->qual.updated = 0;
+ stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
return 0;
} else
return -EOPNOTSUPP;
@@ -1144,8 +1159,8 @@ static inline void rtmsg_iwinfo(struct net_device * dev,
kfree_skb(skb);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_LINK;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = RTNLGRP_LINK;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_ATOMIC);
}
#endif /* WE_EVENT_NETLINK */
@@ -1165,10 +1180,11 @@ void wireless_send_event(struct net_device * dev,
struct iw_event *event; /* Mallocated whole event */
int event_len; /* Its size */
int hdr_len; /* Size of the event header */
+ int wrqu_off = 0; /* Offset in wrqu */
/* Don't "optimise" the following variable, it will crash */
unsigned cmd_index; /* *MUST* be unsigned */
- /* Get the description of the IOCTL */
+ /* Get the description of the Event */
if(cmd <= SIOCIWLAST) {
cmd_index = cmd - SIOCIWFIRST;
if(cmd_index < standard_ioctl_num)
@@ -1211,6 +1227,8 @@ void wireless_send_event(struct net_device * dev,
/* Calculate extra_len - extra is NULL for restricted events */
if(extra != NULL)
extra_len = wrqu->data.length * descr->token_size;
+ /* Always at an offset in wrqu */
+ wrqu_off = IW_EV_POINT_OFF;
#ifdef WE_EVENT_DEBUG
printk(KERN_DEBUG "%s (WE) : Event 0x%04X, tokens %d, extra_len %d\n", dev->name, cmd, wrqu->data.length, extra_len);
#endif /* WE_EVENT_DEBUG */
@@ -1221,7 +1239,7 @@ void wireless_send_event(struct net_device * dev,
event_len = hdr_len + extra_len;
#ifdef WE_EVENT_DEBUG
- printk(KERN_DEBUG "%s (WE) : Event 0x%04X, hdr_len %d, event_len %d\n", dev->name, cmd, hdr_len, event_len);
+ printk(KERN_DEBUG "%s (WE) : Event 0x%04X, hdr_len %d, wrqu_off %d, event_len %d\n", dev->name, cmd, hdr_len, wrqu_off, event_len);
#endif /* WE_EVENT_DEBUG */
/* Create temporary buffer to hold the event */
@@ -1232,7 +1250,7 @@ void wireless_send_event(struct net_device * dev,
/* Fill event */
event->len = event_len;
event->cmd = cmd;
- memcpy(&event->u, wrqu, hdr_len - IW_EV_LCP_LEN);
+ memcpy(&event->u, ((char *) wrqu) + wrqu_off, hdr_len - IW_EV_LCP_LEN);
if(extra != NULL)
memcpy(((char *) event) + hdr_len, extra, extra_len);
@@ -1253,7 +1271,7 @@ void wireless_send_event(struct net_device * dev,
* Now, the driver can delegate this task to Wireless Extensions.
* It needs to use those standard spy iw_handler in struct iw_handler_def,
* push data to us via wireless_spy_update() and include struct iw_spy_data
- * in its private part (and advertise it in iw_handler_def->spy_offset).
+ * in its private part (and export it in net_device->wireless_data->spy_data).
* One of the main advantage of centralising spy support here is that
* it becomes much easier to improve and extend it without having to touch
* the drivers. One example is the addition of the Spy-Threshold events.
@@ -1270,10 +1288,7 @@ static inline struct iw_spy_data * get_spydata(struct net_device *dev)
/* This is the new way */
if(dev->wireless_data)
return(dev->wireless_data->spy_data);
-
- /* This is the old way. Doesn't work for multi-headed drivers.
- * It will be removed in the next version of WE. */
- return (dev->priv + dev->wireless_handlers->spy_offset);
+ return NULL;
}
/*------------------------------------------------------------------*/
@@ -1288,10 +1303,6 @@ int iw_handler_set_spy(struct net_device * dev,
struct iw_spy_data * spydata = get_spydata(dev);
struct sockaddr * address = (struct sockaddr *) extra;
- if(!dev->wireless_data)
- /* Help user know that driver needs updating */
- printk(KERN_DEBUG "%s (WE) : Driver using old/buggy spy support, please fix driver !\n",
- dev->name);
/* Make sure driver is not buggy or using the old API */
if(!spydata)
return -EOPNOTSUPP;
@@ -1322,7 +1333,7 @@ int iw_handler_set_spy(struct net_device * dev,
sizeof(struct iw_quality) * IW_MAX_SPY);
#ifdef WE_SPY_DEBUG
- printk(KERN_DEBUG "iw_handler_set_spy() : offset %ld, spydata %p, num %d\n", dev->wireless_handlers->spy_offset, spydata, wrqu->data.length);
+ printk(KERN_DEBUG "iw_handler_set_spy() : wireless_data %p, spydata %p, num %d\n", dev->wireless_data, spydata, wrqu->data.length);
for (i = 0; i < wrqu->data.length; i++)
printk(KERN_DEBUG
"%02X:%02X:%02X:%02X:%02X:%02X \n",
@@ -1375,7 +1386,7 @@ int iw_handler_get_spy(struct net_device * dev,
sizeof(struct iw_quality) * spydata->spy_number);
/* Reset updated flags. */
for(i = 0; i < spydata->spy_number; i++)
- spydata->spy_stat[i].updated = 0;
+ spydata->spy_stat[i].updated &= ~IW_QUAL_ALL_UPDATED;
return 0;
}
@@ -1490,7 +1501,7 @@ void wireless_spy_update(struct net_device * dev,
return;
#ifdef WE_SPY_DEBUG
- printk(KERN_DEBUG "wireless_spy_update() : offset %ld, spydata %p, address %02X:%02X:%02X:%02X:%02X:%02X\n", dev->wireless_handlers->spy_offset, spydata, address[0], address[1], address[2], address[3], address[4], address[5]);
+ printk(KERN_DEBUG "wireless_spy_update() : wireless_data %p, spydata %p, address %02X:%02X:%02X:%02X:%02X:%02X\n", dev->wireless_data, spydata, address[0], address[1], address[2], address[3], address[4], address[5]);
#endif /* WE_SPY_DEBUG */
/* Update all records that match */
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
new file mode 100644
index 00000000000..187ac182e24
--- /dev/null
+++ b/net/dccp/Kconfig
@@ -0,0 +1,50 @@
+menu "DCCP Configuration (EXPERIMENTAL)"
+ depends on INET && EXPERIMENTAL
+
+config IP_DCCP
+ tristate "The DCCP Protocol (EXPERIMENTAL)"
+ ---help---
+ Datagram Congestion Control Protocol
+
+ From draft-ietf-dccp-spec-11 <http://www.icir.org/kohler/dcp/draft-ietf-dccp-spec-11.txt>.
+
+ The Datagram Congestion Control Protocol (DCCP) is a transport
+ protocol that implements bidirectional, unicast connections of
+ congestion-controlled, unreliable datagrams. It should be suitable
+ for use by applications such as streaming media, Internet telephony,
+ and on-line games
+
+ To compile this protocol support as a module, choose M here: the
+ module will be called dccp.
+
+ If in doubt, say N.
+
+config INET_DCCP_DIAG
+ depends on IP_DCCP && INET_DIAG
+ def_tristate y if (IP_DCCP = y && INET_DIAG = y)
+ def_tristate m
+
+source "net/dccp/ccids/Kconfig"
+
+menu "DCCP Kernel Hacking"
+ depends on IP_DCCP && DEBUG_KERNEL=y
+
+config IP_DCCP_DEBUG
+ bool "DCCP debug messages"
+ ---help---
+ Only use this if you're hacking DCCP.
+
+ Just say N.
+
+config IP_DCCP_UNLOAD_HACK
+ depends on IP_DCCP=m && IP_DCCP_CCID3=m
+ bool "DCCP control sock unload hack"
+ ---help---
+ Enable this to be able to unload the dccp module when it
+ has only one refcount held, the control sock one. Just execute
+ "rmmod dccp_ccid3 dccp"
+
+ Just say N.
+endmenu
+
+endmenu
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
new file mode 100644
index 00000000000..344a8da153f
--- /dev/null
+++ b/net/dccp/Makefile
@@ -0,0 +1,12 @@
+obj-$(CONFIG_IP_DCCP) += dccp.o
+
+dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \
+ timer.o
+
+dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o
+
+obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
+
+dccp_diag-y := diag.o
+
+obj-y += ccids/
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
new file mode 100644
index 00000000000..c9a62cca22f
--- /dev/null
+++ b/net/dccp/ackvec.c
@@ -0,0 +1,419 @@
+/*
+ * net/dccp/ackvec.c
+ *
+ * An implementation of the DCCP protocol
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License;
+ */
+
+#include "ackvec.h"
+#include "dccp.h"
+
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+
+#include <net/sock.h>
+
+int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
+ int len = av->dccpav_vec_len + 2;
+ struct timeval now;
+ u32 elapsed_time;
+ unsigned char *to, *from;
+
+ dccp_timestamp(sk, &now);
+ elapsed_time = timeval_delta(&now, &av->dccpav_time) / 10;
+
+ if (elapsed_time != 0)
+ dccp_insert_option_elapsed_time(sk, skb, elapsed_time);
+
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
+ return -1;
+
+ /*
+ * XXX: now we have just one ack vector sent record, so
+ * we have to wait for it to be cleared.
+ *
+ * Of course this is not acceptable, but this is just for
+ * basic testing now.
+ */
+ if (av->dccpav_ack_seqno != DCCP_MAX_SEQNO + 1)
+ return -1;
+
+ DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+
+ to = skb_push(skb, len);
+ *to++ = DCCPO_ACK_VECTOR_0;
+ *to++ = len;
+
+ len = av->dccpav_vec_len;
+ from = av->dccpav_buf + av->dccpav_buf_head;
+
+ /* Check if buf_head wraps */
+ if (av->dccpav_buf_head + len > av->dccpav_vec_len) {
+ const u32 tailsize = (av->dccpav_vec_len - av->dccpav_buf_head);
+
+ memcpy(to, from, tailsize);
+ to += tailsize;
+ len -= tailsize;
+ from = av->dccpav_buf;
+ }
+
+ memcpy(to, from, len);
+ /*
+ * From draft-ietf-dccp-spec-11.txt:
+ *
+ * For each acknowledgement it sends, the HC-Receiver will add an
+ * acknowledgement record. ack_seqno will equal the HC-Receiver
+ * sequence number it used for the ack packet; ack_ptr will equal
+ * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
+ * equal buf_nonce.
+ *
+ * This implemention uses just one ack record for now.
+ */
+ av->dccpav_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+ av->dccpav_ack_ptr = av->dccpav_buf_head;
+ av->dccpav_ack_ackno = av->dccpav_buf_ackno;
+ av->dccpav_ack_nonce = av->dccpav_buf_nonce;
+ av->dccpav_sent_len = av->dccpav_vec_len;
+
+ dccp_pr_debug("%sACK Vector 0, len=%d, ack_seqno=%llu, "
+ "ack_ackno=%llu\n",
+ debug_prefix, av->dccpav_sent_len,
+ (unsigned long long)av->dccpav_ack_seqno,
+ (unsigned long long)av->dccpav_ack_ackno);
+ return -1;
+}
+
+/* Allocate an ack vector with a @len byte circular buffer; NULL on failure. */
+struct dccp_ackvec *dccp_ackvec_alloc(const unsigned int len,
+				      const gfp_t priority)
+{
+	struct dccp_ackvec *av = kmalloc(sizeof(*av) + len, priority);
+
+	if (av != NULL) {
+		av->dccpav_buf_len = len;
+		av->dccpav_buf_head = av->dccpav_buf_tail = len - 1;
+		av->dccpav_buf_ackno = av->dccpav_ack_ackno = ~0LLU;
+		/* DCCP_MAX_SEQNO + 1 is this file's "no ack vector sent" mark */
+		av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1;
+		av->dccpav_buf_nonce = av->dccpav_ack_nonce = 0;
+		av->dccpav_ack_ptr = 0;
+		av->dccpav_time.tv_sec = 0;
+		av->dccpav_time.tv_usec = 0;
+		av->dccpav_sent_len = av->dccpav_vec_len = 0;
+	}
+	return av;
+}
+
+void dccp_ackvec_free(struct dccp_ackvec *av)
+{
+ kfree(av);
+}
+
+static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
+ const unsigned int index)
+{
+ return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK;
+}
+
+static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
+ const unsigned int index)
+{
+ return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK;
+}
+
+/*
+ * Move buf_head back @packets slots: the new head gets @state and the
+ * skipped slots become "not received" bytes with run length 0, which
+ * simplifies updates if one arrives later. 0 on success, else -ENOBUFS.
+ */
+static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
+						 const unsigned int packets,
+						 const unsigned char state)
+{
+	unsigned int gap;
+	signed long new_head;
+
+	if (av->dccpav_vec_len + packets > av->dccpav_buf_len)
+		return -ENOBUFS;
+
+	gap	 = packets - 1;
+	/* Subtract as signed: the unsigned difference is never < 0 on LP64 */
+	new_head = (signed long)av->dccpav_buf_head - (signed long)packets;
+
+	if (new_head < 0) {
+		if (gap > 0) {
+			memset(av->dccpav_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED,
+			       gap + new_head + 1);
+			/* What is left goes after the wrapped head slot */
+			gap = -new_head - 1;
+		}
+		new_head += av->dccpav_buf_len;
+	}
+	av->dccpav_buf_head = new_head;
+
+	if (gap > 0)
+		memset(av->dccpav_buf + av->dccpav_buf_head + 1,
+		       DCCP_ACKVEC_STATE_NOT_RECEIVED, gap);
+	av->dccpav_buf[av->dccpav_buf_head] = state;
+	av->dccpav_vec_len += packets;
+	return 0;
+}
+
+/* Record @state for @ackno per draft-ietf-dccp-spec-11.txt Appendix A.
+ * Returns 0, -ENOBUFS when the buffer is full or -EILSEQ on a duplicate.
+ */
+int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
+		    const u64 ackno, const u8 state)
+{
+	/*
+	 * Check at the right places if the buffer is full, if it is, tell the
+	 * caller to start dropping packets till the HC-Sender acks our ACK
+	 * vectors, when we will free up space in dccpav_buf.
+	 *
+	 * We may well decide to do buffer compression, etc, but for now let's
+	 * just drop.
+	 *
+	 * From Appendix A:
+	 *
+	 *	Of course, the circular buffer may overflow, either when the
+	 *	HC-Sender is sending data at a very high rate, when the
+	 *	HC-Receiver's acknowledgements are not reaching the HC-Sender,
+	 *	or when the HC-Sender is forgetting to acknowledge those acks
+	 *	(so the HC-Receiver is unable to clean up old state). In this
+	 *	case, the HC-Receiver should either compress the buffer (by
+	 *	increasing run lengths when possible), transfer its state to
+	 *	a larger buffer, or, as a last resort, drop all received
+	 *	packets, without processing them whatsoever, until its buffer
+	 *	shrinks again.
+	 */
+
+	/* See if this is the first ackno being inserted */
+	if (av->dccpav_vec_len == 0) {
+		av->dccpav_buf[av->dccpav_buf_head] = state;
+		av->dccpav_vec_len = 1;
+	} else if (after48(ackno, av->dccpav_buf_ackno)) {
+		const u64 delta = dccp_delta_seqno(av->dccpav_buf_ackno,
+						   ackno);
+
+		/*
+		 * Look if the state of this packet is the same as the
+		 * previous ackno and if so if we can bump the head len.
+		 */
+		if (delta == 1 &&
+		    dccp_ackvec_state(av, av->dccpav_buf_head) == state &&
+		    (dccp_ackvec_len(av, av->dccpav_buf_head) <
+		     DCCP_ACKVEC_LEN_MASK))
+			av->dccpav_buf[av->dccpav_buf_head]++;
+		else if (dccp_ackvec_set_buf_head_state(av, delta, state))
+			return -ENOBUFS;
+	} else {
+		/*
+		 * A.1.2. Old Packets
+		 *
+		 *	When a packet with Sequence Number S arrives, and
+		 *	S <= buf_ackno, the HC-Receiver will scan the table
+		 *	for the byte corresponding to S. (Indexing structures
+		 *	could reduce the complexity of this scan.)
+		 */
+		u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno);
+		unsigned int index = av->dccpav_buf_head;
+
+		while (1) {
+			const u8 len = dccp_ackvec_len(av, index);
+			const u8 av_state = dccp_ackvec_state(av, index);
+			/*
+			 * valid packets not yet in dccpav_buf have a reserved
+			 * entry, with a len equal to 0.
+			 */
+			if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
+			    len == 0 && delta == 0) {
+				/* Found our reserved seat: store new state */
+				dccp_pr_debug("Found %llu reserved seat!\n",
+					      (unsigned long long)ackno);
+				av->dccpav_buf[index] = state;
+				goto out;
+			}
+			/* len == 0 means one packet */
+			if (delta < len + 1)
+				goto out_duplicate;
+
+			delta -= len + 1;
+			if (++index == av->dccpav_buf_len)
+				index = 0;
+		}
+	}
+
+	av->dccpav_buf_ackno = ackno;
+	dccp_timestamp(sk, &av->dccpav_time);
+out:
+	dccp_pr_debug("");
+	return 0;
+
+out_duplicate:
+	/* Duplicate packet */
+	dccp_pr_debug("Received a dup or already considered lost "
+		      "packet: %llu\n", (unsigned long long)ackno);
+	return -EILSEQ;
+}
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, int len)
+{
+ if (!dccp_debug)
+ return;
+
+ printk("ACK vector len=%d, ackno=%llu |", len,
+ (unsigned long long)ackno);
+
+ while (len--) {
+ const u8 state = (*vector & DCCP_ACKVEC_STATE_MASK) >> 6;
+ const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
+
+ printk("%d,%d|", state, rl);
+ ++vector;
+ }
+
+ printk("\n");
+}
+
+void dccp_ackvec_print(const struct dccp_ackvec *av)
+{
+ dccp_ackvector_print(av->dccpav_buf_ackno,
+ av->dccpav_buf + av->dccpav_buf_head,
+ av->dccpav_vec_len);
+}
+#endif
+
+static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av)
+{
+ /*
+ * As we're keeping track of the ack vector size (dccpav_vec_len) and
+ * the sent ack vector size (dccpav_sent_len) we don't need
+ * dccpav_buf_tail at all, but keep this code here as in the future
+ * we'll implement a vector of ack records, as suggested in
+ * draft-ietf-dccp-spec-11.txt Appendix A. -acme
+ */
+#if 0
+ av->dccpav_buf_tail = av->dccpav_ack_ptr + 1;
+ if (av->dccpav_buf_tail >= av->dccpav_vec_len)
+ av->dccpav_buf_tail -= av->dccpav_vec_len;
+#endif
+ av->dccpav_vec_len -= av->dccpav_sent_len;
+}
+
+void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
+ const u64 ackno)
+{
+ /* Check if we actually sent an ACK vector */
+ if (av->dccpav_ack_seqno == DCCP_MAX_SEQNO + 1)
+ return;
+
+ if (ackno == av->dccpav_ack_seqno) {
+#ifdef CONFIG_IP_DCCP_DEBUG
+ struct dccp_sock *dp = dccp_sk(sk);
+ const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
+ "CLIENT rx ack: " : "server rx ack: ";
+#endif
+ dccp_pr_debug("%sACK packet 0, len=%d, ack_seqno=%llu, "
+ "ack_ackno=%llu, ACKED!\n",
+ debug_prefix, 1,
+ (unsigned long long)av->dccpav_ack_seqno,
+ (unsigned long long)av->dccpav_ack_ackno);
+ dccp_ackvec_trow_away_ack_record(av);
+ av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1;
+ }
+}
+
+static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
+ struct sock *sk, u64 ackno,
+ const unsigned char len,
+ const unsigned char *vector)
+{
+ unsigned char i;
+
+ /* Check if we actually sent an ACK vector */
+ if (av->dccpav_ack_seqno == DCCP_MAX_SEQNO + 1)
+ return;
+ /*
+ * We're in the receiver half connection, so if we received an ACK
+ * vector with an ackno (e.g. 50) before dccpav_ack_seqno (e.g. 52),
+ * we're not interested.
+ *
+ * Extra explanation with example:
+ *
+ * if we received an ACK vector with ackno 50, it can only be acking
+ * 50, 49, 48, etc, not 52 (the seqno for the ACK vector we sent).
+ */
+ /* dccp_pr_debug("is %llu < %llu? ", ackno, av->dccpav_ack_seqno); */
+ if (before48(ackno, av->dccpav_ack_seqno)) {
+ /* dccp_pr_debug_cat("yes\n"); */
+ return;
+ }
+ /* dccp_pr_debug_cat("no\n"); */
+
+ i = len;
+ while (i--) {
+ const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
+ u64 ackno_end_rl;
+
+ dccp_set_seqno(&ackno_end_rl, ackno - rl);
+
+ /*
+ * dccp_pr_debug("is %llu <= %llu <= %llu? ", ackno_end_rl,
+ * av->dccpav_ack_seqno, ackno);
+ */
+ if (between48(av->dccpav_ack_seqno, ackno_end_rl, ackno)) {
+ const u8 state = (*vector &
+ DCCP_ACKVEC_STATE_MASK) >> 6;
+ /* dccp_pr_debug_cat("yes\n"); */
+
+ if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) {
+#ifdef CONFIG_IP_DCCP_DEBUG
+ struct dccp_sock *dp = dccp_sk(sk);
+ const char *debug_prefix =
+ dp->dccps_role == DCCP_ROLE_CLIENT ?
+ "CLIENT rx ack: " : "server rx ack: ";
+#endif
+ dccp_pr_debug("%sACK vector 0, len=%d, "
+ "ack_seqno=%llu, ack_ackno=%llu, "
+ "ACKED!\n",
+ debug_prefix, len,
+ (unsigned long long)
+ av->dccpav_ack_seqno,
+ (unsigned long long)
+ av->dccpav_ack_ackno);
+ dccp_ackvec_trow_away_ack_record(av);
+ }
+ /*
+ * If dccpav_ack_seqno was not received, no problem
+ * we'll send another ACK vector.
+ */
+ av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1;
+ break;
+ }
+ /* dccp_pr_debug_cat("no\n"); */
+
+ dccp_set_seqno(&ackno, ackno_end_rl - 1);
+ ++vector;
+ }
+}
+
+int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
+ const u8 opt, const u8 *value, const u8 len)
+{
+ if (len > DCCP_MAX_ACKVEC_LEN)
+ return -1;
+
+ /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */
+ dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk,
+ DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ len, value);
+ return 0;
+}
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
new file mode 100644
index 00000000000..d0fd6c60c57
--- /dev/null
+++ b/net/dccp/ackvec.h
@@ -0,0 +1,133 @@
+#ifndef _ACKVEC_H
+#define _ACKVEC_H
+/*
+ * net/dccp/ackvec.h
+ *
+ * An implementation of the DCCP protocol
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/compiler.h>
+#include <linux/time.h>
+#include <linux/types.h>
+
+/* Read about the ECN nonce to see why it is 253 */
+#define DCCP_MAX_ACKVEC_LEN 253
+
+#define DCCP_ACKVEC_STATE_RECEIVED 0
+#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6)
+#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6)
+
+#define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */
+#define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */
+
+/** struct dccp_ackvec - ack vector
+ *
+ * This data structure is the one defined in the DCCP draft
+ * Appendix A.
+ *
+ * @dccpav_buf_head - circular buffer head
+ * @dccpav_buf_tail - circular buffer tail
+ * @dccpav_buf_ackno - ack # of the most recent packet acknowledgeable in the
+ * buffer (i.e. %dccpav_buf_head)
+ * @dccpav_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
+ * by the buffer with State 0
+ *
+ * Additionally, the HC-Receiver must keep some information about the
+ * Ack Vectors it has recently sent. For each packet sent carrying an
+ * Ack Vector, it remembers four variables:
+ *
+ * @dccpav_ack_seqno - the Sequence Number used for the packet
+ * (HC-Receiver seqno)
+ * @dccpav_ack_ptr - the value of buf_head at the time of acknowledgement.
+ * @dccpav_ack_ackno - the Acknowledgement Number used for the packet
+ * (HC-Sender seqno)
+ * @dccpav_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
+ *
+ * @dccpav_buf_len - circular buffer length
+ * @dccpav_time - timestamp taken when dccpav_buf_ackno was last updated
+ * @dccpav_buf - circular buffer of acknowledgeable packets
+ */
+struct dccp_ackvec {
+ unsigned int dccpav_buf_head;
+ unsigned int dccpav_buf_tail;
+ u64 dccpav_buf_ackno;
+ u64 dccpav_ack_seqno;
+ u64 dccpav_ack_ackno;
+ unsigned int dccpav_ack_ptr;
+ unsigned int dccpav_sent_len;
+ unsigned int dccpav_vec_len;
+ unsigned int dccpav_buf_len;
+ struct timeval dccpav_time;
+ u8 dccpav_buf_nonce;
+ u8 dccpav_ack_nonce;
+ u8 dccpav_buf[0];
+};
+
+struct sock;
+struct sk_buff;
+
+#ifdef CONFIG_IP_DCCP_ACKVEC
+extern struct dccp_ackvec *dccp_ackvec_alloc(unsigned int len,
+ const gfp_t priority);
+extern void dccp_ackvec_free(struct dccp_ackvec *av);
+
+extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
+ const u64 ackno, const u8 state);
+
+extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
+ struct sock *sk, const u64 ackno);
+extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
+ const u8 opt, const u8 *value, const u8 len);
+
+extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb);
+
+static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
+{
+ return av->dccpav_sent_len != av->dccpav_vec_len;
+}
+#else /* CONFIG_IP_DCCP_ACKVEC */
+static inline struct dccp_ackvec *dccp_ackvec_alloc(unsigned int len,
+ const gfp_t priority)
+{
+ return NULL;
+}
+
+static inline void dccp_ackvec_free(struct dccp_ackvec *av)
+{
+}
+
+static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
+ const u64 ackno, const u8 state)
+{
+ return -1;
+}
+
+static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
+ struct sock *sk, const u64 ackno)
+{
+}
+
+static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
+ const u8 opt, const u8 *value, const u8 len)
+{
+ return -1;
+}
+
+static inline int dccp_insert_option_ackvec(const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ return -1;
+}
+
+static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
+{
+ return 0;
+}
+#endif /* CONFIG_IP_DCCP_ACKVEC */
+#endif /* _ACKVEC_H */
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
new file mode 100644
index 00000000000..9d8fc0e289e
--- /dev/null
+++ b/net/dccp/ccid.c
@@ -0,0 +1,139 @@
+/*
+ * net/dccp/ccid.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * CCID infrastructure
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "ccid.h"
+
+static struct ccid *ccids[CCID_MAX + 1];	/* ccid_id spans 0..CCID_MAX */
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
+static atomic_t ccids_lockct = ATOMIC_INIT(0);
+static DEFINE_SPINLOCK(ccids_lock);
+
+/*
+ * The strategy is: modifications to the ccids vector are short, do not sleep
+ * and are very rare, but read access should be free of any exclusive locks.
+ */
+static void ccids_write_lock(void)
+{
+ spin_lock(&ccids_lock);
+ while (atomic_read(&ccids_lockct) != 0) {
+ spin_unlock(&ccids_lock);
+ yield();
+ spin_lock(&ccids_lock);
+ }
+}
+
+static inline void ccids_write_unlock(void)
+{
+ spin_unlock(&ccids_lock);
+}
+
+static inline void ccids_read_lock(void)
+{
+ atomic_inc(&ccids_lockct);
+ spin_unlock_wait(&ccids_lock);
+}
+
+static inline void ccids_read_unlock(void)
+{
+ atomic_dec(&ccids_lockct);
+}
+
+#else
+#define ccids_write_lock() do { } while(0)
+#define ccids_write_unlock() do { } while(0)
+#define ccids_read_lock() do { } while(0)
+#define ccids_read_unlock() do { } while(0)
+#endif
+
+int ccid_register(struct ccid *ccid)
+{
+ int err;
+
+ if (ccid->ccid_init == NULL)
+ return -1;
+
+ ccids_write_lock();
+ err = -EEXIST;
+ if (ccids[ccid->ccid_id] == NULL) {
+ ccids[ccid->ccid_id] = ccid;
+ err = 0;
+ }
+ ccids_write_unlock();
+ if (err == 0)
+ pr_info("CCID: Registered CCID %d (%s)\n",
+ ccid->ccid_id, ccid->ccid_name);
+ return err;
+}
+
+EXPORT_SYMBOL_GPL(ccid_register);
+
+int ccid_unregister(struct ccid *ccid)
+{
+ ccids_write_lock();
+ ccids[ccid->ccid_id] = NULL;
+ ccids_write_unlock();
+ pr_info("CCID: Unregistered CCID %d (%s)\n",
+ ccid->ccid_id, ccid->ccid_name);
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(ccid_unregister);
+
+struct ccid *ccid_init(unsigned char id, struct sock *sk)
+{
+ struct ccid *ccid;
+
+#ifdef CONFIG_KMOD
+ if (ccids[id] == NULL)
+ request_module("net-dccp-ccid-%d", id);
+#endif
+ ccids_read_lock();
+
+ ccid = ccids[id];
+ if (ccid == NULL)
+ goto out;
+
+ if (!try_module_get(ccid->ccid_owner))
+ goto out_err;
+
+ if (ccid->ccid_init(sk) != 0)
+ goto out_module_put;
+out:
+ ccids_read_unlock();
+ return ccid;
+out_module_put:
+ module_put(ccid->ccid_owner);
+out_err:
+ ccid = NULL;
+ goto out;
+}
+
+EXPORT_SYMBOL_GPL(ccid_init);
+
+void ccid_exit(struct ccid *ccid, struct sock *sk)
+{
+ if (ccid == NULL)
+ return;
+
+ ccids_read_lock();
+
+ if (ccids[ccid->ccid_id] != NULL) {
+ if (ccid->ccid_exit != NULL)
+ ccid->ccid_exit(sk);
+ module_put(ccid->ccid_owner);
+ }
+
+ ccids_read_unlock();
+}
+
+EXPORT_SYMBOL_GPL(ccid_exit);
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
new file mode 100644
index 00000000000..c37eeeaf5c6
--- /dev/null
+++ b/net/dccp/ccid.h
@@ -0,0 +1,211 @@
+#ifndef _CCID_H
+#define _CCID_H
+/*
+ * net/dccp/ccid.h
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * CCID infrastructure
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <net/sock.h>
+#include <linux/compiler.h>
+#include <linux/dccp.h>
+#include <linux/list.h>
+#include <linux/module.h>
+
+#define CCID_MAX 255
+
+struct ccid {
+ unsigned char ccid_id;
+ const char *ccid_name;
+ struct module *ccid_owner;
+ int (*ccid_init)(struct sock *sk);
+ void (*ccid_exit)(struct sock *sk);
+ int (*ccid_hc_rx_init)(struct sock *sk);
+ int (*ccid_hc_tx_init)(struct sock *sk);
+ void (*ccid_hc_rx_exit)(struct sock *sk);
+ void (*ccid_hc_tx_exit)(struct sock *sk);
+ void (*ccid_hc_rx_packet_recv)(struct sock *sk,
+ struct sk_buff *skb);
+ int (*ccid_hc_rx_parse_options)(struct sock *sk,
+ unsigned char option,
+ unsigned char len, u16 idx,
+ unsigned char* value);
+ void (*ccid_hc_rx_insert_options)(struct sock *sk,
+ struct sk_buff *skb);
+ void (*ccid_hc_tx_insert_options)(struct sock *sk,
+ struct sk_buff *skb);
+ void (*ccid_hc_tx_packet_recv)(struct sock *sk,
+ struct sk_buff *skb);
+ int (*ccid_hc_tx_parse_options)(struct sock *sk,
+ unsigned char option,
+ unsigned char len, u16 idx,
+ unsigned char* value);
+ int (*ccid_hc_tx_send_packet)(struct sock *sk,
+ struct sk_buff *skb, int len);
+ void (*ccid_hc_tx_packet_sent)(struct sock *sk, int more,
+ int len);
+ void (*ccid_hc_rx_get_info)(struct sock *sk,
+ struct tcp_info *info);
+ void (*ccid_hc_tx_get_info)(struct sock *sk,
+ struct tcp_info *info);
+ int (*ccid_hc_rx_getsockopt)(struct sock *sk,
+ const int optname, int len,
+ u32 __user *optval,
+ int __user *optlen);
+ int (*ccid_hc_tx_getsockopt)(struct sock *sk,
+ const int optname, int len,
+ u32 __user *optval,
+ int __user *optlen);
+};
+
+extern int ccid_register(struct ccid *ccid);
+extern int ccid_unregister(struct ccid *ccid);
+
+extern struct ccid *ccid_init(unsigned char id, struct sock *sk);
+extern void ccid_exit(struct ccid *ccid, struct sock *sk);
+
+static inline void __ccid_get(struct ccid *ccid)
+{
+ __module_get(ccid->ccid_owner);
+}
+
+static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb, int len)
+{
+ int rc = 0;
+ if (ccid->ccid_hc_tx_send_packet != NULL)
+ rc = ccid->ccid_hc_tx_send_packet(sk, skb, len);
+ return rc;
+}
+
+static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
+ int more, int len)
+{
+ if (ccid->ccid_hc_tx_packet_sent != NULL)
+ ccid->ccid_hc_tx_packet_sent(sk, more, len);
+}
+
+static inline int ccid_hc_rx_init(struct ccid *ccid, struct sock *sk)
+{
+ int rc = 0;
+ if (ccid->ccid_hc_rx_init != NULL)
+ rc = ccid->ccid_hc_rx_init(sk);
+ return rc;
+}
+
+static inline int ccid_hc_tx_init(struct ccid *ccid, struct sock *sk)
+{
+ int rc = 0;
+ if (ccid->ccid_hc_tx_init != NULL)
+ rc = ccid->ccid_hc_tx_init(sk);
+ return rc;
+}
+
+static inline void ccid_hc_rx_exit(struct ccid *ccid, struct sock *sk)
+{
+ if (ccid != NULL && ccid->ccid_hc_rx_exit != NULL &&
+ dccp_sk(sk)->dccps_hc_rx_ccid_private != NULL)
+ ccid->ccid_hc_rx_exit(sk);
+}
+
+static inline void ccid_hc_tx_exit(struct ccid *ccid, struct sock *sk)
+{
+ if (ccid != NULL && ccid->ccid_hc_tx_exit != NULL &&
+ dccp_sk(sk)->dccps_hc_tx_ccid_private != NULL)
+ ccid->ccid_hc_tx_exit(sk);
+}
+
+static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (ccid->ccid_hc_rx_packet_recv != NULL)
+ ccid->ccid_hc_rx_packet_recv(sk, skb);
+}
+
+static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (ccid->ccid_hc_tx_packet_recv != NULL)
+ ccid->ccid_hc_tx_packet_recv(sk, skb);
+}
+
+static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
+ unsigned char option,
+ unsigned char len, u16 idx,
+ unsigned char* value)
+{
+ int rc = 0;
+ if (ccid->ccid_hc_tx_parse_options != NULL)
+ rc = ccid->ccid_hc_tx_parse_options(sk, option, len, idx,
+ value);
+ return rc;
+}
+
+static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
+ unsigned char option,
+ unsigned char len, u16 idx,
+ unsigned char* value)
+{
+ int rc = 0;
+ if (ccid->ccid_hc_rx_parse_options != NULL)
+ rc = ccid->ccid_hc_rx_parse_options(sk, option, len, idx, value);
+ return rc;
+}
+
+static inline void ccid_hc_tx_insert_options(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (ccid->ccid_hc_tx_insert_options != NULL)
+ ccid->ccid_hc_tx_insert_options(sk, skb);
+}
+
+static inline void ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (ccid->ccid_hc_rx_insert_options != NULL)
+ ccid->ccid_hc_rx_insert_options(sk, skb);
+}
+
+static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk,
+ struct tcp_info *info)
+{
+ if (ccid->ccid_hc_rx_get_info != NULL)
+ ccid->ccid_hc_rx_get_info(sk, info);
+}
+
+static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk,
+ struct tcp_info *info)
+{
+ if (ccid->ccid_hc_tx_get_info != NULL)
+ ccid->ccid_hc_tx_get_info(sk, info);
+}
+
+static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk,
+ const int optname, int len,
+ u32 __user *optval, int __user *optlen)
+{
+ int rc = -ENOPROTOOPT;
+ if (ccid->ccid_hc_rx_getsockopt != NULL)
+ rc = ccid->ccid_hc_rx_getsockopt(sk, optname, len,
+ optval, optlen);
+ return rc;
+}
+
+static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk,
+ const int optname, int len,
+ u32 __user *optval, int __user *optlen)
+{
+ int rc = -ENOPROTOOPT;
+ if (ccid->ccid_hc_tx_getsockopt != NULL)
+ rc = ccid->ccid_hc_tx_getsockopt(sk, optname, len,
+ optval, optlen);
+ return rc;
+}
+#endif /* _CCID_H */
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
new file mode 100644
index 00000000000..7684d83946a
--- /dev/null
+++ b/net/dccp/ccids/Kconfig
@@ -0,0 +1,29 @@
+menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
+ depends on IP_DCCP && EXPERIMENTAL
+
+config IP_DCCP_CCID3
+ tristate "CCID3 (TFRC) (EXPERIMENTAL)"
+ depends on IP_DCCP
+ ---help---
+ CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
+ rate-controlled congestion control mechanism. TFRC is designed to
+ be reasonably fair when competing for bandwidth with TCP-like flows,
+ where a flow is "reasonably fair" if its sending rate is generally
+ within a factor of two of the sending rate of a TCP flow under the
+ same conditions. However, TFRC has a much lower variation of
+ throughput over time compared with TCP, which makes CCID 3 more
+ suitable than CCID 2 for applications such as streaming media where a
+ relatively smooth sending rate is of importance.
+
+ CCID 3 is further described in [CCID 3 PROFILE]. The TFRC
+ congestion control algorithms were initially described in RFC 3448.
+
+ This text was extracted from draft-ietf-dccp-spec-11.txt.
+
+ If in doubt, say M.
+
+config IP_DCCP_TFRC_LIB
+ depends on IP_DCCP_CCID3
+ def_tristate IP_DCCP_CCID3
+
+endmenu
diff --git a/net/dccp/ccids/Makefile b/net/dccp/ccids/Makefile
new file mode 100644
index 00000000000..956f79f5074
--- /dev/null
+++ b/net/dccp/ccids/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o
+
+dccp_ccid3-y := ccid3.o
+
+obj-y += lib/
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
new file mode 100644
index 00000000000..aa68e0ab274
--- /dev/null
+++ b/net/dccp/ccids/ccid3.c
@@ -0,0 +1,1270 @@
+/*
+ * net/dccp/ccids/ccid3.c
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ *
+ * An implementation of the DCCP protocol
+ *
+ * This code has been developed by the University of Waikato WAND
+ * research group. For further information please see http://www.wand.net.nz/
+ *
+ * This code also uses code from Lulea University, rereleased as GPL by its
+ * authors:
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ * and to make it work as a loadable module in the DCCP stack written by
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+#include "../ccid.h"
+#include "../dccp.h"
+#include "lib/packet_history.h"
+#include "lib/loss_interval.h"
+#include "lib/tfrc.h"
+#include "ccid3.h"
+
+/*
+ * usecs_div - approximate (a * USEC_PER_SEC) / b using only 32 bit maths.
+ *
+ * Reason for maths here is to avoid 32 bit overflow when a is big: the
+ * USEC_PER_SEC factor is split into (USEC_PER_SEC / div), applied to a, and
+ * div, applied to b. div is the first step in the list below whose bound a
+ * stays under; 100000 is the fallback for anything larger.
+ * With this we get close to the limit.
+ *
+ * When b < 2 * div the division is skipped altogether and the scaled value
+ * of a is returned as is, so b == 0 never causes a division by zero.
+ */
+static inline u32 usecs_div(const u32 a, const u32 b)
+{
+ const u32 div = a < (UINT_MAX / (USEC_PER_SEC / 10)) ? 10 :
+ a < (UINT_MAX / (USEC_PER_SEC / 50)) ? 50 :
+ a < (UINT_MAX / (USEC_PER_SEC / 100)) ? 100 :
+ a < (UINT_MAX / (USEC_PER_SEC / 500)) ? 500 :
+ a < (UINT_MAX / (USEC_PER_SEC / 1000)) ? 1000 :
+ a < (UINT_MAX / (USEC_PER_SEC / 5000)) ? 5000 :
+ a < (UINT_MAX / (USEC_PER_SEC / 10000)) ? 10000 :
+ a < (UINT_MAX / (USEC_PER_SEC / 50000)) ? 50000 :
+ 100000;
+ const u32 tmp = a * (USEC_PER_SEC / div);
+ return (b >= 2 * div) ? tmp / (b / div) : tmp;
+}
+
+/* Debug switch tested by ccid3_pr_debug() before it prints anything */
+static int ccid3_debug;
+
+/*
+ * ccid3_pr_debug - printk(KERN_DEBUG ...) prefixed with the name of the
+ * calling function, emitted only while ccid3_debug is non-zero. Expands to
+ * nothing unless CCID3_DEBUG is defined.
+ */
+#ifdef CCID3_DEBUG
+#define ccid3_pr_debug(format, a...) \
+ do { if (ccid3_debug) \
+ printk(KERN_DEBUG "%s: " format, __FUNCTION__, ##a); \
+ } while (0)
+#else
+#define ccid3_pr_debug(format, a...)
+#endif
+
+/*
+ * History handles passed to the dccp_{tx,rx,li}_hist_* helpers used in this
+ * file whenever history entries are allocated or purged.
+ */
+static struct dccp_tx_hist *ccid3_tx_hist;
+static struct dccp_rx_hist *ccid3_rx_hist;
+static struct dccp_li_hist *ccid3_li_hist;
+
+/* Installed as the .ccid_init hook: nothing to set up, always returns 0. */
+static int ccid3_init(struct sock *sk)
+{
+ return 0;
+}
+
+/* Installed as the .ccid_exit hook: intentionally empty. */
+static void ccid3_exit(struct sock *sk)
+{
+}
+
+/* TFRC sender states */
+enum ccid3_hc_tx_states {
+ /* Initial state, set by ccid3_hc_tx_init() */
+ TFRC_SSTATE_NO_SENT = 1,
+ /* Entered from NO_SENT by ccid3_hc_tx_send_packet() */
+ TFRC_SSTATE_NO_FBACK,
+ /* Entered from NO_FBACK by ccid3_hc_tx_packet_recv() */
+ TFRC_SSTATE_FBACK,
+ /* Set by ccid3_hc_tx_exit() */
+ TFRC_SSTATE_TERM,
+};
+
+#ifdef CCID3_DEBUG
+/* Map a TFRC sender state to its short name for debug output. */
+static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
+{
+	static char *tx_state_names[] = {
+		[TFRC_SSTATE_NO_SENT]  = "NO_SENT",
+		[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
+		[TFRC_SSTATE_FBACK]    = "FBACK",
+		[TFRC_SSTATE_TERM]     = "TERM",
+	};
+	const char *name = tx_state_names[state];
+
+	return name;
+}
+#endif
+
+/*
+ * Move the TX half connection of @sk to @state. The transition is logged
+ * through ccid3_pr_debug() and a warning is raised when the socket is
+ * already in @state.
+ */
+static inline void ccid3_hc_tx_set_state(struct sock *sk,
+					 enum ccid3_hc_tx_states state)
+{
+	struct ccid3_hc_tx_sock *tx = ccid3_hc_tx_sk(sk);
+	const enum ccid3_hc_tx_states prev = tx->ccid3hctx_state;
+
+	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
+		       dccp_role(sk), sk, ccid3_tx_state_name(prev),
+		       ccid3_tx_state_name(state));
+	WARN_ON(prev == state);
+	tx->ccid3hctx_state = state;
+}
+
+/* Calculate new t_ipi (inter packet interval) by t_ipi = s / X_inst */
+static inline void ccid3_calc_new_t_ipi(struct ccid3_hc_tx_sock *hctx)
+{
+	/*
+	 * As long as no feedback has been received t_ipi is not derived
+	 * from the sending rate; it is set elsewhere for that state, so
+	 * leave it untouched here.
+	 */
+	if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK)
+		return;
+
+	hctx->ccid3hctx_t_ipi = usecs_div(hctx->ccid3hctx_s,
+					  hctx->ccid3hctx_x);
+}
+
+/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
+static inline void ccid3_calc_new_delta(struct ccid3_hc_tx_sock *hctx)
+{
+	const u32 half_ipi = hctx->ccid3hctx_t_ipi / 2;
+
+	hctx->ccid3hctx_delta = min_t(u32, half_ipi,
+				      TFRC_OPSYS_HALF_TIME_GRAN);
+}
+
+/*
+ * Update the allowed sending rate X:
+ *
+ *	If (p > 0)
+ *		x_calc = calcX(s, R, p);
+ *		X = max(min(X_calc, 2 * X_recv), s / t_mbi);
+ *	Else
+ *		If (now - tld >= R)
+ *			X = max(min(2 * X, 2 * X_recv), s / R);
+ *			tld = now;
+ */
+static void ccid3_hc_tx_update_x(struct sock *sk)
+{
+	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+	struct timeval now;
+
+	/* p is only fed to calcX when it is big enough to avoid large error */
+	if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) {
+		hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s,
+						     hctx->ccid3hctx_rtt,
+						     hctx->ccid3hctx_p);
+		hctx->ccid3hctx_x = max_t(u32,
+					  min_t(u32, hctx->ccid3hctx_x_calc,
+						2 * hctx->ccid3hctx_x_recv),
+					  (hctx->ccid3hctx_s /
+					   TFRC_MAX_BACK_OFF_TIME));
+		return;
+	}
+
+	/* No usable loss rate: at most double X, and only once per RTT */
+	dccp_timestamp(sk, &now);
+	if (timeval_delta(&now, &hctx->ccid3hctx_t_ld) < hctx->ccid3hctx_rtt)
+		return;
+
+	hctx->ccid3hctx_x = max_t(u32,
+				  min_t(u32, hctx->ccid3hctx_x_recv,
+					hctx->ccid3hctx_x) * 2,
+				  usecs_div(hctx->ccid3hctx_s,
+					    hctx->ccid3hctx_rtt));
+	hctx->ccid3hctx_t_ld = now;
+}
+
+/*
+ * No feedback timer handler; @data is the struct sock the timer belongs to.
+ *
+ * Runs under bh_lock_sock(). If the socket is currently owned by user
+ * context the timer is simply re-armed HZ / 5 jiffies ahead. Otherwise the
+ * rate is reduced according to the sender state and the timer is re-armed,
+ * except in TFRC_SSTATE_TERM or an unknown state, where nothing is done.
+ * Every path ends with bh_unlock_sock() and sock_put().
+ */
+static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+ unsigned long next_tmout = 0;
+ struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+ /* XXX: set some sensible MIB */
+ sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+ jiffies + HZ / 5);
+ goto out;
+ }
+
+ ccid3_pr_debug("%s, sk=%p, state=%s\n", dccp_role(sk), sk,
+ ccid3_tx_state_name(hctx->ccid3hctx_state));
+
+ switch (hctx->ccid3hctx_state) {
+ case TFRC_SSTATE_TERM:
+ goto out;
+ case TFRC_SSTATE_NO_FBACK:
+ /* Halve send rate */
+ hctx->ccid3hctx_x /= 2;
+ /* ... but never drop below s / TFRC_MAX_BACK_OFF_TIME */
+ if (hctx->ccid3hctx_x < (hctx->ccid3hctx_s /
+ TFRC_MAX_BACK_OFF_TIME))
+ hctx->ccid3hctx_x = (hctx->ccid3hctx_s /
+ TFRC_MAX_BACK_OFF_TIME);
+
+ ccid3_pr_debug("%s, sk=%p, state=%s, updated tx rate to %d "
+ "bytes/s\n",
+ dccp_role(sk), sk,
+ ccid3_tx_state_name(hctx->ccid3hctx_state),
+ hctx->ccid3hctx_x);
+ next_tmout = max_t(u32, 2 * usecs_div(hctx->ccid3hctx_s,
+ hctx->ccid3hctx_x),
+ TFRC_INITIAL_TIMEOUT);
+ /*
+ * FIXME - not sure above calculation is correct. See section
+ * 5 of CCID3 11 should adjust tx_t_ipi and double that to
+ * achieve it really
+ */
+ break;
+ case TFRC_SSTATE_FBACK:
+ /*
+ * Check if IDLE since last timeout and recv rate is less than
+ * 4 packets per RTT
+ *
+ * The rate is only cut in the opposite case, i.e. when the
+ * sender was not idle or the receive rate is at least that.
+ */
+ if (!hctx->ccid3hctx_idle ||
+ (hctx->ccid3hctx_x_recv >=
+ 4 * usecs_div(hctx->ccid3hctx_s, hctx->ccid3hctx_rtt))) {
+ ccid3_pr_debug("%s, sk=%p, state=%s, not idle\n",
+ dccp_role(sk), sk,
+ ccid3_tx_state_name(hctx->ccid3hctx_state));
+ /* Halve sending rate */
+
+ /* If (X_calc > 2 * X_recv)
+ * X_recv = max(X_recv / 2, s / (2 * t_mbi));
+ * Else
+ * X_recv = X_calc / 4;
+ */
+ BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P &&
+ hctx->ccid3hctx_x_calc == 0);
+
+ /* check also if p is zero -> x_calc is infinity? */
+ if (hctx->ccid3hctx_p < TFRC_SMALLEST_P ||
+ hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv)
+ hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2,
+ hctx->ccid3hctx_s / (2 * TFRC_MAX_BACK_OFF_TIME));
+ else
+ hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc / 4;
+
+ /* Update sending rate */
+ ccid3_hc_tx_update_x(sk);
+ }
+ /*
+ * Schedule no feedback timer to expire in
+ * max(4 * R, 2 * s / X)
+ */
+ next_tmout = max_t(u32, hctx->ccid3hctx_t_rto,
+ 2 * usecs_div(hctx->ccid3hctx_s,
+ hctx->ccid3hctx_x));
+ break;
+ default:
+ printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+ __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+ dump_stack();
+ goto out;
+ }
+
+ /* next_tmout is in usecs; always wait at least one jiffy */
+ sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+ jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout)));
+ hctx->ccid3hctx_idle = 1;
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/*
+ * Decide whether a packet of @len bytes carried by @skb may be sent now.
+ *
+ * Returns 0 when the packet may go out immediately, a positive delay when
+ * the sender has to wait first (see the conversion comment below), or a
+ * negative errno: -ENOTCONN for zero length packets, -ENOBUFS when no
+ * history entry could be allocated, -EINVAL on an illegal sender state.
+ *
+ * On the very first packet (TFRC_SSTATE_NO_SENT) the no feedback timer is
+ * armed and the sender moves to TFRC_SSTATE_NO_FBACK.
+ */
+static int ccid3_hc_tx_send_packet(struct sock *sk,
+ struct sk_buff *skb, int len)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+ struct dccp_tx_hist_entry *new_packet;
+ struct timeval now;
+ long delay;
+ int rc = -ENOTCONN;
+
+ BUG_ON(hctx == NULL || hctx->ccid3hctx_state == TFRC_SSTATE_TERM);
+
+ /* Check if pure ACK or Terminating*/
+ /*
+ * XXX: We only call this function for DATA and DATAACK, on, these
+ * packets can have zero length, but why the comment about "pure ACK"?
+ */
+ if (unlikely(len == 0))
+ goto out;
+
+ /* See if last packet allocated was not sent */
+ new_packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist);
+ if (new_packet == NULL || new_packet->dccphtx_sent) {
+ new_packet = dccp_tx_hist_entry_new(ccid3_tx_hist,
+ SLAB_ATOMIC);
+
+ rc = -ENOBUFS;
+ if (unlikely(new_packet == NULL)) {
+ LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, not enough "
+ "mem to add to history, send refused\n",
+ __FUNCTION__, dccp_role(sk), sk);
+ goto out;
+ }
+
+ dccp_tx_hist_add_entry(&hctx->ccid3hctx_hist, new_packet);
+ }
+
+ dccp_timestamp(sk, &now);
+
+ /* Every branch below assigns rc, replacing the provisional value */
+ switch (hctx->ccid3hctx_state) {
+ case TFRC_SSTATE_NO_SENT:
+ hctx->ccid3hctx_no_feedback_timer.function = ccid3_hc_tx_no_feedback_timer;
+ hctx->ccid3hctx_no_feedback_timer.data = (unsigned long)sk;
+ sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+ jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT));
+ hctx->ccid3hctx_last_win_count = 0;
+ hctx->ccid3hctx_t_last_win_count = now;
+ ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
+ hctx->ccid3hctx_t_ipi = TFRC_INITIAL_IPI;
+
+ /* Set nominal send time for initial packet */
+ hctx->ccid3hctx_t_nom = now;
+ timeval_add_usecs(&hctx->ccid3hctx_t_nom,
+ hctx->ccid3hctx_t_ipi);
+ ccid3_calc_new_delta(hctx);
+ rc = 0;
+ break;
+ case TFRC_SSTATE_NO_FBACK:
+ case TFRC_SSTATE_FBACK:
+ /* Time left until t_nom, with the delta tolerance applied */
+ delay = (timeval_delta(&now, &hctx->ccid3hctx_t_nom) -
+ hctx->ccid3hctx_delta);
+ delay /= -1000;
+ /* divide by -1000 is to convert to ms and get sign right */
+ rc = delay > 0 ? delay : 0;
+ break;
+ default:
+ printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+ __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+ dump_stack();
+ rc = -EINVAL;
+ break;
+ }
+
+ /* Can we send? if so add options and add to packet history */
+ if (rc == 0) {
+ dp->dccps_hc_tx_insert_options = 1;
+ new_packet->dccphtx_ccval =
+ DCCP_SKB_CB(skb)->dccpd_ccval =
+ hctx->ccid3hctx_last_win_count;
+ }
+out:
+ return rc;
+}
+
+/*
+ * Account for a packet of @len bytes that has just been sent.
+ *
+ * For data packets (@len > 0) the pending entry at the head of the TX
+ * history is completed (timestamp, sequence number, RTT), the window
+ * counter is advanced and the idle flag is cleared. In the NO_FBACK and
+ * FBACK states the nominal send time, t_ipi and delta are then recomputed.
+ * @more is not used.
+ */
+static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+	struct timeval now;
+
+	BUG_ON(hctx == NULL || hctx->ccid3hctx_state == TFRC_SSTATE_TERM);
+
+	dccp_timestamp(sk, &now);
+
+	/* check if we have sent a data packet */
+	if (len > 0) {
+		unsigned long quarter_rtt;
+		struct dccp_tx_hist_entry *packet;
+
+		packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist);
+		if (unlikely(packet == NULL)) {
+			LIMIT_NETDEBUG(KERN_WARNING "%s: packet doesn't "
+				       "exists in history!\n", __FUNCTION__);
+			return;
+		}
+		if (unlikely(packet->dccphtx_sent)) {
+			LIMIT_NETDEBUG(KERN_WARNING "%s: no unsent packet in "
+				       "history!\n", __FUNCTION__);
+			return;
+		}
+		packet->dccphtx_tstamp = now;
+		packet->dccphtx_seqno  = dp->dccps_gss;
+		/*
+		 * Check if win_count have changed
+		 * Algorithm in "8.1. Window Counter Value" in
+		 * draft-ietf-dccp-ccid3-11.txt
+		 */
+		quarter_rtt = timeval_delta(&now, &hctx->ccid3hctx_t_last_win_count);
+		/* For very small RTTs the elapsed usecs are used unscaled */
+		if (likely(hctx->ccid3hctx_rtt > 8))
+			quarter_rtt /= hctx->ccid3hctx_rtt / 4;
+
+		if (quarter_rtt > 0) {
+			hctx->ccid3hctx_t_last_win_count = now;
+			hctx->ccid3hctx_last_win_count	 = (hctx->ccid3hctx_last_win_count +
+							    min_t(unsigned long, quarter_rtt, 5)) % 16;
+			ccid3_pr_debug("%s, sk=%p, window changed from "
+				       "%u to %u!\n",
+				       dccp_role(sk), sk,
+				       packet->dccphtx_ccval,
+				       hctx->ccid3hctx_last_win_count);
+		}
+
+		hctx->ccid3hctx_idle = 0;
+		packet->dccphtx_rtt  = hctx->ccid3hctx_rtt;
+		packet->dccphtx_sent = 1;
+	} else
+		/* cast so that %llu matches on every architecture */
+		ccid3_pr_debug("%s, sk=%p, seqno=%llu NOT inserted!\n",
+			       dccp_role(sk), sk,
+			       (unsigned long long)dp->dccps_gss);
+
+	switch (hctx->ccid3hctx_state) {
+	case TFRC_SSTATE_NO_SENT:
+		/* if first wasn't pure ack */
+		if (len != 0)
+			printk(KERN_CRIT "%s: %s, First packet sent is noted "
+					 "as a data packet\n",
+			       __FUNCTION__, dccp_role(sk));
+		return;
+	case TFRC_SSTATE_NO_FBACK:
+	case TFRC_SSTATE_FBACK:
+		if (len > 0) {
+			/* Next nominal send time is one t_ipi from now */
+			hctx->ccid3hctx_t_nom = now;
+			ccid3_calc_new_t_ipi(hctx);
+			ccid3_calc_new_delta(hctx);
+			timeval_add_usecs(&hctx->ccid3hctx_t_nom,
+					  hctx->ccid3hctx_t_ipi);
+		}
+		break;
+	default:
+		printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+		       __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+		dump_stack();
+		break;
+	}
+}
+
+/*
+ * Process feedback carried by a received ACK or DATAACK (other packet
+ * types are ignored).
+ *
+ * The TX history entry matching the acknowledged sequence number yields a
+ * new RTT sample; from it and the previously parsed CCID3 options the RTT
+ * estimate, t_rto, X_recv and p are updated, the sending rate, t_ipi, t_nom
+ * and delta are recalculated, older history entries are purged, writers
+ * are woken up and the no feedback timer is re-armed.
+ */
+static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+	struct ccid3_options_received *opt_recv;
+	struct dccp_tx_hist_entry *packet;
+	struct timeval now;
+	unsigned long next_tmout;
+	u32 t_elapsed;
+	u32 pinv;
+	u32 x_recv;
+	u32 r_sample;
+
+	BUG_ON(hctx == NULL || hctx->ccid3hctx_state == TFRC_SSTATE_TERM);
+
+	/* we are only interested in ACKs */
+	if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
+	      DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
+		return;
+
+	opt_recv = &hctx->ccid3hctx_options_received;
+
+	/* elapsed time is scaled by 10 before being compared with usecs */
+	t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10;
+	x_recv = opt_recv->ccid3or_receive_rate;
+	pinv = opt_recv->ccid3or_loss_event_rate;
+
+	switch (hctx->ccid3hctx_state) {
+	case TFRC_SSTATE_NO_SENT:
+		/* FIXME: what to do here? */
+		return;
+	case TFRC_SSTATE_NO_FBACK:
+	case TFRC_SSTATE_FBACK:
+		/* Calculate new round trip sample by
+		 * R_sample = (now - t_recvdata) - t_delay */
+		/* get t_recvdata from history */
+		packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist,
+						 DCCP_SKB_CB(skb)->dccpd_ack_seq);
+		if (unlikely(packet == NULL)) {
+			LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, seqno "
+				       "%llu(%s) does't exist in history!\n",
+				       __FUNCTION__, dccp_role(sk), sk,
+			    (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
+				dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type));
+			return;
+		}
+
+		/* Update RTT */
+		dccp_timestamp(sk, &now);
+		r_sample = timeval_delta(&now, &packet->dccphtx_tstamp);
+		/* An implausible t_elapsed is reported and then ignored */
+		if (unlikely(r_sample <= t_elapsed))
+			LIMIT_NETDEBUG(KERN_WARNING "%s: r_sample=%uus, "
+				       "t_elapsed=%uus\n",
+				       __FUNCTION__, r_sample, t_elapsed);
+		else
+			r_sample -= t_elapsed;
+
+		/* Update RTT estimate by
+		 * If (No feedback recv)
+		 *    R = R_sample;
+		 * Else
+		 *    R = q * R + (1 - q) * R_sample;
+		 *
+		 * q is a constant, RFC 3448 recommends 0.9
+		 */
+		if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
+			ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
+			hctx->ccid3hctx_rtt = r_sample;
+		} else
+			hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 +
+					      r_sample / 10;
+
+		/* both values are in usecs, as in the warning above */
+		ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, "
+			       "r_sample=%uus\n", dccp_role(sk), sk,
+			       hctx->ccid3hctx_rtt, r_sample);
+
+		/* Update timeout interval */
+		hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
+					      USEC_PER_SEC);
+
+		/* Update receive rate */
+		hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */
+
+		/* Update loss event rate */
+		if (pinv == ~0 || pinv == 0)
+			hctx->ccid3hctx_p = 0;
+		else {
+			hctx->ccid3hctx_p = 1000000 / pinv;
+
+			if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) {
+				hctx->ccid3hctx_p = TFRC_SMALLEST_P;
+				ccid3_pr_debug("%s, sk=%p, Smallest p used!\n",
+					       dccp_role(sk), sk);
+			}
+		}
+
+		/* unschedule no feedback timer */
+		sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
+
+		/* Update sending rate */
+		ccid3_hc_tx_update_x(sk);
+
+		/* Update next send time */
+		timeval_sub_usecs(&hctx->ccid3hctx_t_nom,
+				  hctx->ccid3hctx_t_ipi);
+		ccid3_calc_new_t_ipi(hctx);
+		timeval_add_usecs(&hctx->ccid3hctx_t_nom,
+				  hctx->ccid3hctx_t_ipi);
+		ccid3_calc_new_delta(hctx);
+
+		/* remove all packets older than the one acked from history */
+		dccp_tx_hist_purge_older(ccid3_tx_hist,
+					 &hctx->ccid3hctx_hist, packet);
+		/*
+		 * As we have calculated new ipi, delta, t_nom it is possible that
+		 * we now can send a packet, so wake up dccp_wait_for_ccids.
+		 */
+		sk->sk_write_space(sk);
+
+		/*
+		 * Schedule no feedback timer to expire in
+		 * max(4 * R, 2 * s / X)
+		 */
+		next_tmout = max(hctx->ccid3hctx_t_rto,
+				 2 * usecs_div(hctx->ccid3hctx_s,
+					       hctx->ccid3hctx_x));
+
+		ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to "
+			       "expire in %lu jiffies (%luus)\n",
+			       dccp_role(sk), sk,
+			       usecs_to_jiffies(next_tmout), next_tmout);
+
+		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+			       jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout)));
+
+		/* set idle flag */
+		hctx->ccid3hctx_idle = 1;
+		break;
+	default:
+		printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+		       __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+		dump_stack();
+		break;
+	}
+}
+
+/*
+ * Stamp the current window counter into @skb's control block. Nothing is
+ * done unless the socket is in the OPEN or PARTOPEN state.
+ */
+static void ccid3_hc_tx_insert_options(struct sock *sk, struct sk_buff *skb)
+{
+	const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+
+	BUG_ON(hctx == NULL);
+
+	if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN)
+		return;
+
+	DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count;
+}
+
+/*
+ * Read a 32 bit network byte order option value. @value is a plain byte
+ * pointer with no alignment guarantee, so the bytes are copied out instead
+ * of dereferencing it as a u32.
+ */
+static inline u32 ccid3_hc_tx_opt_u32(const unsigned char *value)
+{
+	u32 raw;
+
+	memcpy(&raw, value, sizeof(raw));
+	return ntohl(raw);
+}
+
+/*
+ * Record one CCID3 specific option found in a received packet.
+ *
+ * @option is the option type, @len the length of its payload, @idx its
+ * index and @value points at the payload. The values collected for the
+ * previous packet are reset the first time an option of a packet with a
+ * new dccps_gsr is seen.
+ *
+ * Returns 0 on success (unknown options are ignored) or -EINVAL when a
+ * loss event rate or receive rate option does not have a 4 byte payload.
+ */
+static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
+				     unsigned char len, u16 idx,
+				     unsigned char *value)
+{
+	int rc = 0;
+	const struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+	struct ccid3_options_received *opt_recv;
+
+	BUG_ON(hctx == NULL);
+
+	opt_recv = &hctx->ccid3hctx_options_received;
+
+	if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
+		opt_recv->ccid3or_seqno		     = dp->dccps_gsr;
+		opt_recv->ccid3or_loss_event_rate    = ~0;
+		opt_recv->ccid3or_loss_intervals_idx = 0;
+		opt_recv->ccid3or_loss_intervals_len = 0;
+		opt_recv->ccid3or_receive_rate	     = 0;
+	}
+
+	switch (option) {
+	case TFRC_OPT_LOSS_EVENT_RATE:
+		if (unlikely(len != 4)) {
+			LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, invalid "
+				       "len for TFRC_OPT_LOSS_EVENT_RATE\n",
+				       __FUNCTION__, dccp_role(sk), sk);
+			rc = -EINVAL;
+		} else {
+			opt_recv->ccid3or_loss_event_rate =
+						ccid3_hc_tx_opt_u32(value);
+			ccid3_pr_debug("%s, sk=%p, LOSS_EVENT_RATE=%u\n",
+				       dccp_role(sk), sk,
+				       opt_recv->ccid3or_loss_event_rate);
+		}
+		break;
+	case TFRC_OPT_LOSS_INTERVALS:
+		/* Only position and length are remembered, not the data */
+		opt_recv->ccid3or_loss_intervals_idx = idx;
+		opt_recv->ccid3or_loss_intervals_len = len;
+		ccid3_pr_debug("%s, sk=%p, LOSS_INTERVALS=(%u, %u)\n",
+			       dccp_role(sk), sk,
+			       opt_recv->ccid3or_loss_intervals_idx,
+			       opt_recv->ccid3or_loss_intervals_len);
+		break;
+	case TFRC_OPT_RECEIVE_RATE:
+		if (unlikely(len != 4)) {
+			LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, invalid "
+				       "len for TFRC_OPT_RECEIVE_RATE\n",
+				       __FUNCTION__, dccp_role(sk), sk);
+			rc = -EINVAL;
+		} else {
+			opt_recv->ccid3or_receive_rate =
+						ccid3_hc_tx_opt_u32(value);
+			ccid3_pr_debug("%s, sk=%p, RECEIVE_RATE=%u\n",
+				       dccp_role(sk), sk,
+				       opt_recv->ccid3or_receive_rate);
+		}
+		break;
+	}
+
+	return rc;
+}
+
+/*
+ * Allocate and initialise the private TX half connection state of @sk.
+ *
+ * Returns 0 on success or -ENOMEM when the allocation fails.
+ */
+static int ccid3_hc_tx_init(struct sock *sk)
+{
+	struct ccid3_hc_tx_sock *hctx;
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	dp->dccps_hc_tx_ccid_private = kmalloc(sizeof(*hctx), gfp_any());
+	if (dp->dccps_hc_tx_ccid_private == NULL)
+		return -ENOMEM;
+
+	hctx = ccid3_hc_tx_sk(sk);
+	memset(hctx, 0, sizeof(*hctx));
+
+	/* Use the socket's packet size if it is in range, else the default */
+	hctx->ccid3hctx_s = TFRC_STD_PACKET_SIZE;
+	if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE &&
+	    dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE)
+		hctx->ccid3hctx_s = dp->dccps_packet_size;
+
+	/* Set transmission rate to 1 packet per second */
+	hctx->ccid3hctx_x     = hctx->ccid3hctx_s;
+	hctx->ccid3hctx_t_rto = USEC_PER_SEC;
+	hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
+
+	INIT_LIST_HEAD(&hctx->ccid3hctx_hist);
+	init_timer(&hctx->ccid3hctx_no_feedback_timer);
+
+	return 0;
+}
+
+/*
+ * Tear down the TX half connection of @sk: mark it terminated, stop the
+ * no feedback timer, drop the packet history and free the private state.
+ */
+static void ccid3_hc_tx_exit(struct sock *sk)
+{
+	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	BUG_ON(hctx == NULL);
+
+	/* The state is switched before the timer is stopped */
+	ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
+	sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
+
+	/* Release every entry still held in the packet history */
+	dccp_tx_hist_purge(ccid3_tx_hist, &hctx->ccid3hctx_hist);
+
+	kfree(dp->dccps_hc_tx_ccid_private);
+	dp->dccps_hc_tx_ccid_private = NULL;
+}
+
+/*
+ * RX Half Connection methods
+ */
+
+/* TFRC receiver states */
+enum ccid3_hc_rx_states {
+ /* Initial state, set by ccid3_hc_rx_init() */
+ TFRC_RSTATE_NO_DATA = 1,
+ /* Entered by ccid3_hc_rx_packet_recv() after the initial feedback */
+ TFRC_RSTATE_DATA,
+ /* Set by ccid3_hc_rx_exit() */
+ TFRC_RSTATE_TERM = 127,
+};
+
+#ifdef CCID3_DEBUG
+/* Map a TFRC receiver state to its short name for debug output. */
+static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
+{
+	static char *rx_state_names[] = {
+		[TFRC_RSTATE_NO_DATA] = "NO_DATA",
+		[TFRC_RSTATE_DATA]    = "DATA",
+		[TFRC_RSTATE_TERM]    = "TERM",
+	};
+	const char *name = rx_state_names[state];
+
+	return name;
+}
+#endif
+
+/*
+ * Move the RX half connection of @sk to @state. The transition is logged
+ * through ccid3_pr_debug() and a warning is raised when the socket is
+ * already in @state.
+ */
+static inline void ccid3_hc_rx_set_state(struct sock *sk,
+					 enum ccid3_hc_rx_states state)
+{
+	struct ccid3_hc_rx_sock *rx = ccid3_hc_rx_sk(sk);
+	const enum ccid3_hc_rx_states prev = rx->ccid3hcrx_state;
+
+	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
+		       dccp_role(sk), sk, ccid3_rx_state_name(prev),
+		       ccid3_rx_state_name(state));
+	WARN_ON(prev == state);
+	rx->ccid3hcrx_state = state;
+}
+
+/*
+ * Prepare the feedback values (X_recv, elapsed time, inverse loss event
+ * rate) and send them to the peer with dccp_send_ack().
+ *
+ * Nothing is sent when the receiver is in an unexpected state or when the
+ * RX history holds no data packet.
+ */
+static void ccid3_hc_rx_send_feedback(struct sock *sk)
+{
+ struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_rx_hist_entry *packet;
+ struct timeval now;
+
+ ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+
+ dccp_timestamp(sk, &now);
+
+ switch (hcrx->ccid3hcrx_state) {
+ case TFRC_RSTATE_NO_DATA:
+ hcrx->ccid3hcrx_x_recv = 0;
+ break;
+ case TFRC_RSTATE_DATA: {
+ /* Bytes received since the last feedback, scaled to bytes/s */
+ const u32 delta = timeval_delta(&now,
+ &hcrx->ccid3hcrx_tstamp_last_feedback);
+ hcrx->ccid3hcrx_x_recv = usecs_div(hcrx->ccid3hcrx_bytes_recv,
+ delta);
+ }
+ break;
+ default:
+ printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+ __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state);
+ dump_stack();
+ return;
+ }
+
+ packet = dccp_rx_hist_find_data_packet(&hcrx->ccid3hcrx_hist);
+ if (unlikely(packet == NULL)) {
+ LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, no data packet "
+ "in history!\n",
+ __FUNCTION__, dccp_role(sk), sk);
+ return;
+ }
+
+ /* Start a new measurement period for the next feedback */
+ hcrx->ccid3hcrx_tstamp_last_feedback = now;
+ hcrx->ccid3hcrx_last_counter = packet->dccphrx_ccval;
+ hcrx->ccid3hcrx_seqno_last_counter = packet->dccphrx_seqno;
+ hcrx->ccid3hcrx_bytes_recv = 0;
+
+ /* Convert to multiples of 10us */
+ hcrx->ccid3hcrx_elapsed_time =
+ timeval_delta(&now, &packet->dccphrx_tstamp) / 10;
+ /* p == 0 is reported as an all-ones inverse loss event rate */
+ if (hcrx->ccid3hcrx_p == 0)
+ hcrx->ccid3hcrx_pinv = ~0;
+ else
+ hcrx->ccid3hcrx_pinv = 1000000 / hcrx->ccid3hcrx_p;
+ dp->dccps_hc_rx_insert_options = 1;
+ dccp_send_ack(sk);
+}
+
+/*
+ * Add the receiver's feedback to an outgoing packet.
+ *
+ * The last window counter is always stamped into @skb's control block.
+ * Packets carrying an ack additionally get the elapsed time (if non-zero),
+ * a timestamp, the loss event rate and the receive rate options, in that
+ * order. Nothing is done unless the socket is in the OPEN or PARTOPEN
+ * state.
+ */
+static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
+{
+	const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+	u32 rate_be, pinv_be;
+
+	BUG_ON(hcrx == NULL);
+
+	if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN)
+		return;
+
+	DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_last_counter;
+
+	if (dccp_packet_without_ack(skb))
+		return;
+
+	if (hcrx->ccid3hcrx_elapsed_time != 0)
+		dccp_insert_option_elapsed_time(sk, skb,
+						hcrx->ccid3hcrx_elapsed_time);
+	dccp_insert_option_timestamp(sk, skb);
+
+	/* Both values are put on the wire in network byte order */
+	pinv_be = htonl(hcrx->ccid3hcrx_pinv);
+	rate_be = htonl(hcrx->ccid3hcrx_x_recv);
+	dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
+			   &pinv_be, sizeof(pinv_be));
+	dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
+			   &rate_be, sizeof(rate_be));
+}
+
+/* calculate first loss interval
+ *
+ * returns estimated loss interval in usecs */
+
+/*
+ * Estimate the first loss interval from the RX packet history.
+ *
+ * The RTT is approximated from the timestamps of two data packets in the
+ * history and the difference of their window counters. Together with the
+ * receive rate measured since the last feedback it gives the value looked
+ * up with tfrc_calc_x_reverse_lookup() to obtain a loss rate p.
+ *
+ * Returns 1000000 / p, or ~0 when there is no data packet in the history,
+ * when receive rate and RTT are too small to compute the lookup value, or
+ * when the lookup yields p == 0.
+ */
+static u32 ccid3_hc_rx_calc_first_li(struct sock *sk)
+{
+	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+	struct dccp_rx_hist_entry *entry, *next, *tail = NULL;
+	u32 rtt, delta, x_recv, fval, p, tmp2;
+	struct timeval tstamp = { 0, };
+	int interval = 0;
+	int win_count = 0;
+	int step = 0;
+	u64 tmp1;
+
+	list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist,
+				 dccphrx_node) {
+		if (dccp_rx_hist_entry_data_packet(entry)) {
+			tail = entry;
+
+			switch (step) {
+			case 0:
+				/* First data packet: reference point */
+				tstamp	  = entry->dccphrx_tstamp;
+				win_count = entry->dccphrx_ccval;
+				step = 1;
+				break;
+			case 1:
+				/* Window counter distance, with wrap-around */
+				interval = win_count - entry->dccphrx_ccval;
+				if (interval < 0)
+					interval += TFRC_WIN_COUNT_LIMIT;
+				if (interval > 4)
+					goto found;
+				break;
+			}
+		}
+	}
+
+	if (unlikely(step == 0)) {
+		LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, packet history "
+			       "contains no data packets!\n",
+			       __FUNCTION__, dccp_role(sk), sk);
+		return ~0;
+	}
+
+	if (unlikely(interval == 0)) {
+		LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, Could not find a "
+			       "win_count interval > 0. Defaulting to 1\n",
+			       __FUNCTION__, dccp_role(sk), sk);
+		interval = 1;
+	}
+found:
+	rtt = timeval_delta(&tstamp, &tail->dccphrx_tstamp) * 4 / interval;
+	ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n",
+		       dccp_role(sk), sk, rtt);
+	if (rtt == 0)
+		rtt = 1;
+
+	dccp_timestamp(sk, &tstamp);
+	delta = timeval_delta(&tstamp, &hcrx->ccid3hcrx_tstamp_last_feedback);
+	x_recv = usecs_div(hcrx->ccid3hcrx_bytes_recv, delta);
+
+	tmp1 = (u64)x_recv * (u64)rtt;
+	do_div(tmp1, 10000000);
+	tmp2 = (u32)tmp1;
+	/*
+	 * x_recv * rtt below 10000000 (e.g. nothing received yet) leaves
+	 * tmp2 at zero, and the division below must not be attempted.
+	 */
+	if (unlikely(tmp2 == 0)) {
+		LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, x_recv=%u, "
+			       "rtt=%u too small to estimate loss interval\n",
+			       __FUNCTION__, dccp_role(sk), sk, x_recv, rtt);
+		return ~0;
+	}
+	fval = (hcrx->ccid3hcrx_s * 100000) / tmp2;
+	/* do not alter order above or you will get overflow on 32 bit */
+	p = tfrc_calc_x_reverse_lookup(fval);
+	ccid3_pr_debug("%s, sk=%p, receive rate=%u bytes/s, implied "
+		       "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
+
+	if (p == 0)
+		return ~0;
+	else
+		return 1000000 / p;
+}
+
+/*
+ * Update the loss interval history after a loss at @seq_loss / @win_loss.
+ *
+ * Only the first loss is handled: a new interval entry is created while
+ * the history is still empty and its length comes from
+ * ccid3_hc_rx_calc_first_li(). Every other case just logs a FIXME.
+ */
+static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss)
+{
+	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+	struct dccp_li_hist_entry *first;
+
+	if (seq_loss == DCCP_MAX_SEQNO + 1 ||
+	    !list_empty(&hcrx->ccid3hcrx_li_hist)) {
+		LIMIT_NETDEBUG(KERN_WARNING "%s: FIXME: find end of "
+			       "interval\n", __FUNCTION__);
+		return;
+	}
+
+	first = dccp_li_hist_interval_new(ccid3_li_hist,
+					  &hcrx->ccid3hcrx_li_hist,
+					  seq_loss, win_loss);
+	if (first != NULL)
+		first->dccplih_interval = ccid3_hc_rx_calc_first_li(sk);
+}
+
+/*
+ * Ask the RX history for the sequence number and window counter of a loss
+ * and pass both on to ccid3_hc_rx_update_li().
+ */
+static void ccid3_hc_rx_detect_loss(struct sock *sk)
+{
+	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+	u8 win_loss;
+	u64 seq_loss;
+
+	seq_loss = dccp_rx_hist_detect_loss(&hcrx->ccid3hcrx_hist,
+					    &hcrx->ccid3hcrx_li_hist,
+					    &win_loss);
+	ccid3_hc_rx_update_li(sk, seq_loss, win_loss);
+}
+
+/*
+ * Receiver side processing of an incoming packet.
+ *
+ * ACK and DATAACK packets carrying a timestamp echo update the RTT
+ * estimate. ACK, DATAACK and DATA packets are then added to the RX
+ * history; any other packet type is ignored. For DATA and DATAACK the
+ * receiver either sends its initial feedback (NO_DATA state), sends
+ * periodic feedback once per RTT, or, when dccp_rx_hist_add_packet()
+ * returns non-zero, runs loss detection and sends feedback if the loss
+ * event rate went up.
+ */
+static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+ const struct dccp_options_received *opt_recv;
+ struct dccp_rx_hist_entry *packet;
+ struct timeval now;
+ u8 win_count;
+ u32 p_prev, r_sample, t_elapsed;
+ int ins;
+
+ BUG_ON(hcrx == NULL ||
+ !(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA ||
+ hcrx->ccid3hcrx_state == TFRC_RSTATE_DATA));
+
+ opt_recv = &dccp_sk(sk)->dccps_options_received;
+
+ switch (DCCP_SKB_CB(skb)->dccpd_type) {
+ case DCCP_PKT_ACK:
+ if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)
+ return;
+ /* fall through: ACKs get the same RTT handling as DATAACKs */
+ case DCCP_PKT_DATAACK:
+ if (opt_recv->dccpor_timestamp_echo == 0)
+ break;
+ /* p_prev temporarily holds the previous RTT, for the debug below */
+ p_prev = hcrx->ccid3hcrx_rtt;
+ dccp_timestamp(sk, &now);
+ timeval_sub_usecs(&now, opt_recv->dccpor_timestamp_echo * 10);
+ r_sample = timeval_usecs(&now);
+ t_elapsed = opt_recv->dccpor_elapsed_time * 10;
+
+ /* An implausible t_elapsed is reported and then ignored */
+ if (unlikely(r_sample <= t_elapsed))
+ LIMIT_NETDEBUG(KERN_WARNING "%s: r_sample=%uus, "
+ "t_elapsed=%uus\n",
+ __FUNCTION__, r_sample, t_elapsed);
+ else
+ r_sample -= t_elapsed;
+
+ /* First sample is taken as is, later ones are weighted 1/10 */
+ if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)
+ hcrx->ccid3hcrx_rtt = r_sample;
+ else
+ hcrx->ccid3hcrx_rtt = (hcrx->ccid3hcrx_rtt * 9) / 10 +
+ r_sample / 10;
+
+ if (p_prev != hcrx->ccid3hcrx_rtt)
+ ccid3_pr_debug("%s, New RTT=%luus, elapsed time=%u\n",
+ dccp_role(sk), hcrx->ccid3hcrx_rtt,
+ opt_recv->dccpor_elapsed_time);
+ break;
+ case DCCP_PKT_DATA:
+ break;
+ default: /* We're not interested in other packet types, move along */
+ return;
+ }
+
+ packet = dccp_rx_hist_entry_new(ccid3_rx_hist, sk, opt_recv->dccpor_ndp,
+ skb, SLAB_ATOMIC);
+ if (unlikely(packet == NULL)) {
+ LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, Not enough mem to "
+ "add rx packet to history, consider it lost!\n",
+ __FUNCTION__, dccp_role(sk), sk);
+ return;
+ }
+
+ win_count = packet->dccphrx_ccval;
+
+ ins = dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist,
+ &hcrx->ccid3hcrx_li_hist, packet);
+
+ /* Pure ACKs are recorded in the history but trigger no feedback */
+ if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK)
+ return;
+
+ switch (hcrx->ccid3hcrx_state) {
+ case TFRC_RSTATE_NO_DATA:
+ ccid3_pr_debug("%s, sk=%p(%s), skb=%p, sending initial "
+ "feedback\n",
+ dccp_role(sk), sk,
+ dccp_state_name(sk->sk_state), skb);
+ ccid3_hc_rx_send_feedback(sk);
+ ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
+ return;
+ case TFRC_RSTATE_DATA:
+ /* Count payload only: packet length minus the DCCP header */
+ hcrx->ccid3hcrx_bytes_recv += skb->len -
+ dccp_hdr(skb)->dccph_doff * 4;
+ /* Non-zero ins leaves the switch for the loss handling below */
+ if (ins != 0)
+ break;
+
+ /* Periodic feedback, at most once per RTT */
+ dccp_timestamp(sk, &now);
+ if (timeval_delta(&now, &hcrx->ccid3hcrx_tstamp_last_ack) >=
+ hcrx->ccid3hcrx_rtt) {
+ hcrx->ccid3hcrx_tstamp_last_ack = now;
+ ccid3_hc_rx_send_feedback(sk);
+ }
+ return;
+ default:
+ printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+ __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state);
+ dump_stack();
+ return;
+ }
+
+ /* Dealing with packet loss */
+ ccid3_pr_debug("%s, sk=%p(%s), data loss! Reacting...\n",
+ dccp_role(sk), sk, dccp_state_name(sk->sk_state));
+
+ ccid3_hc_rx_detect_loss(sk);
+ p_prev = hcrx->ccid3hcrx_p;
+
+ /* Calculate loss event rate */
+ if (!list_empty(&hcrx->ccid3hcrx_li_hist))
+ /* Scaling up by 1000000 as fixed decimal */
+ hcrx->ccid3hcrx_p = 1000000 / dccp_li_hist_calc_i_mean(&hcrx->ccid3hcrx_li_hist);
+
+ /* Feedback is only sent early when the loss event rate increased */
+ if (hcrx->ccid3hcrx_p > p_prev) {
+ ccid3_hc_rx_send_feedback(sk);
+ return;
+ }
+}
+
+/*
+ * Allocate and initialise the private RX half connection state of @sk.
+ *
+ * Returns 0 on success or -ENOMEM when the allocation fails.
+ */
+static int ccid3_hc_rx_init(struct sock *sk)
+{
+	struct ccid3_hc_rx_sock *hcrx;
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+
+	dp->dccps_hc_rx_ccid_private = kmalloc(sizeof(*hcrx), gfp_any());
+	if (dp->dccps_hc_rx_ccid_private == NULL)
+		return -ENOMEM;
+
+	hcrx = ccid3_hc_rx_sk(sk);
+	memset(hcrx, 0, sizeof(*hcrx));
+
+	/* Use the socket's packet size if it is in range, else the default */
+	hcrx->ccid3hcrx_s = TFRC_STD_PACKET_SIZE;
+	if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE &&
+	    dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE)
+		hcrx->ccid3hcrx_s = dp->dccps_packet_size;
+
+	hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
+	INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist);
+	INIT_LIST_HEAD(&hcrx->ccid3hcrx_li_hist);
+
+	/* Both "last ack" and "last feedback" start out as the current time */
+	dccp_timestamp(sk, &hcrx->ccid3hcrx_tstamp_last_ack);
+	hcrx->ccid3hcrx_tstamp_last_feedback = hcrx->ccid3hcrx_tstamp_last_ack;
+	hcrx->ccid3hcrx_rtt = 5000; /* XXX 5ms for now... */
+
+	return 0;
+}
+
+/*
+ * Tear down the RX half connection of @sk: mark it terminated, drop the
+ * packet and loss interval histories and free the private state.
+ */
+static void ccid3_hc_rx_exit(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+
+	BUG_ON(hcrx == NULL);
+
+	ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
+
+	/* Release the packet history first, then the loss intervals */
+	dccp_rx_hist_purge(ccid3_rx_hist, &hcrx->ccid3hcrx_hist);
+	dccp_li_hist_purge(ccid3_li_hist, &hcrx->ccid3hcrx_li_hist);
+
+	kfree(dp->dccps_hc_rx_ccid_private);
+	dp->dccps_hc_rx_ccid_private = NULL;
+}
+
+/*
+ * Fill the receiver related fields of @info (state, timestamp option flag
+ * and receive RTT). @info is left untouched for listening sockets.
+ */
+static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
+{
+	const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+
+	/* Listening sockets don't have a private CCID block */
+	if (sk->sk_state != DCCP_LISTEN) {
+		BUG_ON(hcrx == NULL);
+
+		info->tcpi_ca_state = hcrx->ccid3hcrx_state;
+		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+		info->tcpi_rcv_rtt  = hcrx->ccid3hcrx_rtt;
+	}
+}
+
+/*
+ * Fill the sender related fields of @info (RTO and RTT). @info is left
+ * untouched for listening sockets.
+ */
+static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
+{
+	const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+
+	/* Listening sockets don't have a private CCID block */
+	if (sk->sk_state != DCCP_LISTEN) {
+		BUG_ON(hctx == NULL);
+
+		info->tcpi_rto = hctx->ccid3hctx_t_rto;
+		info->tcpi_rtt = hctx->ccid3hctx_rtt;
+	}
+}
+
+/*
+ * ccid3_hc_rx_getsockopt - copy receiver-side CCID3 data to user space
+ * @sk: socket being queried
+ * @optname: option requested, only DCCP_SOCKOPT_CCID_RX_INFO is handled
+ * @len: size of the user buffer as supplied by the caller
+ * @optval: user buffer receiving the data
+ * @optlen: user location receiving the number of bytes written
+ *
+ * Returns 0 on success, -EINVAL for a listening socket or when @len is
+ * negative or smaller than the data, -ENOPROTOOPT for an unknown @optname
+ * and -EFAULT when the user memory cannot be written.
+ */
+static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
+				  u32 __user *optval, int __user *optlen)
+{
+	const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+	const void *val;
+
+	/* Listening sockets do not have a private CCID block */
+	if (sk->sk_state == DCCP_LISTEN)
+		return -EINVAL;
+
+	switch (optname) {
+	case DCCP_SOCKOPT_CCID_RX_INFO:
+		/*
+		 * len is signed while sizeof() is unsigned: without the
+		 * explicit test a negative len would be converted to a huge
+		 * unsigned value and pass the minimum-size check.
+		 */
+		if (len < 0 || len < sizeof(hcrx->ccid3hcrx_tfrc))
+			return -EINVAL;
+		len = sizeof(hcrx->ccid3hcrx_tfrc);
+		val = &hcrx->ccid3hcrx_tfrc;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, optlen) || copy_to_user(optval, val, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * ccid3_hc_tx_getsockopt - copy sender-side CCID3 data to user space
+ * @sk: socket being queried
+ * @optname: option requested, only DCCP_SOCKOPT_CCID_TX_INFO is handled
+ * @len: size of the user buffer as supplied by the caller
+ * @optval: user buffer receiving the data
+ * @optlen: user location receiving the number of bytes written
+ *
+ * Returns 0 on success, -EINVAL for a listening socket or when @len is
+ * negative or smaller than the data, -ENOPROTOOPT for an unknown @optname
+ * and -EFAULT when the user memory cannot be written.
+ */
+static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
+				  u32 __user *optval, int __user *optlen)
+{
+	const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+	const void *val;
+
+	/* Listening sockets do not have a private CCID block */
+	if (sk->sk_state == DCCP_LISTEN)
+		return -EINVAL;
+
+	switch (optname) {
+	case DCCP_SOCKOPT_CCID_TX_INFO:
+		/*
+		 * len is signed while sizeof() is unsigned: without the
+		 * explicit test a negative len would be converted to a huge
+		 * unsigned value and pass the minimum-size check.
+		 */
+		if (len < 0 || len < sizeof(hctx->ccid3hctx_tfrc))
+			return -EINVAL;
+		len = sizeof(hctx->ccid3hctx_tfrc);
+		val = &hctx->ccid3hctx_tfrc;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, optlen) || copy_to_user(optval, val, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * CCID3 operations table, handed to ccid_register() at module load and to
+ * ccid_unregister() at module unload.
+ */
+static struct ccid ccid3 = {
+	/* Identity */
+	.ccid_id			= 3,
+	.ccid_name			= "ccid3",
+	.ccid_owner			= THIS_MODULE,
+	/* Per-CCID hooks */
+	.ccid_init			= ccid3_init,
+	.ccid_exit			= ccid3_exit,
+	/* Sender half connection */
+	.ccid_hc_tx_init		= ccid3_hc_tx_init,
+	.ccid_hc_tx_exit		= ccid3_hc_tx_exit,
+	.ccid_hc_tx_send_packet		= ccid3_hc_tx_send_packet,
+	.ccid_hc_tx_packet_sent		= ccid3_hc_tx_packet_sent,
+	.ccid_hc_tx_packet_recv		= ccid3_hc_tx_packet_recv,
+	.ccid_hc_tx_insert_options	= ccid3_hc_tx_insert_options,
+	.ccid_hc_tx_parse_options	= ccid3_hc_tx_parse_options,
+	.ccid_hc_tx_get_info		= ccid3_hc_tx_get_info,
+	.ccid_hc_tx_getsockopt		= ccid3_hc_tx_getsockopt,
+	/* Receiver half connection */
+	.ccid_hc_rx_init		= ccid3_hc_rx_init,
+	.ccid_hc_rx_exit		= ccid3_hc_rx_exit,
+	.ccid_hc_rx_insert_options	= ccid3_hc_rx_insert_options,
+	.ccid_hc_rx_packet_recv		= ccid3_hc_rx_packet_recv,
+	.ccid_hc_rx_get_info		= ccid3_hc_rx_get_info,
+	.ccid_hc_rx_getsockopt		= ccid3_hc_rx_getsockopt,
+};
+
+/* Expose ccid3_debug as an int module parameter with permission bits 0444 */
+module_param(ccid3_debug, int, 0444);
+MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
+
+/*
+ * Module load: create the rx, tx and loss interval history caches, then
+ * register CCID3. Returns 0 on success, -ENOBUFS when a history cannot be
+ * created, or the error from ccid_register(); on failure everything that
+ * was created is deleted again and the global pointers are reset to NULL.
+ */
+static __init int ccid3_module_init(void)
+{
+	int err = -ENOBUFS;
+
+	ccid3_rx_hist = dccp_rx_hist_new("ccid3");
+	if (!ccid3_rx_hist)
+		return err;
+
+	ccid3_tx_hist = dccp_tx_hist_new("ccid3");
+	if (!ccid3_tx_hist)
+		goto undo_rx;
+
+	ccid3_li_hist = dccp_li_hist_new("ccid3");
+	if (!ccid3_li_hist)
+		goto undo_tx;
+
+	err = ccid_register(&ccid3);
+	if (err == 0)
+		return 0;
+
+	/* Registration failed: unwind in reverse order of creation */
+	dccp_li_hist_delete(ccid3_li_hist);
+	ccid3_li_hist = NULL;
+undo_tx:
+	dccp_tx_hist_delete(ccid3_tx_hist);
+	ccid3_tx_hist = NULL;
+undo_rx:
+	dccp_rx_hist_delete(ccid3_rx_hist);
+	ccid3_rx_hist = NULL;
+	return err;
+}
+module_init(ccid3_module_init);
+
+/*
+ * Module unload: unregister CCID3, then delete each history cache that is
+ * still allocated and reset its global pointer.
+ */
+static __exit void ccid3_module_exit(void)
+{
+#ifdef CONFIG_IP_DCCP_UNLOAD_HACK
+	/*
+	 * Development-time hack: get rid of the control sock, which is
+	 * what keeps a refcount on dccp.ko -acme
+	 */
+	extern void dccp_ctl_sock_exit(void);
+
+	dccp_ctl_sock_exit();
+#endif
+	ccid_unregister(&ccid3);
+
+	if (ccid3_tx_hist) {
+		dccp_tx_hist_delete(ccid3_tx_hist);
+		ccid3_tx_hist = NULL;
+	}
+
+	if (ccid3_rx_hist) {
+		dccp_rx_hist_delete(ccid3_rx_hist);
+		ccid3_rx_hist = NULL;
+	}
+
+	if (ccid3_li_hist) {
+		dccp_li_hist_delete(ccid3_li_hist);
+		ccid3_li_hist = NULL;
+	}
+}
+module_exit(ccid3_module_exit);
+
+MODULE_AUTHOR("Ian McDonald <iam4@cs.waikato.ac.nz>, "
+ "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
+MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("net-dccp-ccid-3");
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
new file mode 100644
index 00000000000..0bde4583d09
--- /dev/null
+++ b/net/dccp/ccids/ccid3.h
@@ -0,0 +1,146 @@
+/*
+ * net/dccp/ccids/ccid3.h
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *
+ * An implementation of the DCCP protocol
+ *
+ * This code has been developed by the University of Waikato WAND
+ * research group. For further information please see http://www.wand.net.nz/
+ * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz
+ *
+ * This code also uses code from Lulea University, rereleased as GPL by its
+ * authors:
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ * and to make it work as a loadable module in the DCCP stack written by
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _DCCP_CCID3_H_
+#define _DCCP_CCID3_H_
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/time.h>
+#include <linux/types.h>
+#include <linux/tfrc.h>
+
+/*
+ * Accepted range for the packet size s and the value used when the
+ * socket's packet size falls outside that range.
+ */
+#define TFRC_MIN_PACKET_SIZE 16
+#define TFRC_STD_PACKET_SIZE 256
+#define TFRC_MAX_PACKET_SIZE 65535
+
+/* Two seconds (expressed in usecs) as per CCID3 spec */
+#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
+
+/* A quarter of a second, in usecs; presumably the initial inter-packet interval - confirm */
+#define TFRC_INITIAL_IPI (USEC_PER_SEC / 4)
+
+/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
+#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ))
+
+/* In seconds */
+#define TFRC_MAX_BACK_OFF_TIME 64
+
+/* NOTE(review): presumably on the same 1000000 scale as ccid3hctx_p - confirm */
+#define TFRC_SMALLEST_P 40
+
+/*
+ * Numeric codes of the TFRC options (presumably the CCID3-specific DCCP
+ * option types - confirm against the CCID3 spec).
+ */
+enum ccid3_options {
+ TFRC_OPT_LOSS_EVENT_RATE = 192,
+ TFRC_OPT_LOSS_INTERVALS = 193,
+ TFRC_OPT_RECEIVE_RATE = 194,
+};
+
+/*
+ * Values of the TFRC options last received; embedded in struct
+ * ccid3_hc_tx_sock as ccid3hctx_options_received.
+ * NOTE(review): per-field meaning is inferred from the names only (48 bit
+ * sequence number, loss intervals index/length, loss event rate, receive
+ * rate) - confirm against the option parsing code.
+ */
+struct ccid3_options_received {
+ u64 ccid3or_seqno:48,
+ ccid3or_loss_intervals_idx:16;
+ u16 ccid3or_loss_intervals_len;
+ u32 ccid3or_loss_event_rate;
+ u32 ccid3or_receive_rate;
+};
+
+/** struct ccid3_hc_tx_sock - CCID3 sender half connection sock
+ *
+ * @ccid3hctx_tfrc - Block copied to user space for DCCP_SOCKOPT_CCID_TX_INFO;
+ * the x, x_recv, x_calc, rtt, p, t_rto and t_ipi names
+ * below are macros aliasing its members
+ * @ccid3hctx_state - Sender state
+ * @ccid3hctx_x - Current sending rate
+ * @ccid3hctx_x_recv - Receive rate
+ * @ccid3hctx_x_calc - Calculated send (?) rate
+ * @ccid3hctx_s - Packet size
+ * @ccid3hctx_rtt - Estimate of current round trip time in usecs
+ * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000
+ * @ccid3hctx_t_rto - Value reported as tcpi_rto by ccid3_hc_tx_get_info()
+ * @ccid3hctx_last_win_count - Last window counter sent
+ * @ccid3hctx_t_last_win_count - Timestamp of earliest packet
+ * with last_win_count value sent
+ * @ccid3hctx_no_feedback_timer - Handle to no feedback timer
+ * @ccid3hctx_idle - FIXME
+ * @ccid3hctx_t_ld - Time last doubled during slow start
+ * @ccid3hctx_t_nom - Nominal send time of next packet
+ * @ccid3hctx_t_ipi - Interpacket (send) interval
+ * @ccid3hctx_delta - Send timer delta
+ * @ccid3hctx_hist - Packet history
+ * @ccid3hctx_options_received - TFRC option values, see
+ * struct ccid3_options_received
+ */
+struct ccid3_hc_tx_sock {
+ struct tfrc_tx_info ccid3hctx_tfrc;
+#define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x
+#define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv
+#define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc
+#define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt
+#define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p
+#define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto
+#define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi
+ u16 ccid3hctx_s;
+ u8 ccid3hctx_state;
+ u8 ccid3hctx_last_win_count;
+ u8 ccid3hctx_idle;
+ struct timeval ccid3hctx_t_last_win_count;
+ struct timer_list ccid3hctx_no_feedback_timer;
+ struct timeval ccid3hctx_t_ld;
+ struct timeval ccid3hctx_t_nom;
+ u32 ccid3hctx_delta;
+ struct list_head ccid3hctx_hist;
+ struct ccid3_options_received ccid3hctx_options_received;
+};
+
+/** struct ccid3_hc_rx_sock - CCID3 receiver half connection sock
+ *
+ * @ccid3hcrx_tfrc - Block copied to user space for DCCP_SOCKOPT_CCID_RX_INFO;
+ * the x_recv, rtt and p names below are macros aliasing
+ * its members
+ * @ccid3hcrx_rtt - Reported as tcpi_rcv_rtt; set to 5000 at init
+ * @ccid3hcrx_state - Receiver state, TFRC_RSTATE_NO_DATA after init
+ * @ccid3hcrx_tstamp_last_feedback - Starts as a copy of tstamp_last_ack
+ * @ccid3hcrx_tstamp_last_ack - Set from dccp_timestamp() at init
+ * @ccid3hcrx_hist - Packet history
+ * @ccid3hcrx_li_hist - Loss interval history
+ * @ccid3hcrx_s - Packet size
+ *
+ * NOTE(review): the remaining members (seqno_last_counter, last_counter,
+ * bytes_recv, pinv, elapsed_time) are not documented here because their
+ * use is not visible from this header - confirm against ccid3.c.
+ */
+struct ccid3_hc_rx_sock {
+ struct tfrc_rx_info ccid3hcrx_tfrc;
+#define ccid3hcrx_x_recv ccid3hcrx_tfrc.tfrcrx_x_recv
+#define ccid3hcrx_rtt ccid3hcrx_tfrc.tfrcrx_rtt
+#define ccid3hcrx_p ccid3hcrx_tfrc.tfrcrx_p
+ u64 ccid3hcrx_seqno_last_counter:48,
+ ccid3hcrx_state:8,
+ ccid3hcrx_last_counter:4;
+ u32 ccid3hcrx_bytes_recv;
+ struct timeval ccid3hcrx_tstamp_last_feedback;
+ struct timeval ccid3hcrx_tstamp_last_ack;
+ struct list_head ccid3hcrx_hist;
+ struct list_head ccid3hcrx_li_hist;
+ u16 ccid3hcrx_s;
+ u32 ccid3hcrx_pinv;
+ u32 ccid3hcrx_elapsed_time;
+};
+
+/* Return the CCID3 sender private block attached to @sk */
+static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+
+	return dp->dccps_hc_tx_ccid_private;
+}
+
+/* Return the CCID3 receiver private block attached to @sk */
+static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+
+	return dp->dccps_hc_rx_ccid_private;
+}
+
+#endif /* _DCCP_CCID3_H_ */
diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile
new file mode 100644
index 00000000000..5f940a6cbac
--- /dev/null
+++ b/net/dccp/ccids/lib/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o
+
+dccp_tfrc_lib-y := loss_interval.o packet_history.o tfrc_equation.o
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
new file mode 100644
index 00000000000..4c01a54143a
--- /dev/null
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -0,0 +1,144 @@
+/*
+ * net/dccp/ccids/lib/loss_interval.c
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include "loss_interval.h"
+
+/*
+ * dccp_li_hist_new - create a loss interval history
+ * @name: suffix of the slab cache name ("li_hist_<name>")
+ *
+ * Allocates the history descriptor and a slab cache for its entries.
+ * Returns the new history, or NULL when any allocation fails. The cache
+ * name is kmalloc'ed here and released by dccp_li_hist_delete().
+ */
+struct dccp_li_hist *dccp_li_hist_new(const char *name)
+{
+	static const char dccp_li_hist_mask[] = "li_hist_%s";
+	struct dccp_li_hist *hist;
+	char *slab_name;
+
+	hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
+	if (hist == NULL)
+		return NULL;
+
+	slab_name = kmalloc(strlen(name) + sizeof(dccp_li_hist_mask) - 1,
+			    GFP_ATOMIC);
+	if (slab_name == NULL) {
+		kfree(hist);
+		return NULL;
+	}
+
+	sprintf(slab_name, dccp_li_hist_mask, name);
+	hist->dccplih_slab = kmem_cache_create(slab_name,
+					       sizeof(struct dccp_li_hist_entry),
+					       0, SLAB_HWCACHE_ALIGN,
+					       NULL, NULL);
+	if (hist->dccplih_slab == NULL) {
+		kfree(slab_name);
+		kfree(hist);
+		return NULL;
+	}
+
+	return hist;
+}
+
+EXPORT_SYMBOL_GPL(dccp_li_hist_new);
+
+/*
+ * dccp_li_hist_delete - destroy a history created by dccp_li_hist_new()
+ *
+ * Destroys the slab cache, then frees its name and the descriptor.
+ */
+void dccp_li_hist_delete(struct dccp_li_hist *hist)
+{
+	kmem_cache_t *slab = hist->dccplih_slab;
+	/* Fetch the name before the cache is destroyed */
+	const char *slab_name = kmem_cache_name(slab);
+
+	kmem_cache_destroy(slab);
+	kfree(slab_name);
+	kfree(hist);
+}
+
+EXPORT_SYMBOL_GPL(dccp_li_hist_delete);
+
+/*
+ * dccp_li_hist_purge - unlink every entry of @list and return it to the
+ * slab cache of @hist, leaving @list empty
+ */
+void dccp_li_hist_purge(struct dccp_li_hist *hist, struct list_head *list)
+{
+	while (!list_empty(list)) {
+		struct dccp_li_hist_entry *victim =
+			list_entry(list->next, struct dccp_li_hist_entry,
+				   dccplih_node);
+
+		list_del_init(&victim->dccplih_node);
+		kmem_cache_free(hist->dccplih_slab, victim);
+	}
+}
+
+EXPORT_SYMBOL_GPL(dccp_li_hist_purge);
+
+/* Weights used to calculate loss event rate */
+/*
+ * These are integers as per section 8 of RFC3448. We can then divide by 4
+ * when we use it.
+ */
+static const int dccp_li_hist_w[DCCP_LI_HIST_IVAL_F_LENGTH] = {
+ 4, 4, 4, 4, 3, 2, 1, 1,
+};
+
+/*
+ * dccp_li_hist_calc_i_mean - weighted mean of the loss intervals in @list
+ *
+ * Two weighted sums are built with the dccp_li_hist_w weights: i_tot0 over
+ * the first DCCP_LI_HIST_IVAL_F_LENGTH entries and i_tot1 over the same
+ * number of entries starting at the second one. The larger sum is used.
+ *
+ * Returns 0 unless the walk ends with exactly DCCP_LI_HIST_IVAL_F_LENGTH
+ * entries counted; otherwise returns i_tot * 4 / w_tot.
+ *
+ * NOTE(review): dccp_li_hist_interval_new() creates
+ * DCCP_LI_HIST_IVAL_F_LENGTH + 1 entries. With that many entries the loop
+ * breaks with i == DCCP_LI_HIST_IVAL_F_LENGTH + 1 and this function
+ * returns 0 - confirm that this is intended.
+ */
+u32 dccp_li_hist_calc_i_mean(struct list_head *list)
+{
+ struct dccp_li_hist_entry *li_entry, *li_next;
+ int i = 0;
+ u32 i_tot;
+ u32 i_tot0 = 0;
+ u32 i_tot1 = 0;
+ u32 w_tot = 0;
+
+ list_for_each_entry_safe(li_entry, li_next, list, dccplih_node) {
+ /* Entries 0 .. LENGTH - 1 contribute to i_tot0 and to the weight total */
+ if (i < DCCP_LI_HIST_IVAL_F_LENGTH) {
+ i_tot0 += li_entry->dccplih_interval * dccp_li_hist_w[i];
+ w_tot += dccp_li_hist_w[i];
+ }
+
+ /* Entries 1 .. LENGTH contribute to i_tot1, weights shifted by one */
+ if (i != 0)
+ i_tot1 += li_entry->dccplih_interval * dccp_li_hist_w[i - 1];
+
+ if (++i > DCCP_LI_HIST_IVAL_F_LENGTH)
+ break;
+ }
+
+ /* From here on all weights were added, so w_tot is non-zero */
+ if (i != DCCP_LI_HIST_IVAL_F_LENGTH)
+ return 0;
+
+ i_tot = max(i_tot0, i_tot1);
+
+ /* FIXME: Why do we do this? -Ian McDonald */
+ /* Without this clamp the division below would yield 0 */
+ if (i_tot * 4 < w_tot)
+ i_tot = w_tot * 4;
+
+ return i_tot * 4 / w_tot;
+}
+
+EXPORT_SYMBOL_GPL(dccp_li_hist_calc_i_mean);
+
+/*
+ * dccp_li_hist_interval_new - populate a loss interval history list
+ * @hist: history whose slab cache provides the entries
+ * @list: list receiving the new entries
+ * @seq_loss: sequence number stored in the entry at the head of @list
+ * @win_loss: window counter stored in the entry at the head of @list
+ *
+ * Allocates DCCP_LI_HIST_IVAL_F_LENGTH + 1 entries with SLAB_ATOMIC and
+ * inserts each at the head of @list, so the first one allocated ends up
+ * furthest from the head; that entry is returned. Returns NULL when an
+ * allocation fails, after purging @list.
+ *
+ * NOTE(review): on failure the whole of @list is purged, including any
+ * entries that were on it before the call - presumably callers only pass
+ * an empty list; confirm.
+ * NOTE(review): dccplih_interval is not initialised here for any entry,
+ * and seqno/win_count only for the head entry - confirm that callers fill
+ * these in before dccp_li_hist_calc_i_mean() reads them.
+ */
+struct dccp_li_hist_entry *dccp_li_hist_interval_new(struct dccp_li_hist *hist,
+ struct list_head *list,
+ const u64 seq_loss,
+ const u8 win_loss)
+{
+ struct dccp_li_hist_entry *tail = NULL, *entry;
+ int i;
+
+ for (i = 0; i <= DCCP_LI_HIST_IVAL_F_LENGTH; ++i) {
+ entry = dccp_li_hist_entry_new(hist, SLAB_ATOMIC);
+ if (entry == NULL) {
+ dccp_li_hist_purge(hist, list);
+ return NULL;
+ }
+ /* Remember the first entry allocated */
+ if (tail == NULL)
+ tail = entry;
+ list_add(&entry->dccplih_node, list);
+ }
+
+ /* entry is the last one allocated, i.e. the current head of the list */
+ entry->dccplih_seqno = seq_loss;
+ entry->dccplih_win_count = win_loss;
+ return tail;
+}
+
+EXPORT_SYMBOL_GPL(dccp_li_hist_interval_new);
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
new file mode 100644
index 00000000000..417d9d82df3
--- /dev/null
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -0,0 +1,61 @@
+#ifndef _DCCP_LI_HIST_
+#define _DCCP_LI_HIST_
+/*
+ * net/dccp/ccids/lib/loss_interval.h
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+
+#define DCCP_LI_HIST_IVAL_F_LENGTH 8
+
+/* Loss interval history descriptor: holds the slab cache its entries come from */
+struct dccp_li_hist {
+ kmem_cache_t *dccplih_slab;
+};
+
+extern struct dccp_li_hist *dccp_li_hist_new(const char *name);
+extern void dccp_li_hist_delete(struct dccp_li_hist *hist);
+
+/*
+ * One element of a loss interval history list.
+ * NOTE(review): field meanings below follow the names (48 bit sequence
+ * number, 4 bit window counter, interval length) - confirm against users.
+ */
+struct dccp_li_hist_entry {
+ struct list_head dccplih_node;
+ u64 dccplih_seqno:48,
+ dccplih_win_count:4;
+ u32 dccplih_interval;
+};
+
+/*
+ * Allocate one entry from the slab cache of @hist using allocation flags
+ * @prio; the entry is returned as allocated, without any initialisation.
+ */
+static inline struct dccp_li_hist_entry *
+	dccp_li_hist_entry_new(struct dccp_li_hist *hist,
+			       const gfp_t prio)
+{
+	struct dccp_li_hist_entry *entry;
+
+	entry = kmem_cache_alloc(hist->dccplih_slab, prio);
+	return entry;
+}
+
+/* Return @entry to the slab cache of @hist; a NULL @entry is ignored */
+static inline void dccp_li_hist_entry_delete(struct dccp_li_hist *hist,
+					     struct dccp_li_hist_entry *entry)
+{
+	if (entry == NULL)
+		return;
+
+	kmem_cache_free(hist->dccplih_slab, entry);
+}
+
+extern void dccp_li_hist_purge(struct dccp_li_hist *hist,
+ struct list_head *list);
+
+extern u32 dccp_li_hist_calc_i_mean(struct list_head *list);
+
+extern struct dccp_li_hist_entry *
+ dccp_li_hist_interval_new(struct dccp_li_hist *hist,
+ struct list_head *list,
+ const u64 seq_loss,
+ const u8 win_loss);
+#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
new file mode 100644
index 00000000000..d3f9d205383
--- /dev/null
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -0,0 +1,398 @@
+/*
+ * net/dccp/ccids/lib/packet_history.c
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *
+ * An implementation of the DCCP protocol
+ *
+ * This code has been developed by the University of Waikato WAND
+ * research group. For further information please see http://www.wand.net.nz/
+ * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz
+ *
+ * This code also uses code from Lulea University, rereleased as GPL by its
+ * authors:
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ * and to make it work as a loadable module in the DCCP stack written by
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include "packet_history.h"
+
+/*
+ * dccp_rx_hist_new - create a receive packet history
+ * @name: suffix of the slab cache name ("rx_hist_<name>")
+ *
+ * Allocates the history descriptor and a slab cache for its entries.
+ * Returns the new history, or NULL when any allocation fails. The cache
+ * name is kmalloc'ed here and released by dccp_rx_hist_delete().
+ */
+struct dccp_rx_hist *dccp_rx_hist_new(const char *name)
+{
+	static const char dccp_rx_hist_mask[] = "rx_hist_%s";
+	struct dccp_rx_hist *hist;
+	char *slab_name;
+
+	hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
+	if (hist == NULL)
+		return NULL;
+
+	slab_name = kmalloc(strlen(name) + sizeof(dccp_rx_hist_mask) - 1,
+			    GFP_ATOMIC);
+	if (slab_name == NULL) {
+		kfree(hist);
+		return NULL;
+	}
+
+	sprintf(slab_name, dccp_rx_hist_mask, name);
+	hist->dccprxh_slab = kmem_cache_create(slab_name,
+					       sizeof(struct dccp_rx_hist_entry),
+					       0, SLAB_HWCACHE_ALIGN,
+					       NULL, NULL);
+	if (hist->dccprxh_slab == NULL) {
+		kfree(slab_name);
+		kfree(hist);
+		return NULL;
+	}
+
+	return hist;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_new);
+
+/*
+ * dccp_rx_hist_delete - destroy a history created by dccp_rx_hist_new()
+ *
+ * Destroys the slab cache, then frees its name and the descriptor.
+ */
+void dccp_rx_hist_delete(struct dccp_rx_hist *hist)
+{
+	kmem_cache_t *slab = hist->dccprxh_slab;
+	/* Fetch the name before the cache is destroyed */
+	const char *slab_name = kmem_cache_name(slab);
+
+	kmem_cache_destroy(slab);
+	kfree(slab_name);
+	kfree(hist);
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_delete);
+
+/*
+ * dccp_rx_hist_purge - unlink every entry of @list and return it to the
+ * slab cache of @hist, leaving @list empty
+ */
+void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list)
+{
+	while (!list_empty(list)) {
+		struct dccp_rx_hist_entry *victim =
+			list_entry(list->next, struct dccp_rx_hist_entry,
+				   dccphrx_node);
+
+		list_del_init(&victim->dccphrx_node);
+		dccp_rx_hist_entry_delete(hist, victim);
+	}
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_purge);
+
+/*
+ * Return the first entry of @list, walking from the head, whose type is
+ * DCCP_PKT_DATA or DCCP_PKT_DATAACK, or NULL when there is none.
+ */
+struct dccp_rx_hist_entry *
+	dccp_rx_hist_find_data_packet(const struct list_head *list)
+{
+	struct dccp_rx_hist_entry *candidate;
+
+	list_for_each_entry(candidate, list, dccphrx_node)
+		if (dccp_rx_hist_entry_data_packet(candidate))
+			return candidate;
+
+	return NULL;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet);
+
+/*
+ * dccp_rx_hist_add_packet - insert @packet into the receive history
+ * @hist: history owning the slab cache of the entries
+ * @rx_list: packet history list, newest sequence number at the head
+ * @li_list: loss interval history list, only tested for emptiness
+ * @packet: entry to insert
+ *
+ * Returns 1 when @packet was discarded (and freed) because
+ * TFRC_RECV_NUM_LATE_LOSS data packets ahead of it were already counted
+ * in the history, 0 otherwise. After insertion the history is trimmed.
+ */
+int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
+ struct list_head *rx_list,
+ struct list_head *li_list,
+ struct dccp_rx_hist_entry *packet)
+{
+ struct dccp_rx_hist_entry *entry, *next, *iter;
+ u8 num_later = 0;
+
+ iter = dccp_rx_hist_head(rx_list);
+ if (iter == NULL)
+ dccp_rx_hist_add_entry(rx_list, packet);
+ else {
+ const u64 seqno = packet->dccphrx_seqno;
+
+ /* Newer than the current head: becomes the new head */
+ if (after48(seqno, iter->dccphrx_seqno))
+ dccp_rx_hist_add_entry(rx_list, packet);
+ else {
+ /* num_later counts the data packets passed over so far */
+ if (dccp_rx_hist_entry_data_packet(iter))
+ num_later = 1;
+
+ list_for_each_entry_continue(iter, rx_list,
+ dccphrx_node) {
+ /*
+ * NOTE(review): list_add() links the new node after its
+ * second argument, so packet lands behind iter although
+ * its seqno is newer - confirm the ordering is intended.
+ */
+ if (after48(seqno, iter->dccphrx_seqno)) {
+ dccp_rx_hist_add_entry(&iter->dccphrx_node,
+ packet);
+ goto trim_history;
+ }
+
+ if (dccp_rx_hist_entry_data_packet(iter))
+ num_later++;
+
+ /* Too many later data packets: drop this one */
+ if (num_later == TFRC_RECV_NUM_LATE_LOSS) {
+ dccp_rx_hist_entry_delete(hist, packet);
+ return 1;
+ }
+ }
+
+ if (num_later < TFRC_RECV_NUM_LATE_LOSS)
+ dccp_rx_hist_add_entry(rx_list, packet);
+ /*
+ * FIXME: else what? should we destroy the packet
+ * like above?
+ */
+ }
+ }
+
+trim_history:
+ /*
+ * Trim history (remove all packets after the NUM_LATE_LOSS + 1
+ * data packets)
+ */
+ num_later = TFRC_RECV_NUM_LATE_LOSS + 1;
+
+ if (!list_empty(li_list)) {
+ list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
+ if (num_later == 0) {
+ list_del_init(&entry->dccphrx_node);
+ dccp_rx_hist_entry_delete(hist, entry);
+ } else if (dccp_rx_hist_entry_data_packet(entry))
+ --num_later;
+ }
+ } else {
+ int step = 0;
+ u8 win_count = 0; /* Not needed, but lets shut up gcc */
+ int tmp;
+ /*
+ * We have no loss interval history so we need at least one
+ * rtt:s of data packets to approximate rtt.
+ */
+ list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
+ if (num_later == 0) {
+ /*
+ * step 0/1: skip one more data packet each, step 1
+ * records the reference window counter; step 2:
+ * compare window counters; step 3: delete the rest.
+ */
+ switch (step) {
+ case 0:
+ step = 1;
+ /* OK, find next data packet */
+ num_later = 1;
+ break;
+ case 1:
+ step = 2;
+ /* OK, find next data packet */
+ num_later = 1;
+ win_count = entry->dccphrx_ccval;
+ break;
+ case 2:
+ /* Window counter distance, modulo TFRC_WIN_COUNT_LIMIT */
+ tmp = win_count - entry->dccphrx_ccval;
+ if (tmp < 0)
+ tmp += TFRC_WIN_COUNT_LIMIT;
+ if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) {
+ /*
+ * We have found a packet older
+ * than one rtt remove the rest
+ */
+ step = 3;
+ } else /* OK, find next data packet */
+ num_later = 1;
+ break;
+ case 3:
+ list_del_init(&entry->dccphrx_node);
+ dccp_rx_hist_entry_delete(hist, entry);
+ break;
+ }
+ } else if (dccp_rx_hist_entry_data_packet(entry))
+ --num_later;
+ }
+ }
+
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_add_packet);
+
+/*
+ * dccp_rx_hist_detect_loss - look for a lost data packet in the history
+ * @rx_list: packet history list, walked from the head
+ * @li_list: loss interval history list, only tested for emptiness
+ * @win_loss: receives the window counter of the a_loss entry when a loss
+ *	      is found, 0 otherwise
+ *
+ * b_loss is the entry reached once TFRC_RECV_NUM_LATE_LOSS data packets
+ * have been passed from the head, a_loss the entry reached after one
+ * further data packet. Between the two, the sequence number gap of
+ * neighbouring entries is compared with the difference of their ndp
+ * counts; a mismatch marks a loss.
+ *
+ * Returns the sequence number following the entry where the mismatch was
+ * seen, or DCCP_MAX_SEQNO + 1 when no loss was detected.
+ */
+u64 dccp_rx_hist_detect_loss(struct list_head *rx_list,
+			     struct list_head *li_list, u8 *win_loss)
+{
+	struct dccp_rx_hist_entry *entry, *next, *packet;
+	struct dccp_rx_hist_entry *a_loss = NULL;
+	struct dccp_rx_hist_entry *b_loss = NULL;
+	u64 seq_loss = DCCP_MAX_SEQNO + 1;
+	u8 num_later = TFRC_RECV_NUM_LATE_LOSS;
+
+	list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
+		if (num_later == 0) {
+			b_loss = entry;
+			break;
+		} else if (dccp_rx_hist_entry_data_packet(entry))
+			--num_later;
+	}
+
+	if (b_loss == NULL)
+		goto out;
+
+	num_later = 1;
+	list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) {
+		if (num_later == 0) {
+			a_loss = entry;
+			break;
+		} else if (dccp_rx_hist_entry_data_packet(entry))
+			--num_later;
+	}
+
+	if (a_loss == NULL) {
+		if (list_empty(li_list)) {
+			/* no loss event has occurred yet */
+			LIMIT_NETDEBUG("%s: TODO: find a lost data packet by "
+				       "comparing to initial seqno\n",
+				       __FUNCTION__);
+			goto out;
+		} else {
+			/* Message is newline terminated like the one above */
+			LIMIT_NETDEBUG("%s: Less than 4 data pkts in history!\n",
+				       __FUNCTION__);
+			goto out;
+		}
+	}
+
+	/* Locate a lost data packet */
+	entry = packet = b_loss;
+	list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) {
+		u64 delta = dccp_delta_seqno(entry->dccphrx_seqno,
+					     packet->dccphrx_seqno);
+
+		if (delta != 0) {
+			if (dccp_rx_hist_entry_data_packet(packet))
+				--delta;
+			/*
+			 * FIXME: check this, probably this % usage is because
+			 * in earlier drafts the ndp count was just 8 bits
+			 * long, but now it can be up to 24 bits long.
+			 */
+#if 0
+			if (delta % DCCP_NDP_LIMIT !=
+			    (packet->dccphrx_ndp -
+			     entry->dccphrx_ndp) % DCCP_NDP_LIMIT)
+#endif
+			if (delta != packet->dccphrx_ndp - entry->dccphrx_ndp) {
+				seq_loss = entry->dccphrx_seqno;
+				dccp_inc_seqno(&seq_loss);
+			}
+		}
+		packet = entry;
+		if (packet == a_loss)
+			break;
+	}
+out:
+	/* seq_loss is only changed after a_loss was found, so a_loss is valid */
+	if (seq_loss != DCCP_MAX_SEQNO + 1)
+		*win_loss = a_loss->dccphrx_ccval;
+	else
+		*win_loss = 0; /* Paranoia */
+
+	return seq_loss;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_detect_loss);
+
+/*
+ * dccp_tx_hist_new - create a transmit packet history
+ * @name: suffix of the slab cache name ("tx_hist_<name>")
+ *
+ * Allocates the history descriptor and a slab cache for its entries.
+ * Returns the new history, or NULL when any allocation fails. The cache
+ * name is kmalloc'ed here and released by dccp_tx_hist_delete().
+ */
+struct dccp_tx_hist *dccp_tx_hist_new(const char *name)
+{
+	static const char dccp_tx_hist_mask[] = "tx_hist_%s";
+	struct dccp_tx_hist *hist;
+	char *slab_name;
+
+	hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
+	if (hist == NULL)
+		return NULL;
+
+	slab_name = kmalloc(strlen(name) + sizeof(dccp_tx_hist_mask) - 1,
+			    GFP_ATOMIC);
+	if (slab_name == NULL) {
+		kfree(hist);
+		return NULL;
+	}
+
+	sprintf(slab_name, dccp_tx_hist_mask, name);
+	hist->dccptxh_slab = kmem_cache_create(slab_name,
+					       sizeof(struct dccp_tx_hist_entry),
+					       0, SLAB_HWCACHE_ALIGN,
+					       NULL, NULL);
+	if (hist->dccptxh_slab == NULL) {
+		kfree(slab_name);
+		kfree(hist);
+		return NULL;
+	}
+
+	return hist;
+}
+
+EXPORT_SYMBOL_GPL(dccp_tx_hist_new);
+
+/*
+ * dccp_tx_hist_delete - destroy a history created by dccp_tx_hist_new()
+ *
+ * Destroys the slab cache, then frees its name and the descriptor.
+ */
+void dccp_tx_hist_delete(struct dccp_tx_hist *hist)
+{
+	kmem_cache_t *slab = hist->dccptxh_slab;
+	/* Fetch the name before the cache is destroyed */
+	const char *slab_name = kmem_cache_name(slab);
+
+	kmem_cache_destroy(slab);
+	kfree(slab_name);
+	kfree(hist);
+}
+
+EXPORT_SYMBOL_GPL(dccp_tx_hist_delete);
+
+/*
+ * Return the first entry of @list, walking from the head, whose sequence
+ * number equals @seq, or NULL when there is none.
+ */
+struct dccp_tx_hist_entry *
+	dccp_tx_hist_find_entry(const struct list_head *list, const u64 seq)
+{
+	struct dccp_tx_hist_entry *candidate;
+
+	list_for_each_entry(candidate, list, dccphtx_node)
+		if (candidate->dccphtx_seqno == seq)
+			return candidate;
+
+	return NULL;
+}
+
+EXPORT_SYMBOL_GPL(dccp_tx_hist_find_entry);
+
+/*
+ * dccp_tx_hist_purge_older - remove every entry that follows @packet in
+ * @list and return it to the slab cache of @hist; @packet itself stays
+ */
+void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist,
+			      struct list_head *list,
+			      struct dccp_tx_hist_entry *packet)
+{
+	struct list_head *node = packet->dccphtx_node.next;
+
+	while (node != list) {
+		struct dccp_tx_hist_entry *victim =
+			list_entry(node, struct dccp_tx_hist_entry,
+				   dccphtx_node);
+
+		/* Step forward before the current node is unlinked */
+		node = node->next;
+		list_del_init(&victim->dccphtx_node);
+		dccp_tx_hist_entry_delete(hist, victim);
+	}
+}
+
+EXPORT_SYMBOL_GPL(dccp_tx_hist_purge_older);
+
+/*
+ * dccp_tx_hist_purge - unlink every entry of @list and return it to the
+ * slab cache of @hist, leaving @list empty
+ */
+void dccp_tx_hist_purge(struct dccp_tx_hist *hist, struct list_head *list)
+{
+	while (!list_empty(list)) {
+		struct dccp_tx_hist_entry *victim =
+			list_entry(list->next, struct dccp_tx_hist_entry,
+				   dccphtx_node);
+
+		list_del_init(&victim->dccphtx_node);
+		dccp_tx_hist_entry_delete(hist, victim);
+	}
+}
+
+EXPORT_SYMBOL_GPL(dccp_tx_hist_purge);
+
+MODULE_AUTHOR("Ian McDonald <iam4@cs.waikato.ac.nz>, "
+ "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
+MODULE_DESCRIPTION("DCCP TFRC library");
+MODULE_LICENSE("GPL");
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
new file mode 100644
index 00000000000..122e96737ff
--- /dev/null
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -0,0 +1,200 @@
+/*
+ * net/dccp/ccids/lib/packet_history.h
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *
+ * An implementation of the DCCP protocol
+ *
+ * This code has been developed by the University of Waikato WAND
+ * research group. For further information please see http://www.wand.net.nz/
+ * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz
+ *
+ * This code also uses code from Lulea University, rereleased as GPL by its
+ * authors:
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ * and to make it work as a loadable module in the DCCP stack written by
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DCCP_PKT_HIST_
+#define _DCCP_PKT_HIST_
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+
+#include "../../dccp.h"
+
+/* Number of later packets received before one is considered lost */
+#define TFRC_RECV_NUM_LATE_LOSS 3
+
+#define TFRC_WIN_COUNT_PER_RTT 4
+#define TFRC_WIN_COUNT_LIMIT 16
+
+/*
+ * One element of a transmit packet history list.
+ * dccphtx_seqno is the key used by dccp_tx_hist_find_entry() and
+ * dccphtx_sent is cleared by dccp_tx_hist_entry_new().
+ * NOTE(review): ccval, rtt and tstamp are presumably the window counter,
+ * RTT sample and send timestamp of the packet - confirm against ccid3.c.
+ */
+struct dccp_tx_hist_entry {
+ struct list_head dccphtx_node;
+ u64 dccphtx_seqno:48,
+ dccphtx_ccval:4,
+ dccphtx_sent:1;
+ u32 dccphtx_rtt;
+ struct timeval dccphtx_tstamp;
+};
+
+/*
+ * One element of a receive packet history list, filled in by
+ * dccp_rx_hist_entry_new(): seqno from the skb control block, ccval and
+ * type from the DCCP header, ndp from the caller and tstamp from
+ * dccp_timestamp().
+ */
+struct dccp_rx_hist_entry {
+ struct list_head dccphrx_node;
+ u64 dccphrx_seqno:48,
+ dccphrx_ccval:4,
+ dccphrx_type:4;
+ u32 dccphrx_ndp; /* In fact it is from 8 to 24 bits */
+ struct timeval dccphrx_tstamp;
+};
+
+/* Transmit history descriptor: holds the slab cache its entries come from */
+struct dccp_tx_hist {
+ kmem_cache_t *dccptxh_slab;
+};
+
+extern struct dccp_tx_hist *dccp_tx_hist_new(const char *name);
+extern void dccp_tx_hist_delete(struct dccp_tx_hist *hist);
+
+/* Receive history descriptor: holds the slab cache its entries come from */
+struct dccp_rx_hist {
+ kmem_cache_t *dccprxh_slab;
+};
+
+extern struct dccp_rx_hist *dccp_rx_hist_new(const char *name);
+extern void dccp_rx_hist_delete(struct dccp_rx_hist *hist);
+extern struct dccp_rx_hist_entry *
+ dccp_rx_hist_find_data_packet(const struct list_head *list);
+
+/*
+ * Allocate a transmit history entry from the slab cache of @hist using
+ * allocation flags @prio. Only dccphtx_sent is initialised (to 0).
+ * Returns NULL when the allocation fails.
+ */
+static inline struct dccp_tx_hist_entry *
+	dccp_tx_hist_entry_new(struct dccp_tx_hist *hist,
+			       const gfp_t prio)
+{
+	struct dccp_tx_hist_entry *entry;
+
+	entry = kmem_cache_alloc(hist->dccptxh_slab, prio);
+	if (entry == NULL)
+		return NULL;
+
+	entry->dccphtx_sent = 0;
+	return entry;
+}
+
+/* Return @entry to the slab cache of @hist; a NULL @entry is ignored */
+static inline void dccp_tx_hist_entry_delete(struct dccp_tx_hist *hist,
+					     struct dccp_tx_hist_entry *entry)
+{
+	if (entry == NULL)
+		return;
+
+	kmem_cache_free(hist->dccptxh_slab, entry);
+}
+
+extern struct dccp_tx_hist_entry *
+ dccp_tx_hist_find_entry(const struct list_head *list,
+ const u64 seq);
+
+/* Link @entry into the history right after @list, i.e. at its head */
+static inline void dccp_tx_hist_add_entry(struct list_head *list,
+					  struct dccp_tx_hist_entry *entry)
+{
+	struct list_head *node = &entry->dccphtx_node;
+
+	list_add(node, list);
+}
+
+extern void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist,
+ struct list_head *list,
+ struct dccp_tx_hist_entry *next);
+
+extern void dccp_tx_hist_purge(struct dccp_tx_hist *hist,
+ struct list_head *list);
+
+/* Return the first entry of @list, or NULL when the list is empty */
+static inline struct dccp_tx_hist_entry *
+	dccp_tx_hist_head(struct list_head *list)
+{
+	if (list_empty(list))
+		return NULL;
+
+	return list_entry(list->next, struct dccp_tx_hist_entry,
+			  dccphtx_node);
+}
+
+/*
+ * Allocate a receive history entry from the slab cache of @hist using
+ * allocation flags @prio and describe the packet in @skb with it:
+ * sequence number from the skb control block, ccval and type from the
+ * DCCP header, @ndp as given and a timestamp taken via dccp_timestamp().
+ * Returns NULL when the allocation fails.
+ */
+static inline struct dccp_rx_hist_entry *
+	dccp_rx_hist_entry_new(struct dccp_rx_hist *hist,
+			       const struct sock *sk,
+			       const u32 ndp,
+			       const struct sk_buff *skb,
+			       const gfp_t prio)
+{
+	const struct dccp_hdr *dh;
+	struct dccp_rx_hist_entry *entry;
+
+	entry = kmem_cache_alloc(hist->dccprxh_slab, prio);
+	if (entry == NULL)
+		return NULL;
+
+	dh = dccp_hdr(skb);
+
+	entry->dccphrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+	entry->dccphrx_ccval = dh->dccph_ccval;
+	entry->dccphrx_type = dh->dccph_type;
+	entry->dccphrx_ndp = ndp;
+	dccp_timestamp(sk, &entry->dccphrx_tstamp);
+
+	return entry;
+}
+
+/* Return @entry to the slab cache of @hist; a NULL @entry is ignored */
+static inline void dccp_rx_hist_entry_delete(struct dccp_rx_hist *hist,
+					     struct dccp_rx_hist_entry *entry)
+{
+	if (entry == NULL)
+		return;
+
+	kmem_cache_free(hist->dccprxh_slab, entry);
+}
+
+extern void dccp_rx_hist_purge(struct dccp_rx_hist *hist,
+ struct list_head *list);
+
+/* Link @entry into the history right after the node @list */
+static inline void dccp_rx_hist_add_entry(struct list_head *list,
+					  struct dccp_rx_hist_entry *entry)
+{
+	struct list_head *node = &entry->dccphrx_node;
+
+	list_add(node, list);
+}
+
+/* Return the first entry of @list, or NULL when the list is empty */
+static inline struct dccp_rx_hist_entry *
+	dccp_rx_hist_head(struct list_head *list)
+{
+	if (list_empty(list))
+		return NULL;
+
+	return list_entry(list->next, struct dccp_rx_hist_entry,
+			  dccphrx_node);
+}
+
+/* Non-zero when @entry records a DCCP_PKT_DATA or DCCP_PKT_DATAACK packet */
+static inline int
+	dccp_rx_hist_entry_data_packet(const struct dccp_rx_hist_entry *entry)
+{
+	const unsigned int type = entry->dccphrx_type;
+
+	return type == DCCP_PKT_DATA || type == DCCP_PKT_DATAACK;
+}
+
+extern int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
+ struct list_head *rx_list,
+ struct list_head *li_list,
+ struct dccp_rx_hist_entry *packet);
+
+extern u64 dccp_rx_hist_detect_loss(struct list_head *rx_list,
+ struct list_head *li_list, u8 *win_loss);
+
+#endif /* _DCCP_PKT_HIST_ */
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
new file mode 100644
index 00000000000..130c4c40cfe
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -0,0 +1,22 @@
+#ifndef _TFRC_H_
+#define _TFRC_H_
+/*
+ * net/dccp/ccids/lib/tfrc.h
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/types.h>
+
+extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
+extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
+
+#endif /* _TFRC_H_ */
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
new file mode 100644
index 00000000000..d2b5933b451
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -0,0 +1,644 @@
+/*
+ * net/dccp/ccids/lib/tfrc_equation.c
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <asm/bug.h>
+#include <asm/div64.h>
+
+#include "tfrc.h"
+
+/* Number of rows in tfrc_calc_x_lookup[] */
+#define TFRC_CALC_X_ARRSIZE 500
+
+/*
+ * Loss rate at which the lookup switches columns: tfrc_calc_x() reads
+ * column [1] of the table for p below this value and column [0] for p at
+ * or above it. p is scaled by 1,000,000, so 50000 is
+ */
+#define TFRC_CALC_X_SPLIT 50000
+/* equivalent to 0.05 */
+
+static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
+ { 37172, 8172 },
+ { 53499, 11567 },
+ { 66664, 14180 },
+ { 78298, 16388 },
+ { 89021, 18339 },
+ { 99147, 20108 },
+ { 108858, 21738 },
+ { 118273, 23260 },
+ { 127474, 24693 },
+ { 136520, 26052 },
+ { 145456, 27348 },
+ { 154316, 28589 },
+ { 163130, 29783 },
+ { 171919, 30935 },
+ { 180704, 32049 },
+ { 189502, 33130 },
+ { 198328, 34180 },
+ { 207194, 35202 },
+ { 216114, 36198 },
+ { 225097, 37172 },
+ { 234153, 38123 },
+ { 243294, 39055 },
+ { 252527, 39968 },
+ { 261861, 40864 },
+ { 271305, 41743 },
+ { 280866, 42607 },
+ { 290553, 43457 },
+ { 300372, 44293 },
+ { 310333, 45117 },
+ { 320441, 45929 },
+ { 330705, 46729 },
+ { 341131, 47518 },
+ { 351728, 48297 },
+ { 362501, 49066 },
+ { 373460, 49826 },
+ { 384609, 50577 },
+ { 395958, 51320 },
+ { 407513, 52054 },
+ { 419281, 52780 },
+ { 431270, 53499 },
+ { 443487, 54211 },
+ { 455940, 54916 },
+ { 468635, 55614 },
+ { 481581, 56306 },
+ { 494785, 56991 },
+ { 508254, 57671 },
+ { 521996, 58345 },
+ { 536019, 59014 },
+ { 550331, 59677 },
+ { 564939, 60335 },
+ { 579851, 60988 },
+ { 595075, 61636 },
+ { 610619, 62279 },
+ { 626491, 62918 },
+ { 642700, 63553 },
+ { 659253, 64183 },
+ { 676158, 64809 },
+ { 693424, 65431 },
+ { 711060, 66050 },
+ { 729073, 66664 },
+ { 747472, 67275 },
+ { 766266, 67882 },
+ { 785464, 68486 },
+ { 805073, 69087 },
+ { 825103, 69684 },
+ { 845562, 70278 },
+ { 866460, 70868 },
+ { 887805, 71456 },
+ { 909606, 72041 },
+ { 931873, 72623 },
+ { 954614, 73202 },
+ { 977839, 73778 },
+ { 1001557, 74352 },
+ { 1025777, 74923 },
+ { 1050508, 75492 },
+ { 1075761, 76058 },
+ { 1101544, 76621 },
+ { 1127867, 77183 },
+ { 1154739, 77741 },
+ { 1182172, 78298 },
+ { 1210173, 78852 },
+ { 1238753, 79405 },
+ { 1267922, 79955 },
+ { 1297689, 80503 },
+ { 1328066, 81049 },
+ { 1359060, 81593 },
+ { 1390684, 82135 },
+ { 1422947, 82675 },
+ { 1455859, 83213 },
+ { 1489430, 83750 },
+ { 1523671, 84284 },
+ { 1558593, 84817 },
+ { 1594205, 85348 },
+ { 1630518, 85878 },
+ { 1667543, 86406 },
+ { 1705290, 86932 },
+ { 1743770, 87457 },
+ { 1782994, 87980 },
+ { 1822973, 88501 },
+ { 1863717, 89021 },
+ { 1905237, 89540 },
+ { 1947545, 90057 },
+ { 1990650, 90573 },
+ { 2034566, 91087 },
+ { 2079301, 91600 },
+ { 2124869, 92111 },
+ { 2171279, 92622 },
+ { 2218543, 93131 },
+ { 2266673, 93639 },
+ { 2315680, 94145 },
+ { 2365575, 94650 },
+ { 2416371, 95154 },
+ { 2468077, 95657 },
+ { 2520707, 96159 },
+ { 2574271, 96660 },
+ { 2628782, 97159 },
+ { 2684250, 97658 },
+ { 2740689, 98155 },
+ { 2798110, 98651 },
+ { 2856524, 99147 },
+ { 2915944, 99641 },
+ { 2976382, 100134 },
+ { 3037850, 100626 },
+ { 3100360, 101117 },
+ { 3163924, 101608 },
+ { 3228554, 102097 },
+ { 3294263, 102586 },
+ { 3361063, 103073 },
+ { 3428966, 103560 },
+ { 3497984, 104045 },
+ { 3568131, 104530 },
+ { 3639419, 105014 },
+ { 3711860, 105498 },
+ { 3785467, 105980 },
+ { 3860253, 106462 },
+ { 3936229, 106942 },
+ { 4013410, 107422 },
+ { 4091808, 107902 },
+ { 4171435, 108380 },
+ { 4252306, 108858 },
+ { 4334431, 109335 },
+ { 4417825, 109811 },
+ { 4502501, 110287 },
+ { 4588472, 110762 },
+ { 4675750, 111236 },
+ { 4764349, 111709 },
+ { 4854283, 112182 },
+ { 4945564, 112654 },
+ { 5038206, 113126 },
+ { 5132223, 113597 },
+ { 5227627, 114067 },
+ { 5324432, 114537 },
+ { 5422652, 115006 },
+ { 5522299, 115474 },
+ { 5623389, 115942 },
+ { 5725934, 116409 },
+ { 5829948, 116876 },
+ { 5935446, 117342 },
+ { 6042439, 117808 },
+ { 6150943, 118273 },
+ { 6260972, 118738 },
+ { 6372538, 119202 },
+ { 6485657, 119665 },
+ { 6600342, 120128 },
+ { 6716607, 120591 },
+ { 6834467, 121053 },
+ { 6953935, 121514 },
+ { 7075025, 121976 },
+ { 7197752, 122436 },
+ { 7322131, 122896 },
+ { 7448175, 123356 },
+ { 7575898, 123815 },
+ { 7705316, 124274 },
+ { 7836442, 124733 },
+ { 7969291, 125191 },
+ { 8103877, 125648 },
+ { 8240216, 126105 },
+ { 8378321, 126562 },
+ { 8518208, 127018 },
+ { 8659890, 127474 },
+ { 8803384, 127930 },
+ { 8948702, 128385 },
+ { 9095861, 128840 },
+ { 9244875, 129294 },
+ { 9395760, 129748 },
+ { 9548529, 130202 },
+ { 9703198, 130655 },
+ { 9859782, 131108 },
+ { 10018296, 131561 },
+ { 10178755, 132014 },
+ { 10341174, 132466 },
+ { 10505569, 132917 },
+ { 10671954, 133369 },
+ { 10840345, 133820 },
+ { 11010757, 134271 },
+ { 11183206, 134721 },
+ { 11357706, 135171 },
+ { 11534274, 135621 },
+ { 11712924, 136071 },
+ { 11893673, 136520 },
+ { 12076536, 136969 },
+ { 12261527, 137418 },
+ { 12448664, 137867 },
+ { 12637961, 138315 },
+ { 12829435, 138763 },
+ { 13023101, 139211 },
+ { 13218974, 139658 },
+ { 13417071, 140106 },
+ { 13617407, 140553 },
+ { 13819999, 140999 },
+ { 14024862, 141446 },
+ { 14232012, 141892 },
+ { 14441465, 142339 },
+ { 14653238, 142785 },
+ { 14867346, 143230 },
+ { 15083805, 143676 },
+ { 15302632, 144121 },
+ { 15523842, 144566 },
+ { 15747453, 145011 },
+ { 15973479, 145456 },
+ { 16201939, 145900 },
+ { 16432847, 146345 },
+ { 16666221, 146789 },
+ { 16902076, 147233 },
+ { 17140429, 147677 },
+ { 17381297, 148121 },
+ { 17624696, 148564 },
+ { 17870643, 149007 },
+ { 18119154, 149451 },
+ { 18370247, 149894 },
+ { 18623936, 150336 },
+ { 18880241, 150779 },
+ { 19139176, 151222 },
+ { 19400759, 151664 },
+ { 19665007, 152107 },
+ { 19931936, 152549 },
+ { 20201564, 152991 },
+ { 20473907, 153433 },
+ { 20748982, 153875 },
+ { 21026807, 154316 },
+ { 21307399, 154758 },
+ { 21590773, 155199 },
+ { 21876949, 155641 },
+ { 22165941, 156082 },
+ { 22457769, 156523 },
+ { 22752449, 156964 },
+ { 23049999, 157405 },
+ { 23350435, 157846 },
+ { 23653774, 158287 },
+ { 23960036, 158727 },
+ { 24269236, 159168 },
+ { 24581392, 159608 },
+ { 24896521, 160049 },
+ { 25214642, 160489 },
+ { 25535772, 160929 },
+ { 25859927, 161370 },
+ { 26187127, 161810 },
+ { 26517388, 162250 },
+ { 26850728, 162690 },
+ { 27187165, 163130 },
+ { 27526716, 163569 },
+ { 27869400, 164009 },
+ { 28215234, 164449 },
+ { 28564236, 164889 },
+ { 28916423, 165328 },
+ { 29271815, 165768 },
+ { 29630428, 166208 },
+ { 29992281, 166647 },
+ { 30357392, 167087 },
+ { 30725779, 167526 },
+ { 31097459, 167965 },
+ { 31472452, 168405 },
+ { 31850774, 168844 },
+ { 32232445, 169283 },
+ { 32617482, 169723 },
+ { 33005904, 170162 },
+ { 33397730, 170601 },
+ { 33792976, 171041 },
+ { 34191663, 171480 },
+ { 34593807, 171919 },
+ { 34999428, 172358 },
+ { 35408544, 172797 },
+ { 35821174, 173237 },
+ { 36237335, 173676 },
+ { 36657047, 174115 },
+ { 37080329, 174554 },
+ { 37507197, 174993 },
+ { 37937673, 175433 },
+ { 38371773, 175872 },
+ { 38809517, 176311 },
+ { 39250924, 176750 },
+ { 39696012, 177190 },
+ { 40144800, 177629 },
+ { 40597308, 178068 },
+ { 41053553, 178507 },
+ { 41513554, 178947 },
+ { 41977332, 179386 },
+ { 42444904, 179825 },
+ { 42916290, 180265 },
+ { 43391509, 180704 },
+ { 43870579, 181144 },
+ { 44353520, 181583 },
+ { 44840352, 182023 },
+ { 45331092, 182462 },
+ { 45825761, 182902 },
+ { 46324378, 183342 },
+ { 46826961, 183781 },
+ { 47333531, 184221 },
+ { 47844106, 184661 },
+ { 48358706, 185101 },
+ { 48877350, 185541 },
+ { 49400058, 185981 },
+ { 49926849, 186421 },
+ { 50457743, 186861 },
+ { 50992759, 187301 },
+ { 51531916, 187741 },
+ { 52075235, 188181 },
+ { 52622735, 188622 },
+ { 53174435, 189062 },
+ { 53730355, 189502 },
+ { 54290515, 189943 },
+ { 54854935, 190383 },
+ { 55423634, 190824 },
+ { 55996633, 191265 },
+ { 56573950, 191706 },
+ { 57155606, 192146 },
+ { 57741621, 192587 },
+ { 58332014, 193028 },
+ { 58926806, 193470 },
+ { 59526017, 193911 },
+ { 60129666, 194352 },
+ { 60737774, 194793 },
+ { 61350361, 195235 },
+ { 61967446, 195677 },
+ { 62589050, 196118 },
+ { 63215194, 196560 },
+ { 63845897, 197002 },
+ { 64481179, 197444 },
+ { 65121061, 197886 },
+ { 65765563, 198328 },
+ { 66414705, 198770 },
+ { 67068508, 199213 },
+ { 67726992, 199655 },
+ { 68390177, 200098 },
+ { 69058085, 200540 },
+ { 69730735, 200983 },
+ { 70408147, 201426 },
+ { 71090343, 201869 },
+ { 71777343, 202312 },
+ { 72469168, 202755 },
+ { 73165837, 203199 },
+ { 73867373, 203642 },
+ { 74573795, 204086 },
+ { 75285124, 204529 },
+ { 76001380, 204973 },
+ { 76722586, 205417 },
+ { 77448761, 205861 },
+ { 78179926, 206306 },
+ { 78916102, 206750 },
+ { 79657310, 207194 },
+ { 80403571, 207639 },
+ { 81154906, 208084 },
+ { 81911335, 208529 },
+ { 82672880, 208974 },
+ { 83439562, 209419 },
+ { 84211402, 209864 },
+ { 84988421, 210309 },
+ { 85770640, 210755 },
+ { 86558080, 211201 },
+ { 87350762, 211647 },
+ { 88148708, 212093 },
+ { 88951938, 212539 },
+ { 89760475, 212985 },
+ { 90574339, 213432 },
+ { 91393551, 213878 },
+ { 92218133, 214325 },
+ { 93048107, 214772 },
+ { 93883493, 215219 },
+ { 94724314, 215666 },
+ { 95570590, 216114 },
+ { 96422343, 216561 },
+ { 97279594, 217009 },
+ { 98142366, 217457 },
+ { 99010679, 217905 },
+ { 99884556, 218353 },
+ { 100764018, 218801 },
+ { 101649086, 219250 },
+ { 102539782, 219698 },
+ { 103436128, 220147 },
+ { 104338146, 220596 },
+ { 105245857, 221046 },
+ { 106159284, 221495 },
+ { 107078448, 221945 },
+ { 108003370, 222394 },
+ { 108934074, 222844 },
+ { 109870580, 223294 },
+ { 110812910, 223745 },
+ { 111761087, 224195 },
+ { 112715133, 224646 },
+ { 113675069, 225097 },
+ { 114640918, 225548 },
+ { 115612702, 225999 },
+ { 116590442, 226450 },
+ { 117574162, 226902 },
+ { 118563882, 227353 },
+ { 119559626, 227805 },
+ { 120561415, 228258 },
+ { 121569272, 228710 },
+ { 122583219, 229162 },
+ { 123603278, 229615 },
+ { 124629471, 230068 },
+ { 125661822, 230521 },
+ { 126700352, 230974 },
+ { 127745083, 231428 },
+ { 128796039, 231882 },
+ { 129853241, 232336 },
+ { 130916713, 232790 },
+ { 131986475, 233244 },
+ { 133062553, 233699 },
+ { 134144966, 234153 },
+ { 135233739, 234608 },
+ { 136328894, 235064 },
+ { 137430453, 235519 },
+ { 138538440, 235975 },
+ { 139652876, 236430 },
+ { 140773786, 236886 },
+ { 141901190, 237343 },
+ { 143035113, 237799 },
+ { 144175576, 238256 },
+ { 145322604, 238713 },
+ { 146476218, 239170 },
+ { 147636442, 239627 },
+ { 148803298, 240085 },
+ { 149976809, 240542 },
+ { 151156999, 241000 },
+ { 152343890, 241459 },
+ { 153537506, 241917 },
+ { 154737869, 242376 },
+ { 155945002, 242835 },
+ { 157158929, 243294 },
+ { 158379673, 243753 },
+ { 159607257, 244213 },
+ { 160841704, 244673 },
+ { 162083037, 245133 },
+ { 163331279, 245593 },
+ { 164586455, 246054 },
+ { 165848586, 246514 },
+ { 167117696, 246975 },
+ { 168393810, 247437 },
+ { 169676949, 247898 },
+ { 170967138, 248360 },
+ { 172264399, 248822 },
+ { 173568757, 249284 },
+ { 174880235, 249747 },
+ { 176198856, 250209 },
+ { 177524643, 250672 },
+ { 178857621, 251136 },
+ { 180197813, 251599 },
+ { 181545242, 252063 },
+ { 182899933, 252527 },
+ { 184261908, 252991 },
+ { 185631191, 253456 },
+ { 187007807, 253920 },
+ { 188391778, 254385 },
+ { 189783129, 254851 },
+ { 191181884, 255316 },
+ { 192588065, 255782 },
+ { 194001698, 256248 },
+ { 195422805, 256714 },
+ { 196851411, 257181 },
+ { 198287540, 257648 },
+ { 199731215, 258115 },
+ { 201182461, 258582 },
+ { 202641302, 259050 },
+ { 204107760, 259518 },
+ { 205581862, 259986 },
+ { 207063630, 260454 },
+ { 208553088, 260923 },
+ { 210050262, 261392 },
+ { 211555174, 261861 },
+ { 213067849, 262331 },
+ { 214588312, 262800 },
+ { 216116586, 263270 },
+ { 217652696, 263741 },
+ { 219196666, 264211 },
+ { 220748520, 264682 },
+ { 222308282, 265153 },
+ { 223875978, 265625 },
+ { 225451630, 266097 },
+ { 227035265, 266569 },
+ { 228626905, 267041 },
+ { 230226576, 267514 },
+ { 231834302, 267986 },
+ { 233450107, 268460 },
+ { 235074016, 268933 },
+ { 236706054, 269407 },
+ { 238346244, 269881 },
+ { 239994613, 270355 },
+ { 241651183, 270830 },
+ { 243315981, 271305 }
+};
+
+/* Calculate the send rate as per section 3.1 of RFC3448
+
+Returns send rate in bytes per second
+
+Integer maths and table lookups are used because floating point is not allowed in the kernel
+
+The function for Xcalc as per section 3.1 of RFC3448 is:
+
+X = s
+ -------------------------------------------------------------
+ R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2)))
+
+where
+X is the transmit rate in bytes/second
+s is the packet size in bytes
+R is the round trip time in seconds
+p is the loss event rate, between 0 and 1.0, of the number of loss events
+ as a fraction of the number of packets transmitted
+t_RTO is the TCP retransmission timeout value in seconds
+b is the number of packets acknowledged by a single TCP acknowledgement
+
+we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes:
+
+X = s
+ -----------------------------------------------------------------------
+ R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2)))
+
+
+which we can break down into:
+
+X = s
+ --------
+ R * f(p)
+
+where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p))
+
+Function parameters:
+s - bytes
+R - RTT in usecs
+p - loss rate (decimal fraction multiplied by 1,000,000)
+
+Returns Xcalc in bytes per second
+
+DON'T alter this code unless you run test cases against it as the code
+has been manipulated to stop underflow/overflow.
+
+*/
+u32 tfrc_calc_x(u16 s, u32 R, u32 p)
+{
+	int index;
+	u32 f;
+	u64 tmp1, tmp2;
+
+	/* Map p onto a table row; each column has its own step size */
+	if (p < TFRC_CALC_X_SPLIT)
+		index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1;
+	else
+		index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1;
+
+	if (index < 0)
+		/* p is smaller than one table step: use the first row */
+		index = 0;
+
+	if (R == 0)
+		R = 1; /* RTT can't be zero or else divide by zero */
+
+	BUG_ON(index >= TFRC_CALC_X_ARRSIZE);
+
+	if (p >= TFRC_CALC_X_SPLIT)
+		f = tfrc_calc_x_lookup[index][0];
+	else
+		f = tfrc_calc_x_lookup[index][1];
+
+	tmp1 = ((u64)s * 100000000);
+	tmp2 = ((u64)R * (u64)f);
+	do_div(tmp2, 10000);
+
+	/*
+	 * R * f can be below 10000 (e.g. R == 1 with a small f), which would
+	 * leave a zero divisor for the division below.
+	 */
+	if (tmp2 == 0)
+		tmp2 = 1;
+
+	/*
+	 * do_div() takes a 32 bit divisor, but R * f / 10000 can need more
+	 * than 32 bits. Halve dividend and divisor together until the
+	 * divisor fits; the quotient is preserved up to rounding.
+	 */
+	while (tmp2 > 0xffffffffULL) {
+		tmp1 >>= 1;
+		tmp2 >>= 1;
+	}
+	do_div(tmp1, (u32)tmp2);
+	/* Don't alter above math unless you test due to overflow on 32 bit */
+
+	/* Saturate instead of silently truncating the rate to 32 bits */
+	if (tmp1 > 0xffffffffULL)
+		return 0xffffffff;
+
+	return (u32)tmp1;
+}
+
+EXPORT_SYMBOL_GPL(tfrc_calc_x);
+
+/*
+ * args: fvalue - function value to match
+ * returns: p closest to that value
+ *
+ * both fvalue and p are multiplied by 1,000,000 to use ints
+ */
+u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
+{
+	int ctr = 0;
+	int small;
+
+	/* Below the smallest tabulated f(p): report a loss rate of zero */
+	if (fvalue < tfrc_calc_x_lookup[0][1])
+		return 0;
+
+	/* Pick the column whose value range contains fvalue */
+	if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1])
+		small = 1;
+	else if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0])
+		return 1000000;
+	else
+		small = 0;
+
+	/*
+	 * First row whose entry is >= fvalue. The checks above guarantee such
+	 * a row exists, so ctr stays inside the table.
+	 */
+	while (fvalue > tfrc_calc_x_lookup[ctr][small])
+		ctr++;
+
+	/*
+	 * Row ctr holds f(p) for p = (ctr + 1) * step: tfrc_calc_x() maps p to
+	 * a row with index = p / step - 1. Hence ctr + 1 steps, not ctr.
+	 */
+	if (small)
+		return TFRC_CALC_X_SPLIT * (ctr + 1) / TFRC_CALC_X_ARRSIZE;
+	else
+		return 1000000 * (ctr + 1) / TFRC_CALC_X_ARRSIZE;
+}
+
+EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
new file mode 100644
index 00000000000..f97b85d55ad
--- /dev/null
+++ b/net/dccp/dccp.h
@@ -0,0 +1,425 @@
+#ifndef _DCCP_H
+#define _DCCP_H
+/*
+ * net/dccp/dccp.h
+ *
+ * An implementation of the DCCP protocol
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <net/snmp.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include "ackvec.h"
+
+/*
+ * Debug output helpers. With CONFIG_IP_DCCP_DEBUG set, both macros call
+ * printk() only while dccp_debug is non-zero; dccp_pr_debug() also prefixes
+ * the message with KERN_DEBUG and the name of the calling function.
+ * Without CONFIG_IP_DCCP_DEBUG both macros expand to nothing.
+ */
+#ifdef CONFIG_IP_DCCP_DEBUG
+extern int dccp_debug;
+
+#define dccp_pr_debug(format, a...) \
+ do { if (dccp_debug) \
+ printk(KERN_DEBUG "%s: " format, __FUNCTION__ , ##a); \
+ } while (0)
+#define dccp_pr_debug_cat(format, a...) do { if (dccp_debug) \
+ printk(format, ##a); } while (0)
+#else
+#define dccp_pr_debug(format, a...)
+#define dccp_pr_debug_cat(format, a...)
+#endif
+
+extern struct inet_hashinfo dccp_hashinfo;
+
+extern atomic_t dccp_orphan_count;
+extern int dccp_tw_count;
+extern void dccp_tw_deschedule(struct inet_timewait_sock *tw);
+
+extern void dccp_time_wait(struct sock *sk, int state, int timeo);
+
+/* FIXME: Right size this */
+#define DCCP_MAX_OPT_LEN 128
+
+#define DCCP_MAX_PACKET_HDR 32
+
+#define MAX_DCCP_HEADER (DCCP_MAX_PACKET_HDR + DCCP_MAX_OPT_LEN + MAX_HEADER)
+
+#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
+ * state, about 60 seconds */
+
+/* draft-ietf-dccp-spec-11.txt initial RTO value */
+#define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ))
+
+/* Maximal interval between probes for local resources. */
+#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U))
+
+#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */
+
+extern struct proto dccp_v4_prot;
+
+/*
+ * is seq1 < seq2 ? Both values are shifted up by 16 bits first so that the
+ * sign of the 64 bit difference decides.
+ */
+static inline int before48(const u64 seq1, const u64 seq2)
+{
+	const u64 lhs = seq1 << 16;
+	const u64 rhs = seq2 << 16;
+
+	return (s64)(lhs - rhs) < 0;
+}
+
+/*
+ * is seq1 > seq2 ? Decided by the sign of (seq2 - seq1) after shifting both
+ * values up by 16 bits.
+ */
+static inline int after48(const u64 seq1, const u64 seq2)
+{
+	const u64 lhs = seq1 << 16;
+	const u64 rhs = seq2 << 16;
+
+	return (s64)(rhs - lhs) < 0;
+}
+
+/*
+ * is seq2 <= seq1 <= seq3 ? Compares the unsigned distances of seq3 and
+ * seq1 from seq2, all shifted up by 16 bits.
+ */
+static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3)
+{
+	const u64 base = seq2 << 16;
+	const u64 span = (seq3 << 16) - base;
+	const u64 offset = (seq1 << 16) - base;
+
+	return span >= offset;
+}
+
+/* Return seq1 if after48() says it is the later one, otherwise seq2. */
+static inline u64 max48(const u64 seq1, const u64 seq2)
+{
+	if (after48(seq1, seq2))
+		return seq1;
+
+	return seq2;
+}
+
+/*
+ * Counter identifiers; they are the 'field' arguments handed to the
+ * DCCP_*_STATS() macros. __DCCP_MIB_MAX is not a counter: it is the number
+ * of identifiers and sizes dccp_mib.mibs[].
+ */
+enum {
+ DCCP_MIB_NUM = 0,
+ DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */
+ DCCP_MIB_ESTABRESETS, /* EstabResets */
+ DCCP_MIB_CURRESTAB, /* CurrEstab */
+ DCCP_MIB_OUTSEGS, /* OutSegs */
+ DCCP_MIB_OUTRSTS,
+ DCCP_MIB_ABORTONTIMEOUT,
+ DCCP_MIB_TIMEOUTS,
+ DCCP_MIB_ABORTFAILED,
+ DCCP_MIB_PASSIVEOPENS,
+ DCCP_MIB_ATTEMPTFAILS,
+ DCCP_MIB_OUTDATAGRAMS,
+ DCCP_MIB_INERRS,
+ DCCP_MIB_OPTMANDATORYERROR,
+ DCCP_MIB_INVALIDOPT,
+ __DCCP_MIB_MAX
+};
+
+#define DCCP_MIB_MAX __DCCP_MIB_MAX
+/* One unsigned long slot per identifier in the enum above */
+struct dccp_mib {
+ unsigned long mibs[DCCP_MIB_MAX];
+} __SNMP_MIB_ALIGN__;
+
+DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
+#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field)
+#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field)
+#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field)
+#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field)
+#define DCCP_ADD_STATS_BH(field, val) \
+ SNMP_ADD_STATS_BH(dccp_statistics, field, val)
+#define DCCP_ADD_STATS_USER(field, val) \
+ SNMP_ADD_STATS_USER(dccp_statistics, field, val)
+
+extern int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb);
+
+extern int dccp_send_response(struct sock *sk);
+extern void dccp_send_ack(struct sock *sk);
+extern void dccp_send_delayed_ack(struct sock *sk);
+extern void dccp_send_sync(struct sock *sk, const u64 seq,
+ const enum dccp_pkt_type pkt_type);
+
+extern int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo);
+extern void dccp_write_space(struct sock *sk);
+
+extern void dccp_init_xmit_timers(struct sock *sk);
+/* DCCP-named wrapper: forwards sk to inet_csk_clear_xmit_timers(). */
+static inline void dccp_clear_xmit_timers(struct sock *sk)
+{
+ inet_csk_clear_xmit_timers(sk);
+}
+
+extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
+
+extern const char *dccp_packet_name(const int type);
+extern const char *dccp_state_name(const int state);
+
+/*
+ * Move sk to the given state while keeping the DCCP_MIB_CURRESTAB and
+ * DCCP_MIB_ESTABRESETS counters in step. Triggers WARN_ON if the socket is
+ * already in that state.
+ *
+ * On entering DCCP_CLOSED the socket is unhashed and, if it has a bind hash
+ * entry and SOCK_BINDPORT_LOCK is not set in sk_userlocks, its port is
+ * released; only then is sk_state written.
+ */
+static inline void dccp_set_state(struct sock *sk, const int state)
+{
+ const int oldstate = sk->sk_state;
+
+ dccp_pr_debug("%s(%p) %-10.10s -> %s\n",
+ dccp_role(sk), sk,
+ dccp_state_name(oldstate), dccp_state_name(state));
+ WARN_ON(state == oldstate);
+
+ switch (state) {
+ case DCCP_OPEN:
+ /* Entering OPEN from any other state: one more established */
+ if (oldstate != DCCP_OPEN)
+ DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
+ break;
+
+ case DCCP_CLOSED:
+ if (oldstate == DCCP_CLOSING || oldstate == DCCP_OPEN)
+ DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
+
+ sk->sk_prot->unhash(sk);
+ if (inet_csk(sk)->icsk_bind_hash != NULL &&
+ !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+ inet_put_port(&dccp_hashinfo, sk);
+ /* fall through */
+ default:
+ /* Leaving OPEN for any state other than OPEN */
+ if (oldstate == DCCP_OPEN)
+ DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
+ }
+
+ /* Change state AFTER socket is unhashed to avoid closed
+ * socket sitting in hash tables.
+ */
+ sk->sk_state = state;
+}
+
+/*
+ * Finish off a connection: enter DCCP_CLOSED, clear the transmit timers and
+ * set sk_shutdown to SHUTDOWN_MASK. A socket flagged SOCK_DEAD is then
+ * destroyed; for any other socket the sk_state_change() callback is run.
+ */
+static inline void dccp_done(struct sock *sk)
+{
+	dccp_set_state(sk, DCCP_CLOSED);
+	dccp_clear_xmit_timers(sk);
+
+	sk->sk_shutdown = SHUTDOWN_MASK;
+
+	if (sock_flag(sk, SOCK_DEAD)) {
+		inet_csk_destroy_sock(sk);
+		return;
+	}
+
+	sk->sk_state_change(sk);
+}
+
+/*
+ * Initialise a request sock from a received packet: the remote port is
+ * copied from the source port of the DCCP header in skb, and the 'acked'
+ * flag and rcv_wnd are set to 0. The dp argument is not used yet.
+ */
+static inline void dccp_openreq_init(struct request_sock *req,
+ struct dccp_sock *dp,
+ struct sk_buff *skb)
+{
+ /*
+ * FIXME: fill in the other req fields from the DCCP options
+ * received
+ */
+ inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport;
+ inet_rsk(req)->acked = 0;
+ req->rcv_wnd = 0;
+}
+
+extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
+
+extern struct sock *dccp_create_openreq_child(struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb);
+
+extern int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
+
+extern void dccp_v4_err(struct sk_buff *skb, u32);
+
+extern int dccp_v4_rcv(struct sk_buff *skb);
+
+extern struct sock *dccp_v4_request_recv_sock(struct sock *sk,
+ struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst);
+extern struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct request_sock **prev);
+
+extern int dccp_child_process(struct sock *parent, struct sock *child,
+ struct sk_buff *skb);
+extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ struct dccp_hdr *dh, unsigned len);
+extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct dccp_hdr *dh, const unsigned len);
+
+extern void dccp_close(struct sock *sk, long timeout);
+extern struct sk_buff *dccp_make_response(struct sock *sk,
+ struct dst_entry *dst,
+ struct request_sock *req);
+extern struct sk_buff *dccp_make_reset(struct sock *sk,
+ struct dst_entry *dst,
+ enum dccp_reset_codes code);
+
+extern int dccp_connect(struct sock *sk);
+extern int dccp_disconnect(struct sock *sk, int flags);
+extern int dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+extern int dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int optlen);
+extern int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+extern int dccp_sendmsg(struct kiocb *iocb, struct sock *sk,
+ struct msghdr *msg, size_t size);
+extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
+ struct msghdr *msg, size_t len, int nonblock,
+ int flags, int *addr_len);
+extern void dccp_shutdown(struct sock *sk, int how);
+
+extern int dccp_v4_checksum(const struct sk_buff *skb,
+ const u32 saddr, const u32 daddr);
+
+extern int dccp_v4_send_reset(struct sock *sk,
+ enum dccp_reset_codes code);
+extern void dccp_send_close(struct sock *sk, const int active);
+
+/*
+ * Per-packet DCCP data. It is not stored separately: DCCP_SKB_CB() below
+ * reinterprets the start of the skb's cb[] array as this structure.
+ */
+struct dccp_skb_cb {
+ __u8 dccpd_type:4; /* compared against the DCCP_PKT_* values */
+ __u8 dccpd_ccval:4;
+ __u8 dccpd_reset_code;
+ __u16 dccpd_opt_len;
+ __u64 dccpd_seq;
+ __u64 dccpd_ack_seq;
+};
+
+/* View skb->cb as a struct dccp_skb_cb */
+#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0]))
+
+/*
+ * Returns 1 if the packet type recorded in the skb control block is Ack,
+ * Close, CloseReq, Reset, Sync or SyncAck, 0 for every other type.
+ */
+static inline int dccp_non_data_packet(const struct sk_buff *skb)
+{
+	switch (DCCP_SKB_CB(skb)->dccpd_type) {
+	case DCCP_PKT_ACK:
+	case DCCP_PKT_CLOSE:
+	case DCCP_PKT_CLOSEREQ:
+	case DCCP_PKT_RESET:
+	case DCCP_PKT_SYNC:
+	case DCCP_PKT_SYNCACK:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Returns 1 if the packet type recorded in the skb control block is Data or
+ * Request, 0 otherwise.
+ */
+static inline int dccp_packet_without_ack(const struct sk_buff *skb)
+{
+	const __u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
+
+	if (pkt_type == DCCP_PKT_DATA)
+		return 1;
+
+	return pkt_type == DCCP_PKT_REQUEST;
+}
+
+#define DCCP_MAX_SEQNO ((((u64)1) << 48) - 1)
+#define DCCP_PKT_WITHOUT_ACK_SEQ (DCCP_MAX_SEQNO << 2)
+
+/*
+ * Store value in *seqno reduced modulo 2^48, i.e. masked with
+ * DCCP_MAX_SEQNO.
+ *
+ * Masking rather than subtracting 2^48 once matters for callers that
+ * compute value with a subtraction, such as gss - window + 1 in
+ * dccp_update_gss(): when that underflows, the u64 result has all its high
+ * bits set and a single subtraction would leave it outside the 48 bit
+ * range. For values up to 2 * DCCP_MAX_SEQNO + 1 the result is the same as
+ * with the single subtraction.
+ */
+static inline void dccp_set_seqno(u64 *seqno, u64 value)
+{
+	*seqno = value & DCCP_MAX_SEQNO;
+}
+
+/*
+ * Distance from seqno1 up to seqno2. Both are shifted up by 16 bits before
+ * subtracting and the difference is shifted back down, so the result wraps
+ * modulo 2^48.
+ */
+static inline u64 dccp_delta_seqno(u64 seqno1, u64 seqno2)
+{
+	const u64 from = seqno1 << 16;
+	const u64 to = seqno2 << 16;
+
+	return (to - from) >> 16;
+}
+
+/* Advance *seqno by one; a result above DCCP_MAX_SEQNO becomes 0. */
+static inline void dccp_inc_seqno(u64 *seqno)
+{
+	const u64 next = *seqno + 1;
+
+	*seqno = next > DCCP_MAX_SEQNO ? 0 : next;
+}
+
+/*
+ * Write sequence number gss into a DCCP header: the bits above bit 31 go
+ * into dh->dccph_seq, the low 32 bits into dccph_seq_low of the
+ * struct dccp_hdr_ext located directly behind *dh.
+ *
+ * NOTE(review): the extra ">> 8" on little-endian bitfield layouts
+ * presumably accounts for dccph_seq being narrower than 32 bits -- confirm
+ * against the struct dccp_hdr definition.
+ */
+static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss)
+{
+ /* The extended header starts sizeof(*dh) bytes after dh */
+ struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh +
+ sizeof(*dh));
+
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ dh->dccph_seq = htonl((gss >> 32)) >> 8;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ dh->dccph_seq = htonl((gss >> 32));
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+ dhx->dccph_seq_low = htonl(gss & 0xffffffff);
+}
+
+/*
+ * Write acknowledgement number gsr into *dhack: the bits above bit 31 go
+ * into dccph_ack_nr_high, the low 32 bits into dccph_ack_nr_low, using the
+ * same endian-dependent conversion as dccp_hdr_set_seq().
+ */
+static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
+ const u64 gsr)
+{
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ dhack->dccph_ack_nr_high = htonl((gsr >> 32)) >> 8;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ dhack->dccph_ack_nr_high = htonl((gsr >> 32));
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+ dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff);
+}
+
+/*
+ * Record seq as dccps_gsr and recompute the two bounds derived from it.
+ * With W = dccpo_sequence_window:
+ *   dccps_swl = gsr + 1 - W / 4
+ *   dccps_swh = gsr + (3 * W) / 4
+ * Both are stored through dccp_set_seqno().
+ */
+static inline void dccp_update_gsr(struct sock *sk, u64 seq)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ dp->dccps_gsr = seq;
+ dccp_set_seqno(&dp->dccps_swl,
+ (dp->dccps_gsr + 1 -
+ (dp->dccps_options.dccpo_sequence_window / 4)));
+ dccp_set_seqno(&dp->dccps_swh,
+ (dp->dccps_gsr +
+ (3 * dp->dccps_options.dccpo_sequence_window) / 4));
+}
+
+/*
+ * Record seq as both dccps_gss and dccps_awh, then set
+ * dccps_awl = gss - dccpo_sequence_window + 1 through dccp_set_seqno().
+ */
+static inline void dccp_update_gss(struct sock *sk, u64 seq)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ dp->dccps_awh = dp->dccps_gss = seq;
+ dccp_set_seqno(&dp->dccps_awl,
+ (dp->dccps_gss -
+ dp->dccps_options.dccpo_sequence_window + 1));
+}
+
+/*
+ * Non-zero if any of the following holds for sk:
+ *  - dccps_timestamp_echo is non-zero;
+ *  - (only with CONFIG_IP_DCCP_ACKVEC) dccpo_send_ack_vector is set and
+ *    dccp_ackvec_pending() reports true for the RX ack vector;
+ *  - inet_csk_ack_scheduled() reports true.
+ */
+static inline int dccp_ack_pending(const struct sock *sk)
+{
+ const struct dccp_sock *dp = dccp_sk(sk);
+ return dp->dccps_timestamp_echo != 0 ||
+#ifdef CONFIG_IP_DCCP_ACKVEC
+ (dp->dccps_options.dccpo_send_ack_vector &&
+ dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
+#endif
+ inet_csk_ack_scheduled(sk);
+}
+
+extern void dccp_insert_options(struct sock *sk, struct sk_buff *skb);
+extern void dccp_insert_option_elapsed_time(struct sock *sk,
+ struct sk_buff *skb,
+ u32 elapsed_time);
+extern void dccp_insert_option_timestamp(struct sock *sk,
+ struct sk_buff *skb);
+extern void dccp_insert_option(struct sock *sk, struct sk_buff *skb,
+ unsigned char option,
+ const void *value, unsigned char len);
+
+extern struct socket *dccp_ctl_socket;
+
+extern void dccp_timestamp(const struct sock *sk, struct timeval *tv);
+
+/*
+ * Convert *tv to a single microsecond count:
+ * tv_sec * USEC_PER_SEC + tv_usec.
+ *
+ * NOTE(review): the result is a suseconds_t; where that type is 32 bits
+ * wide the product presumably overflows for all but small tv_sec values --
+ * confirm callers only pass short intervals.
+ */
+static inline suseconds_t timeval_usecs(const struct timeval *tv)
+{
+ return tv->tv_sec * USEC_PER_SEC + tv->tv_usec;
+}
+
+/*
+ * Return large - small in microseconds. The seconds and microseconds
+ * fields are subtracted separately; a negative microsecond difference
+ * borrows one second.
+ */
+static inline suseconds_t timeval_delta(const struct timeval *large,
+					const struct timeval *small)
+{
+	suseconds_t delta_us = large->tv_usec - small->tv_usec;
+	time_t delta_s = large->tv_sec - small->tv_sec;
+
+	if (delta_us < 0) {
+		delta_us += USEC_PER_SEC;
+		delta_s--;
+	}
+
+	return delta_s * USEC_PER_SEC + delta_us;
+}
+
+/*
+ * Add usecs microseconds to *tv, carrying whole seconds into tv_sec until
+ * tv_usec is below USEC_PER_SEC again.
+ */
+static inline void timeval_add_usecs(struct timeval *tv,
+				     const suseconds_t usecs)
+{
+	suseconds_t us = tv->tv_usec + usecs;
+
+	while (us >= USEC_PER_SEC) {
+		us -= USEC_PER_SEC;
+		tv->tv_sec++;
+	}
+
+	tv->tv_usec = us;
+}
+
+/*
+ * Subtract usecs microseconds from *tv, borrowing whole seconds from
+ * tv_sec until tv_usec is no longer negative.
+ */
+static inline void timeval_sub_usecs(struct timeval *tv,
+				     const suseconds_t usecs)
+{
+	suseconds_t us = tv->tv_usec - usecs;
+
+	while (us < 0) {
+		us += USEC_PER_SEC;
+		tv->tv_sec--;
+	}
+
+	tv->tv_usec = us;
+}
+
+#endif /* _DCCP_H */
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
new file mode 100644
index 00000000000..f675d8e642d
--- /dev/null
+++ b/net/dccp/diag.c
@@ -0,0 +1,71 @@
+/*
+ * net/dccp/diag.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+/*
+ * Fill *info (a struct tcp_info, reused for DCCP) for socket sk. The
+ * structure is zeroed first, then state, retransmit, probe, backoff and
+ * PMTU values are copied from the socket. TCPI_OPT_SACK is set when the
+ * socket sends ack vectors. Finally the RX and TX CCIDs add their own data.
+ */
+static void dccp_get_info(struct sock *sk, struct tcp_info *info)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ memset(info, 0, sizeof(*info));
+
+ info->tcpi_state = sk->sk_state;
+ info->tcpi_retransmits = icsk->icsk_retransmits;
+ info->tcpi_probes = icsk->icsk_probes_out;
+ info->tcpi_backoff = icsk->icsk_backoff;
+ info->tcpi_pmtu = dp->dccps_pmtu_cookie;
+
+ if (dp->dccps_options.dccpo_send_ack_vector)
+ info->tcpi_options |= TCPI_OPT_SACK;
+
+ ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
+ ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info);
+}
+
+/*
+ * inet_diag callback: report both queue lengths as 0 and, when the caller
+ * supplied an info buffer, fill it through dccp_get_info().
+ */
+static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+			       void *_info)
+{
+	r->idiag_wqueue = 0;
+	r->idiag_rqueue = 0;
+
+	if (_info == NULL)
+		return;
+
+	dccp_get_info(sk, _info);
+}
+
+/*
+ * Handler description passed to inet_diag_register(): DCCP's hash tables,
+ * the info callback above, the DCCPDIAG_GETSOCK message type and the size
+ * of the info buffer (a struct tcp_info).
+ */
+static struct inet_diag_handler dccp_diag_handler = {
+ .idiag_hashinfo = &dccp_hashinfo,
+ .idiag_get_info = dccp_diag_get_info,
+ .idiag_type = DCCPDIAG_GETSOCK,
+ .idiag_info_size = sizeof(struct tcp_info),
+};
+
+/* Module init: register the DCCP handler with inet_diag, returning its result. */
+static int __init dccp_diag_init(void)
+{
+ return inet_diag_register(&dccp_diag_handler);
+}
+
+/* Module exit: unregister the DCCP handler from inet_diag. */
+static void __exit dccp_diag_fini(void)
+{
+ inet_diag_unregister(&dccp_diag_handler);
+}
+
+module_init(dccp_diag_init);
+module_exit(dccp_diag_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
+MODULE_DESCRIPTION("DCCP inet_diag handler");
diff --git a/net/dccp/input.c b/net/dccp/input.c
new file mode 100644
index 00000000000..3454d594190
--- /dev/null
+++ b/net/dccp/input.c
@@ -0,0 +1,568 @@
+/*
+ * net/dccp/input.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+
+#include <net/sock.h>
+
+#include "ackvec.h"
+#include "ccid.h"
+#include "dccp.h"
+
+static void dccp_fin(struct sock *sk, struct sk_buff *skb)
+{
+ sk->sk_shutdown |= RCV_SHUTDOWN; /* mark the receive side as shut down */
+ sock_set_flag(sk, SOCK_DONE);
+ __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); /* strip the DCCP header; doff is in 32-bit words */
+ __skb_queue_tail(&sk->sk_receive_queue, skb); /* skb now belongs to the receive queue */
+ skb_set_owner_r(skb, sk);
+ sk->sk_data_ready(sk, 0); /* notify through the socket's data_ready callback */
+}
+
+static void dccp_rcv_close(struct sock *sk, struct sk_buff *skb)
+{
+ dccp_v4_send_reset(sk, DCCP_RESET_CODE_CLOSED); /* reply with Reset(Closed); the return value is ignored */
+ dccp_fin(sk, skb); /* queues skb on sk_receive_queue, so callers must not free it afterwards */
+ dccp_set_state(sk, DCCP_CLOSED);
+ sk_wake_async(sk, 1, POLL_HUP);
+}
+
+static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
+{
+	if (dccp_sk(sk)->dccps_role == DCCP_ROLE_CLIENT) {
+		/* Client: enter CLOSING (unless already there) and send Close. */
+		if (sk->sk_state != DCCP_CLOSING)
+			dccp_set_state(sk, DCCP_CLOSING);
+		dccp_send_close(sk, 0);
+		return;
+	}
+	/*
+	 * Step 7: Check for unexpected packet types
+	 *      If (S.is_server and P.type == CloseReq)
+	 *	    Send Sync packet acknowledging P.seqno
+	 *	    Drop packet and return
+	 */
+	dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
+}
+
+static inline void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dsk = dccp_sk(sk);
+	if (!dsk->dccps_options.dccpo_send_ack_vector)
+		return;	/* not sending ack vectors: nothing to check */
+	dccp_ackvec_check_rcv_ackno(dsk->dccps_hc_rx_ackvec, sk,
+				    DCCP_SKB_CB(skb)->dccpd_ack_seq);
+}
+
+static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh = dccp_hdr(skb);
+ struct dccp_sock *dp = dccp_sk(sk);
+ u64 lswl, lawl; /* Step 6 lower bounds; function returns 0 if P passes, -1 if it must be dropped */
+
+ /*
+ * Step 5: Prepare sequence numbers for Sync
+ * If P.type == Sync or P.type == SyncAck,
+ * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL,
+ * / * P is valid, so update sequence number variables
+ * accordingly. After this update, P will pass the tests
+ * in Step 6. A SyncAck is generated if necessary in
+ * Step 15 * /
+ * Update S.GSR, S.SWL, S.SWH
+ * Otherwise,
+ * Drop packet and return
+ */
+ if (dh->dccph_type == DCCP_PKT_SYNC ||
+ dh->dccph_type == DCCP_PKT_SYNCACK) {
+ if (between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ dp->dccps_awl, dp->dccps_awh) &&
+ !before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_swl))
+ dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq);
+ else
+ return -1; /* invalid Sync/SyncAck: rejected without sending a Sync */
+ }
+
+ /*
+ * Step 6: Check sequence numbers
+ * Let LSWL = S.SWL and LAWL = S.AWL
+ * If P.type == CloseReq or P.type == Close or P.type == Reset,
+ * LSWL := S.GSR + 1, LAWL := S.GAR
+ * If LSWL <= P.seqno <= S.SWH
+ * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH),
+ * Update S.GSR, S.SWL, S.SWH
+ * If P.type != Sync,
+ * Update S.GAR
+ * Otherwise,
+ * Send Sync packet acknowledging P.seqno
+ * Drop packet and return
+ */
+ lswl = dp->dccps_swl;
+ lawl = dp->dccps_awl;
+
+ if (dh->dccph_type == DCCP_PKT_CLOSEREQ ||
+ dh->dccph_type == DCCP_PKT_CLOSE ||
+ dh->dccph_type == DCCP_PKT_RESET) {
+ lswl = dp->dccps_gsr;
+ dccp_inc_seqno(&lswl); /* together with the line above: LSWL := S.GSR + 1 */
+ lawl = dp->dccps_gar;
+ }
+
+ if (between48(DCCP_SKB_CB(skb)->dccpd_seq, lswl, dp->dccps_swh) &&
+ (DCCP_SKB_CB(skb)->dccpd_ack_seq == DCCP_PKT_WITHOUT_ACK_SEQ ||
+ between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ lawl, dp->dccps_awh))) {
+ dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq);
+
+ if (dh->dccph_type != DCCP_PKT_SYNC &&
+ (DCCP_SKB_CB(skb)->dccpd_ack_seq !=
+ DCCP_PKT_WITHOUT_ACK_SEQ))
+ dp->dccps_gar = DCCP_SKB_CB(skb)->dccpd_ack_seq; /* S.GAR := P.ackno */
+ } else {
+ LIMIT_NETDEBUG(KERN_WARNING "DCCP: Step 6 failed for %s packet, "
+ "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
+ "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
+ "sending SYNC...\n",
+ dccp_packet_name(dh->dccph_type),
+ (unsigned long long) lswl,
+ (unsigned long long)
+ DCCP_SKB_CB(skb)->dccpd_seq,
+ (unsigned long long) dp->dccps_swh,
+ (DCCP_SKB_CB(skb)->dccpd_ack_seq ==
+ DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist" : "exists",
+ (unsigned long long) lawl,
+ (unsigned long long)
+ DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ (unsigned long long) dp->dccps_awh);
+ dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
+ return -1; /* out of window: a Sync was sent, the packet must be dropped */
+ }
+
+ return 0;
+}
+
+int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct dccp_hdr *dh, const unsigned len)
+{
+ struct dccp_sock *dp = dccp_sk(sk); /* every path below queues or frees skb and returns 0 */
+
+ if (dccp_check_seqno(sk, skb))
+ goto discard;
+
+ if (dccp_parse_options(sk, skb))
+ goto discard;
+
+ if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_event_ack_recv(sk, skb);
+
+ if (dp->dccps_options.dccpo_send_ack_vector &&
+ dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
+ DCCP_SKB_CB(skb)->dccpd_seq,
+ DCCP_ACKVEC_STATE_RECEIVED))
+ goto discard;
+
+ ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
+ ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
+
+ switch (dccp_hdr(skb)->dccph_type) {
+ case DCCP_PKT_DATAACK:
+ case DCCP_PKT_DATA:
+ /*
+ * FIXME: check if sk_receive_queue is full, schedule DATA_DROPPED
+ * option if it is.
+ */
+ __skb_pull(skb, dh->dccph_doff * 4);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_set_owner_r(skb, sk);
+ sk->sk_data_ready(sk, 0);
+ return 0;
+ case DCCP_PKT_ACK:
+ goto discard;
+ case DCCP_PKT_RESET:
+ /*
+ * Step 9: Process Reset
+ * If P.type == Reset,
+ * Tear down connection
+ * S.state := TIMEWAIT
+ * Set TIMEWAIT timer
+ * Drop packet and return
+ */
+ dccp_fin(sk, skb); /* skb is queued here, hence no discard */
+ dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
+ return 0;
+ case DCCP_PKT_CLOSEREQ:
+ dccp_rcv_closereq(sk, skb);
+ goto discard;
+ case DCCP_PKT_CLOSE:
+ dccp_rcv_close(sk, skb); /* queues skb through dccp_fin() */
+ return 0;
+ case DCCP_PKT_REQUEST:
+ /* Step 7
+ * or (S.is_server and P.type == Response)
+ * or (S.is_client and P.type == Request)
+ * or (S.state >= OPEN and P.type == Request
+ * and P.seqno >= S.OSR)
+ * or (S.state >= OPEN and P.type == Response
+ * and P.seqno >= S.OSR)
+ * or (S.state == RESPOND and P.type == Data),
+ * Send Sync packet acknowledging P.seqno
+ * Drop packet and return
+ */
+ if (dp->dccps_role != DCCP_ROLE_LISTEN) /* NOTE(review): the text above tests is_client; confirm LISTEN is the intended role here */
+ goto send_sync;
+ goto check_seq;
+ case DCCP_PKT_RESPONSE:
+ if (dp->dccps_role != DCCP_ROLE_CLIENT)
+ goto send_sync;
+check_seq:
+ if (!before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_osr)) {
+send_sync: /* label sits inside the if body: the gotos above skip the P.seqno >= S.OSR test */
+ dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
+ DCCP_PKT_SYNC);
+ }
+ break; /* leaves the switch: counted in DCCP_MIB_INERRS below, then freed */
+ case DCCP_PKT_SYNC:
+ dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
+ DCCP_PKT_SYNCACK);
+ /*
+ * From the draft:
+ *
+ * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets
+ * MAY have non-zero-length application data areas, whose
+ * contents receivers MUST ignore.
+ */
+ goto discard;
+ }
+
+ DCCP_INC_STATS_BH(DCCP_MIB_INERRS); /* also reached by packet types with no case above */
+discard:
+ __kfree_skb(skb);
+ return 0;
+}
+
+/* Packet processing for REQUESTING state. Return: 0 if skb was freed here,
+ * -1 if the caller must free it, 1 if the caller must send a Reset. */
+static int dccp_rcv_request_sent_state_process(struct sock *sk,
+					       struct sk_buff *skb,
+					       const struct dccp_hdr *dh,
+					       const unsigned len)
+{
+	/*
+	 *  Step 4: Prepare sequence numbers in REQUEST
+	 *     If S.state == REQUEST,
+	 *	  If (P.type == Response or P.type == Reset)
+	 *		and S.AWL <= P.ackno <= S.AWH,
+	 *	     / * Set sequence number variables corresponding to the
+	 *		other endpoint, so P will pass the tests in Step 6 * /
+	 *	     Set S.GSR, S.ISR, S.SWL, S.SWH
+	 *	     / * Response processing continues in Step 10; Reset
+	 *		processing continues in Step 9 * /
+	 */
+	if (dh->dccph_type == DCCP_PKT_RESPONSE) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+		struct dccp_sock *dp = dccp_sk(sk);
+
+		if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+			       dp->dccps_awl, dp->dccps_awh)) {
+			dccp_pr_debug("invalid ackno: S.AWL=%llu, "
+				      "P.ackno=%llu, S.AWH=%llu \n",
+				      (unsigned long long)dp->dccps_awl,
+			   (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
+				      (unsigned long long)dp->dccps_awh);
+			goto out_invalid_packet;
+		}
+
+		/* Stop the REQUEST timer and free sk_send_head, but only
+		 * now that the ackno check has passed: an invalid Response
+		 * must not cancel the retransmission. */
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
+		BUG_TRAP(sk->sk_send_head != NULL);
+		__kfree_skb(sk->sk_send_head);
+		sk->sk_send_head = NULL;
+
+		dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
+		dccp_update_gsr(sk, dp->dccps_isr);
+		/*
+		 * SWL and AWL are initially adjusted so that they are not less than
+		 * the initial Sequence Numbers received and sent, respectively:
+		 *	SWL := max(GSR + 1 - floor(W/4), ISR),
+		 *	AWL := max(GSS - W' + 1, ISS).
+		 * These adjustments MUST be applied only at the beginning of the
+		 * connection.
+		 *
+		 * AWL was adjusted in dccp_v4_connect -acme
+		 */
+		dccp_set_seqno(&dp->dccps_swl,
+			       max48(dp->dccps_swl, dp->dccps_isr));
+		if (ccid_hc_rx_init(dp->dccps_hc_rx_ccid, sk) != 0 ||
+		    ccid_hc_tx_init(dp->dccps_hc_tx_ccid, sk) != 0) {
+			ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
+			ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
+			/* FIXME: send appropriate RESET code */
+			goto out_invalid_packet;
+		}
+
+		dccp_sync_mss(sk, dp->dccps_pmtu_cookie);
+		/*
+		 *    Step 10: Process REQUEST state (second part)
+		 *       If S.state == REQUEST,
+		 *	  / * If we get here, P is a valid Response from the
+		 *	      server (see Step 4), and we should move to
+		 *	      PARTOPEN state. PARTOPEN means send an Ack,
+		 *	      don't send Data packets, retransmit Acks
+		 *	      periodically, and always include any Init Cookie
+		 *	      from the Response * /
+		 *	  S.state := PARTOPEN
+		 *	  Set PARTOPEN timer
+		 *	  Continue with S.state == PARTOPEN
+		 *	  / * Step 12 will send the Ack completing the
+		 *	      three-way handshake * /
+		 */
+		dccp_set_state(sk, DCCP_PARTOPEN);
+
+		/* Make sure socket is routed, for correct metrics. */
+		inet_sk_rebuild_header(sk);
+		if (!sock_flag(sk, SOCK_DEAD)) {
+			sk->sk_state_change(sk);
+			sk_wake_async(sk, 0, POLL_OUT);
+		}
+
+		if (sk->sk_write_pending || icsk->icsk_ack.pingpong ||
+		    icsk->icsk_accept_queue.rskq_defer_accept) {
+			/* Save one ACK. Data will be ready after
+			 * several ticks, if write_pending is set.
+			 *
+			 * It may be deleted, but with this feature tcpdumps
+			 * look so _wonderfully_ clever, that I was not able
+			 * to stand against the temptation 8)     --ANK
+			 *
+			 * OK, in DCCP we can as well do a similar trick, its
+			 * even in the draft, but there is no need for us to
+			 * schedule an ack here, as dccp_sendmsg does this for
+			 * us, also stated in the draft. -acme
+			 */
+			__kfree_skb(skb);
+			return 0;
+		}
+		dccp_send_ack(sk);
+		return -1;
+	}
+
+out_invalid_packet:
+	/* dccp_v4_do_rcv will send a reset */
+	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
+	return 1;
+}
+
+static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
+ struct sk_buff *skb,
+ const struct dccp_hdr *dh,
+ const unsigned len)
+{
+ int queued = 0; /* return value: 1 once skb has been handed to dccp_rcv_established() */
+
+ switch (dh->dccph_type) {
+ case DCCP_PKT_RESET:
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+ break;
+ case DCCP_PKT_DATA:
+ if (sk->sk_state == DCCP_RESPOND)
+ break; /* otherwise (PARTOPEN) fall through to the Ack handling */
+ case DCCP_PKT_DATAACK:
+ case DCCP_PKT_ACK:
+ /*
+ * FIXME: we should be resetting the PARTOPEN (DELACK) timer
+ * here but only if we haven't used the DELACK timer for
+ * something else, like sending a delayed ack for a TIMESTAMP
+ * echo, etc, for now we're not clearing it, sending an extra
+ * ACK when there is nothing else to do in DELACK is not a big
+ * deal after all.
+ */
+
+ /* Stop the PARTOPEN timer */
+ if (sk->sk_state == DCCP_PARTOPEN)
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+
+ dccp_sk(sk)->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
+ dccp_set_state(sk, DCCP_OPEN);
+
+ if (dh->dccph_type == DCCP_PKT_DATAACK ||
+ dh->dccph_type == DCCP_PKT_DATA) {
+ dccp_rcv_established(sk, skb, dh, len);
+ queued = 1; /* packet was queued
+ (by dccp_rcv_established) */
+ }
+ break;
+ }
+
+ return queued;
+}
+
+int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ struct dccp_hdr *dh, unsigned len)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+ const int old_state = sk->sk_state; /* state on entry; sk_state may change below */
+ int queued = 0; /* non-zero once skb has been handed off and must not be freed here */
+
+ /*
+ * Step 3: Process LISTEN state
+ * (Continuing from dccp_v4_do_rcv and dccp_v6_do_rcv)
+ *
+ * If S.state == LISTEN,
+ * If P.type == Request or P contains a valid Init Cookie
+ * option,
+ * * Must scan the packet's options to check for an Init
+ * Cookie. Only the Init Cookie is processed here,
+ * however; other options are processed in Step 8. This
+ * scan need only be performed if the endpoint uses Init
+ * Cookies *
+ * * Generate a new socket and switch to that socket *
+ * Set S := new socket for this port pair
+ * S.state = RESPOND
+ * Choose S.ISS (initial seqno) or set from Init Cookie
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+ * Continue with S.state == RESPOND
+ * * A Response packet will be generated in Step 11 *
+ * Otherwise,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ *
+ * NOTE: the check for the packet types is done in
+ * dccp_rcv_state_process
+ */
+ if (sk->sk_state == DCCP_LISTEN) {
+ if (dh->dccph_type == DCCP_PKT_REQUEST) {
+ if (dccp_v4_conn_request(sk, skb) < 0)
+ return 1; /* NOTE(review): dccpd_reset_code is not set on this path - confirm dccp_v4_conn_request() sets it */
+
+ /* FIXME: do congestion control initialization */
+ goto discard;
+ }
+ if (dh->dccph_type == DCCP_PKT_RESET)
+ goto discard;
+
+ /* Caller (dccp_v4_do_rcv) will send Reset */
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+ return 1;
+ }
+
+ if (sk->sk_state != DCCP_REQUESTING) { /* REQUESTING does its own ackno check in dccp_rcv_request_sent_state_process() */
+ if (dccp_check_seqno(sk, skb))
+ goto discard;
+
+ /*
+ * Step 8: Process options and mark acknowledgeable
+ */
+ if (dccp_parse_options(sk, skb))
+ goto discard;
+
+ if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_event_ack_recv(sk, skb);
+
+ ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
+ ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
+
+ if (dp->dccps_options.dccpo_send_ack_vector &&
+ dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
+ DCCP_SKB_CB(skb)->dccpd_seq,
+ DCCP_ACKVEC_STATE_RECEIVED))
+ goto discard;
+ }
+
+ /*
+ * Step 9: Process Reset
+ * If P.type == Reset,
+ * Tear down connection
+ * S.state := TIMEWAIT
+ * Set TIMEWAIT timer
+ * Drop packet and return
+ */
+ if (dh->dccph_type == DCCP_PKT_RESET) {
+ /*
+ * Queue the equivalent of TCP fin so that dccp_recvmsg
+ * exits the loop
+ */
+ dccp_fin(sk, skb);
+ dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
+ return 0;
+ /*
+ * Step 7: Check for unexpected packet types
+ * If (S.is_server and P.type == CloseReq)
+ * or (S.is_server and P.type == Response)
+ * or (S.is_client and P.type == Request)
+ * or (S.state == RESPOND and P.type == Data),
+ * Send Sync packet acknowledging P.seqno
+ * Drop packet and return
+ */
+ } else if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
+ (dh->dccph_type == DCCP_PKT_RESPONSE ||
+ dh->dccph_type == DCCP_PKT_CLOSEREQ)) ||
+ (dp->dccps_role == DCCP_ROLE_CLIENT &&
+ dh->dccph_type == DCCP_PKT_REQUEST) ||
+ (sk->sk_state == DCCP_RESPOND &&
+ dh->dccph_type == DCCP_PKT_DATA)) {
+ dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
+ goto discard;
+ } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {
+ dccp_rcv_closereq(sk, skb);
+ goto discard;
+ } else if (dh->dccph_type == DCCP_PKT_CLOSE) {
+ dccp_rcv_close(sk, skb); /* queues skb through dccp_fin(), hence no discard */
+ return 0;
+ }
+
+ if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) {
+ dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK);
+ goto discard;
+ }
+
+ switch (sk->sk_state) {
+ case DCCP_CLOSED:
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+ return 1;
+
+ case DCCP_REQUESTING:
+ /* FIXME: do congestion control initialization */
+
+ queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
+ if (queued >= 0)
+ return queued; /* 0: skb already freed; 1: caller sends a Reset */
+
+ __kfree_skb(skb); /* helper returned -1: skb is still ours to free */
+ return 0;
+
+ case DCCP_RESPOND:
+ case DCCP_PARTOPEN:
+ queued = dccp_rcv_respond_partopen_state_process(sk, skb,
+ dh, len);
+ break;
+ }
+
+ if (dh->dccph_type == DCCP_PKT_ACK ||
+ dh->dccph_type == DCCP_PKT_DATAACK) {
+ switch (old_state) {
+ case DCCP_PARTOPEN:
+ sk->sk_state_change(sk);
+ sk_wake_async(sk, 0, POLL_OUT);
+ break;
+ }
+ }
+
+ if (!queued) {
+discard: /* label inside the if body: every "goto discard" frees skb unconditionally */
+ __kfree_skb(skb);
+ }
+ return 0;
+}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
new file mode 100644
index 00000000000..6298cf58ff9
--- /dev/null
+++ b/net/dccp/ipv4.c
@@ -0,0 +1,1348 @@
+/*
+ * net/dccp/ipv4.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/icmp.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+
+#include <net/icmp.h>
+#include <net/inet_hashtables.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/xfrm.h>
+
+#include "ackvec.h"
+#include "ccid.h"
+#include "dccp.h"
+
+struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
+ .lhash_lock = RW_LOCK_UNLOCKED,
+ .lhash_users = ATOMIC_INIT(0),
+ .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
+ .portalloc_lock = SPIN_LOCK_UNLOCKED, /* held around port_rover accesses in dccp_v4_hash_connect() */
+ .port_rover = 1024 - 1, /* dccp_v4_hash_connect() increments it before each use */
+};
+
+EXPORT_SYMBOL_GPL(dccp_hashinfo);
+
+static int dccp_v4_get_port(struct sock *sk, const unsigned short snum)
+{
+ return inet_csk_get_port(&dccp_hashinfo, sk, snum); /* generic helper, applied to the DCCP hash tables */
+}
+
+static void dccp_v4_hash(struct sock *sk)
+{
+ inet_hash(&dccp_hashinfo, sk); /* insert sk into the DCCP hash tables */
+}
+
+static void dccp_v4_unhash(struct sock *sk)
+{
+ inet_unhash(&dccp_hashinfo, sk); /* remove sk from the DCCP hash tables */
+}
+
+/* called with local bh disabled; returns 0 with sk added to the established chain, or -EADDRNOTAVAIL */
+static int __dccp_v4_check_established(struct sock *sk, const __u16 lport,
+ struct inet_timewait_sock **twp)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ const u32 daddr = inet->rcv_saddr; /* note: holds our own (receive) address */
+ const u32 saddr = inet->daddr; /* note: holds the peer's address */
+ const int dif = sk->sk_bound_dev_if;
+ INET_ADDR_COOKIE(acookie, saddr, daddr)
+ const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+ unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
+ struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash);
+ const struct sock *sk2;
+ const struct hlist_node *node;
+ struct inet_timewait_sock *tw;
+
+ prefetch(head->chain.first);
+ write_lock(&head->lock);
+
+ /* Check TIME-WAIT sockets first. */
+ sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) {
+ tw = inet_twsk(sk2);
+
+ if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
+ goto not_unique; /* any matching TIME-WAIT socket makes the 4-tuple unusable */
+ }
+ tw = NULL; /* no TIME-WAIT match: tw stays NULL for the rest of the function */
+
+ /* And established part... */
+ sk_for_each(sk2, node, &head->chain) {
+ if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
+ goto not_unique;
+ }
+
+ /* Must record num and sport now. Otherwise we will see
+ * in hash table socket with a funny identity. */
+ inet->num = lport;
+ inet->sport = htons(lport);
+ sk->sk_hash = hash;
+ BUG_TRAP(sk_unhashed(sk));
+ __sk_add_node(sk, &head->chain);
+ sock_prot_inc_use(sk->sk_prot);
+ write_unlock(&head->lock);
+
+ if (twp != NULL) {
+ *twp = tw; /* always NULL here, see above */
+ NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); /* NOTE(review): counted although no TIME-WAIT socket was recycled - confirm intent */
+ } else if (tw != NULL) { /* NOTE(review): cannot be true while tw is reset to NULL above */
+ /* Silly. Should hash-dance instead... */
+ inet_twsk_deschedule(tw, &dccp_death_row);
+ NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+
+ inet_twsk_put(tw);
+ }
+
+ return 0;
+
+not_unique:
+ write_unlock(&head->lock);
+ return -EADDRNOTAVAIL;
+}
+
+/*
+ * Bind a port for a connect operation and hash it.
+ */
+static int dccp_v4_hash_connect(struct sock *sk)
+{
+ const unsigned short snum = inet_sk(sk)->num;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
+ int ret;
+
+ if (snum == 0) { /* no local port yet: search sysctl_local_port_range for one */
+ int rover;
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ int remaining = (high - low) + 1; /* upper bound on the number of ports tried */
+ struct hlist_node *node;
+ struct inet_timewait_sock *tw = NULL;
+
+ local_bh_disable();
+
+ /* TODO. Actually it is not so bad idea to remove
+ * dccp_hashinfo.portalloc_lock before next submission to
+ * Linus.
+ * As soon as we touch this place at all it is time to think.
+ *
+ * Now it protects single _advisory_ variable
+ * dccp_hashinfo.port_rover, hence it is mostly useless.
+ * Code will work nicely if we just delete it, but
+ * I am afraid in contended case it will work not better or
+ * even worse: another cpu just will hit the same bucket
+ * and spin there.
+ * So some cpu salt could remove both contention and
+ * memory pingpong. Any ideas how to do this in a nice way?
+ */
+ spin_lock(&dccp_hashinfo.portalloc_lock);
+ rover = dccp_hashinfo.port_rover;
+
+ do {
+ rover++;
+ if ((rover < low) || (rover > high))
+ rover = low; /* wrap around inside [low, high] */
+ head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
+ dccp_hashinfo.bhash_size)];
+ spin_lock(&head->lock);
+
+ /* Does not bother with rcv_saddr checks,
+ * because the established check is already
+ * unique enough.
+ */
+ inet_bind_bucket_for_each(tb, node, &head->chain) {
+ if (tb->port == rover) {
+ BUG_TRAP(!hlist_empty(&tb->owners));
+ if (tb->fastreuse >= 0) /* only buckets marked -1 (as done below) are shared */
+ goto next_port;
+ if (!__dccp_v4_check_established(sk,
+ rover,
+ &tw))
+ goto ok;
+ goto next_port;
+ }
+ }
+
+ tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep,
+ head, rover);
+ if (tb == NULL) {
+ spin_unlock(&head->lock);
+ break; /* allocation failed: give up with -EADDRNOTAVAIL */
+ }
+ tb->fastreuse = -1;
+ goto ok;
+
+ next_port:
+ spin_unlock(&head->lock);
+ } while (--remaining > 0);
+ dccp_hashinfo.port_rover = rover;
+ spin_unlock(&dccp_hashinfo.portalloc_lock);
+
+ local_bh_enable();
+
+ return -EADDRNOTAVAIL;
+
+ok:
+ /* All locks still held and bhs disabled */
+ dccp_hashinfo.port_rover = rover;
+ spin_unlock(&dccp_hashinfo.portalloc_lock);
+
+ inet_bind_hash(sk, tb, rover);
+ if (sk_unhashed(sk)) { /* still unhashed when the bucket was newly created above */
+ inet_sk(sk)->sport = htons(rover);
+ __inet_hash(&dccp_hashinfo, sk, 0);
+ }
+ spin_unlock(&head->lock);
+
+ if (tw != NULL) {
+ inet_twsk_deschedule(tw, &dccp_death_row);
+ inet_twsk_put(tw);
+ }
+
+ ret = 0;
+ goto out; /* jumps into the else branch below to re-enable bhs */
+ }
+
+ head = &dccp_hashinfo.bhash[inet_bhashfn(snum,
+ dccp_hashinfo.bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
+ spin_lock_bh(&head->lock);
+ if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { /* sk is the only owner of its bind bucket */
+ __inet_hash(&dccp_hashinfo, sk, 0);
+ spin_unlock_bh(&head->lock);
+ return 0;
+ } else {
+ spin_unlock(&head->lock); /* plain unlock: bhs stay disabled until local_bh_enable() below */
+ /* No definite answer... Walk to established hash table */
+ ret = __dccp_v4_check_established(sk, snum, NULL);
+out:
+ local_bh_enable();
+ return ret;
+ }
+}
+
+static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ struct rtable *rt;
+ u32 daddr, nexthop;
+ int tmp;
+ int err;
+
+ dp->dccps_role = DCCP_ROLE_CLIENT; /* note: set before the argument checks, so it persists if they fail */
+
+ if (dccp_service_not_initialized(sk))
+ return -EPROTO;
+
+ if (addr_len < sizeof(struct sockaddr_in))
+ return -EINVAL;
+
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ nexthop = daddr = usin->sin_addr.s_addr;
+ if (inet->opt != NULL && inet->opt->srr) { /* source routing: route towards the first hop instead */
+ if (daddr == 0)
+ return -EINVAL;
+ nexthop = inet->opt->faddr;
+ }
+
+ tmp = ip_route_connect(&rt, nexthop, inet->saddr,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+ IPPROTO_DCCP,
+ inet->sport, usin->sin_port, sk);
+ if (tmp < 0)
+ return tmp;
+
+ if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { /* multicast/broadcast destinations are refused */
+ ip_rt_put(rt);
+ return -ENETUNREACH;
+ }
+
+ if (inet->opt == NULL || !inet->opt->srr)
+ daddr = rt->rt_dst;
+
+ if (inet->saddr == 0)
+ inet->saddr = rt->rt_src; /* unbound socket: take the source address chosen by routing */
+ inet->rcv_saddr = inet->saddr;
+
+ inet->dport = usin->sin_port;
+ inet->daddr = daddr;
+
+ dp->dccps_ext_header_len = 0;
+ if (inet->opt != NULL)
+ dp->dccps_ext_header_len = inet->opt->optlen;
+ /*
+ * Socket identity is still unknown (sport may be zero).
+ * However we set state to DCCP_REQUESTING and not releasing socket
+ * lock select source port, enter ourselves into the hash tables and
+ * complete initialization after this.
+ */
+ dccp_set_state(sk, DCCP_REQUESTING);
+ err = dccp_v4_hash_connect(sk);
+ if (err != 0)
+ goto failure;
+
+ err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
+ if (err != 0)
+ goto failure;
+
+ /* OK, now commit destination to socket. */
+ sk_setup_caps(sk, &rt->u.dst);
+
+ dp->dccps_gar =
+ dp->dccps_iss = secure_dccp_sequence_number(inet->saddr,
+ inet->daddr,
+ inet->sport,
+ usin->sin_port);
+ dccp_update_gss(sk, dp->dccps_iss);
+
+ /*
+ * SWL and AWL are initially adjusted so that they are not less than
+ * the initial Sequence Numbers received and sent, respectively:
+ * SWL := max(GSR + 1 - floor(W/4), ISR),
+ * AWL := max(GSS - W' + 1, ISS).
+ * These adjustments MUST be applied only at the beginning of the
+ * connection.
+ */
+ dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
+
+ inet->id = dp->dccps_iss ^ jiffies;
+
+ err = dccp_connect(sk);
+ rt = NULL; /* so the failure path below passes NULL to ip_rt_put(); presumably the route now belongs to sk - confirm */
+ if (err != 0)
+ goto failure;
+out:
+ return err;
+failure:
+ /*
+ * This unhashes the socket and releases the local port, if necessary.
+ */
+ dccp_set_state(sk, DCCP_CLOSED);
+ ip_rt_put(rt);
+ sk->sk_route_caps = 0;
+ inet->dport = 0;
+ goto out;
+}
+
+/*
+ * This routine does path mtu discovery as defined in RFC1191.
+ */
+static inline void dccp_do_pmtu_discovery(struct sock *sk,
+ const struct iphdr *iph,
+ u32 mtu)
+{
+ struct dst_entry *dst;
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct dccp_sock *dp = dccp_sk(sk); /* note: the iph parameter is not used in this function */
+
+ /* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs
+ * send out by Linux are always < 576bytes so they should go through
+ * unfragmented).
+ */
+ if (sk->sk_state == DCCP_LISTEN)
+ return;
+
+ /* We don't check in the destentry if pmtu discovery is forbidden
+ * on this route. We just assume that no packet_to_big packets
+ * are send back when pmtu discovery is not active.
+ * There is a small race when the user changes this flag in the
+ * route, but I think that's acceptable.
+ */
+ if ((dst = __sk_dst_check(sk, 0)) == NULL)
+ return;
+
+ dst->ops->update_pmtu(dst, mtu);
+
+ /* Something is about to be wrong... Remember soft error
+ * for the case, if this connection will not able to recover.
+ */
+ if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
+ sk->sk_err_soft = EMSGSIZE;
+
+ mtu = dst_mtu(dst); /* from here on use the route's MTU, not the value passed in */
+
+ if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+ dp->dccps_pmtu_cookie > mtu) { /* only act when the cached path MTU has to shrink */
+ dccp_sync_mss(sk, mtu);
+
+ /*
+ * From: draft-ietf-dccp-spec-11.txt
+ *
+ * DCCP-Sync packets are the best choice for upward
+ * probing, since DCCP-Sync probes do not risk application
+ * data loss.
+ */
+ dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
+ } /* else let the usual retransmit timer handle it */
+}
+
+/* Send a stand-alone Ack in reply to rxskb through the DCCP control socket. */
+static void dccp_v4_ctl_send_ack(struct sk_buff *rxskb)
+{
+	int err;
+	struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
+	const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) +
+				     sizeof(struct dccp_hdr_ext) +
+				     sizeof(struct dccp_hdr_ack_bits);
+	struct sk_buff *skb;
+
+	if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL)
+		return;
+
+	skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+	/* Reserve space for headers. */
+	skb_reserve(skb, MAX_DCCP_HEADER);
+
+	skb->dst = dst_clone(rxskb->dst);
+
+	skb->h.raw = skb_push(skb, dccp_hdr_ack_len);
+	dh = dccp_hdr(skb);
+	memset(dh, 0, dccp_hdr_ack_len);
+
+	/* Build DCCP header (NOTE(review): no checksum is set here - confirm). */
+	dh->dccph_type	= DCCP_PKT_ACK;
+	dh->dccph_sport	= rxdh->dccph_dport;
+	dh->dccph_dport	= rxdh->dccph_sport;
+	dh->dccph_doff	= dccp_hdr_ack_len / 4;
+	dh->dccph_x	= 1;
+	/* Our seqno echoes the peer's ackno; our ackno is the peer's seqno. */
+	dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq);
+	dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
+			 DCCP_SKB_CB(rxskb)->dccpd_seq);
+
+	bh_lock_sock(dccp_ctl_socket->sk);
+	err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk,
+				    rxskb->nh.iph->daddr,
+				    rxskb->nh.iph->saddr, NULL);
+	bh_unlock_sock(dccp_ctl_socket->sk);
+
+	/* An Ack is an outgoing segment but not a Reset: count OUTSEGS only. */
+	if (err == NET_XMIT_CN || err == 0)
+		DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+}
+
+static void dccp_v4_reqsk_send_ack(struct sk_buff *skb,
+ struct request_sock *req)
+{
+ dccp_v4_ctl_send_ack(skb); /* req is unused: the Ack is built from the received skb alone */
+}
+
+static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
+				 struct dst_entry *dst)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct sk_buff *resp;
+	int rc = -1;
+
+	/* Use the caller's route if one was passed in, else look one up. */
+	if (dst == NULL) {
+		dst = inet_csk_route_req(sk, req);
+		if (dst == NULL)
+			goto release;
+	}
+
+	resp = dccp_make_response(sk, dst, req);
+	if (resp == NULL)
+		goto release;
+
+	memset(&(IPCB(resp)->opt), 0, sizeof(IPCB(resp)->opt));
+	rc = ip_build_and_send_pkt(resp, sk, ireq->loc_addr, ireq->rmt_addr,
+				   ireq->opt);
+	if (rc == NET_XMIT_CN)	/* NET_XMIT_CN is reported as success */
+		rc = 0;
+release:
+	dst_release(dst);	/* dst may still be NULL on this path */
+	return rc;
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some sort of error
+ * condition. If err < 0 then the socket should be closed and the error
+ * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code.
+ * After adjustment header points to the first 8 bytes of the DCCP header. We
+ * need to find the appropriate port.
+ *
+ * The locking strategy used here is very "optimistic". When someone else
+ * accesses the socket the ICMP is just dropped and for some paths there is no
+ * check at all. A more general error queue to queue errors for later handling
+ * is probably better.
+ */
+void dccp_v4_err(struct sk_buff *skb, u32 info)
+{
+	const struct iphdr *iph = (struct iphdr *)skb->data;
+	const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data +
+							(iph->ihl << 2));
+	struct dccp_sock *dp;
+	struct inet_sock *inet;
+	const int type = skb->h.icmph->type;
+	const int code = skb->h.icmph->code;
+	struct sock *sk;
+	__u64 seq;
+	int err;
+
+	if (skb->len < (iph->ihl << 2) + 8) {	/* need the IP header plus 8 bytes */
+		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+		return;
+	}
+
+	sk = inet_lookup(&dccp_hashinfo, iph->daddr, dh->dccph_dport,
+			 iph->saddr, dh->dccph_sport, inet_iif(skb));
+	if (sk == NULL) {
+		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+		return;
+	}
+
+	if (sk->sk_state == DCCP_TIME_WAIT) {
+		inet_twsk_put((struct inet_timewait_sock *)sk);
+		return;
+	}
+
+	bh_lock_sock(sk);
+	/* If too many ICMPs get dropped on busy
+	 * servers this needs to be solved differently.
+	 */
+	if (sock_owned_by_user(sk))
+		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
+
+	if (sk->sk_state == DCCP_CLOSED)
+		goto out;
+
+	dp = dccp_sk(sk);
+	seq = dccp_hdr_seq(skb);	/* NOTE(review): reads via skb, not the embedded dh - confirm */
+	if (sk->sk_state != DCCP_LISTEN &&
+	    !between48(seq, dp->dccps_swl, dp->dccps_swh)) {
+		NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);	/* _BH like every other counter here */
+		goto out;
+	}
+
+	switch (type) {
+	case ICMP_SOURCE_QUENCH:
+		/* Just silently ignore these. */
+		goto out;
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		break;
+	case ICMP_DEST_UNREACH:
+		if (code > NR_ICMP_UNREACH)
+			goto out;
+
+		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+			if (!sock_owned_by_user(sk))
+				dccp_do_pmtu_discovery(sk, iph, info);
+			goto out;
+		}
+
+		err = icmp_err_convert[code].errno;
+		break;
+	case ICMP_TIME_EXCEEDED:
+		err = EHOSTUNREACH;
+		break;
+	default:
+		goto out;
+	}
+
+	switch (sk->sk_state) {
+		struct request_sock *req, **prev;
+	case DCCP_LISTEN:
+		if (sock_owned_by_user(sk))
+			goto out;
+		req = inet_csk_search_req(sk, &prev, dh->dccph_dport,
+					  iph->daddr, iph->saddr);
+		if (!req)
+			goto out;
+
+		/*
+		 * ICMPs are not backlogged, hence we cannot get an established
+		 * socket here.
+		 */
+		BUG_TRAP(!req->sk);
+
+		if (seq != dccp_rsk(req)->dreq_iss) {
+			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
+			goto out;
+		}
+		/*
+		 * Still in RESPOND, just remove it silently.
+		 * There is no good way to pass the error to the newly
+		 * created socket, and POSIX does not want network
+		 * errors returned from accept().
+		 */
+		inet_csk_reqsk_queue_drop(sk, req, prev);
+		goto out;
+
+	case DCCP_REQUESTING:
+	case DCCP_RESPOND:
+		if (!sock_owned_by_user(sk)) {
+			DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+			sk->sk_err = err;
+
+			sk->sk_error_report(sk);
+
+			dccp_done(sk);
+		} else
+			sk->sk_err_soft = err;
+		goto out;
+	}
+
+	/* If we've already connected we will keep trying
+	 * until we time out, or the user gives up.
+	 *
+	 * rfc1122 4.2.3.9 allows to consider as hard errors
+	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
+	 * but it is obsoleted by pmtu discovery).
+	 *
+	 * Note, that in modern internet, where routing is unreliable
+	 * and in each dark corner broken firewalls sit, sending random
+	 * errors ordered by their masters even this two messages finally lose
+	 * their original sense (even Linux sends invalid PORT_UNREACHs)
+	 *
+	 * Now we are in compliance with RFCs.
+	 *							--ANK (980905)
+	 */
+
+	inet = inet_sk(sk);
+	if (!sock_owned_by_user(sk) && inet->recverr) {
+		sk->sk_err = err;
+		sk->sk_error_report(sk);
+	} else /* Only an error on timeout */
+		sk->sk_err_soft = err;
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/*
+ * Build a DCCP Reset carrying @code for the connection owned by @sk and
+ * hand it to the IP layer.
+ *
+ * Returns the error from inet_sk_rebuild_header() if that fails, otherwise
+ * the result of ip_build_and_send_pkt() with NET_XMIT_CN mapped to 0.
+ * NOTE(review): when dccp_make_reset() returns NULL nothing is sent and 0
+ * is still returned - confirm callers are fine with that.
+ */
+int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code)
+{
+	const struct inet_sock *inet;
+	struct sk_buff *skb;
+	/*
+	 * FIXME: what if rebuild_header fails?
+	 * Should we be doing a rebuild_header here?
+	 */
+	int rc = inet_sk_rebuild_header(sk);
+
+	if (rc != 0)
+		return rc;
+
+	skb = dccp_make_reset(sk, sk->sk_dst_cache, code);
+	if (skb == NULL)
+		return rc;
+
+	inet = inet_sk(sk);
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	rc = ip_build_and_send_pkt(skb, sk, inet->saddr, inet->daddr, NULL);
+
+	/* Congestion notification still means the packet was queued. */
+	return rc == NET_XMIT_CN ? 0 : rc;
+}
+
+/*
+ * Pick the initial sequence number used to answer the request in @skb.
+ * The value comes from secure_dccp_sequence_number(), fed with the packet's
+ * destination and source addresses followed by its destination and source
+ * ports. @sk is not used.
+ */
+static inline u64 dccp_v4_init_sequence(const struct sock *sk,
+					const struct sk_buff *skb)
+{
+	const struct iphdr *iph = skb->nh.iph;
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+
+	return secure_dccp_sequence_number(iph->daddr, iph->saddr,
+					   dh->dccph_dport, dh->dccph_sport);
+}
+
+/*
+ * Returns non-zero when @service is neither the socket's own service code
+ * nor present in its service list, 0 when it is acceptable.
+ */
+static inline int dccp_bad_service_code(const struct sock *sk,
+					const __u32 service)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+
+	return dp->dccps_service != service &&
+	       !dccp_list_has_service(dp->dccps_service_list, service);
+}
+/*
+ * Handle a DCCP Request packet (@skb) arriving on @sk: allocate a
+ * request_sock, record addresses, service code and initial sequence numbers
+ * in it, send the Response and queue the request with DCCP_TIMEOUT_INIT.
+ *
+ * Returns 0 when the request was queued. On any failure returns -1 after
+ * bumping DCCP_MIB_ATTEMPTFAILS and storing the reset code to use in
+ * DCCP_SKB_CB(skb)->dccpd_reset_code (TOO_BUSY unless one of the checks
+ * below picked a more specific code).
+ */
+int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+ struct inet_request_sock *ireq;
+ struct dccp_sock dp; /* NOTE(review): never initialised here, only handed to dccp_openreq_init() - confirm it is a placeholder */
+ struct request_sock *req;
+ struct dccp_request_sock *dreq;
+ const __u32 saddr = skb->nh.iph->saddr;
+ const __u32 daddr = skb->nh.iph->daddr;
+ const __u32 service = dccp_hdr_request(skb)->dccph_req_service;
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+ __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
+ struct dst_entry *dst = NULL; /* stays NULL; passed as-is to dccp_v4_send_response() */
+
+ /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */
+ if (((struct rtable *)skb->dst)->rt_flags &
+ (RTCF_BROADCAST | RTCF_MULTICAST)) {
+ reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+ goto drop;
+ }
+
+ if (dccp_bad_service_code(sk, service)) {
+ reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
+ goto drop;
+ }
+ /*
+ * TW buckets are converted to open requests without
+ * limitations, they conserve resources and peer is
+ * evidently real one.
+ */
+ if (inet_csk_reqsk_queue_is_full(sk))
+ goto drop;
+
+ /*
+ * Accept backlog is full. If we have already queued enough
+ * of warm entries in syn queue, drop request. It is better than
+ * clogging syn queue with openreqs with exponentially increasing
+ * timeout.
+ */
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+ goto drop;
+
+ req = reqsk_alloc(sk->sk_prot->rsk_prot);
+ if (req == NULL)
+ goto drop;
+
+ /* FIXME: process options */
+
+ dccp_openreq_init(req, &dp, skb);
+
+ ireq = inet_rsk(req);
+ ireq->loc_addr = daddr;
+ ireq->rmt_addr = saddr;
+ /* FIXME: Merge Aristeu's option parsing code when ready */
+ req->rcv_wnd = 100; /* Fake, option parsing will get the
+ right value */
+ ireq->opt = NULL;
+
+ /*
+ * Step 3: Process LISTEN state
+ *
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+ *
+ * In fact we defer setting S.GSR, S.SWL, S.SWH to
+ * dccp_create_openreq_child.
+ */
+ dreq = dccp_rsk(req);
+ dreq->dreq_isr = dcb->dccpd_seq;
+ dreq->dreq_iss = dccp_v4_init_sequence(sk, skb);
+ dreq->dreq_service = service;
+
+ if (dccp_v4_send_response(sk, req, dst))
+ goto drop_and_free;
+
+ inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
+ return 0;
+
+drop_and_free:
+ /*
+ * FIXME: should be reqsk_free after implementing req->rsk_ops
+ */
+ __reqsk_free(req);
+drop:
+ DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ dcb->dccpd_reset_code = reset_code;
+ return -1;
+}
+
+/*
+ * The three way handshake has completed - we got a valid ACK or DATAACK -
+ * now create the new socket.
+ *
+ * This is the equivalent of TCP's tcp_v4_syn_recv_sock
+ *
+ * @dst may be NULL, in which case the route is looked up with
+ * inet_csk_route_req(). On success the child is set up from @req, hashed,
+ * made to inherit the listener's port and returned. On failure the
+ * LISTENDROPS counter (plus LISTENOVERFLOWS when the accept queue is full)
+ * is bumped, @dst is released and NULL is returned.
+ */
+struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct inet_request_sock *ireq;
+ struct inet_sock *newinet;
+ struct dccp_sock *newdp;
+ struct sock *newsk;
+
+ if (sk_acceptq_is_full(sk))
+ goto exit_overflow;
+
+ if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
+ goto exit;
+
+ newsk = dccp_create_openreq_child(sk, req, skb);
+ if (newsk == NULL)
+ goto exit;
+
+ sk_setup_caps(newsk, dst);
+
+ newdp = dccp_sk(newsk); /* NOTE(review): assigned but not read again in this function */
+ newinet = inet_sk(newsk);
+ ireq = inet_rsk(req);
+ newinet->daddr = ireq->rmt_addr;
+ newinet->rcv_saddr = ireq->loc_addr;
+ newinet->saddr = ireq->loc_addr;
+ newinet->opt = ireq->opt; /* the options pointer moves from the request to the child */
+ ireq->opt = NULL;
+ newinet->mc_index = inet_iif(skb);
+ newinet->mc_ttl = skb->nh.iph->ttl;
+ newinet->id = jiffies;
+
+ dccp_sync_mss(newsk, dst_mtu(dst));
+
+ __inet_hash(&dccp_hashinfo, newsk, 0);
+ __inet_inherit_port(&dccp_hashinfo, sk, newsk);
+
+ return newsk;
+
+exit_overflow:
+ NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
+exit:
+ NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
+ dst_release(dst);
+ return NULL;
+}
+/*
+ * Find the socket that should process @skb, which arrived on @sk.
+ *
+ * - If a pending connection request matches, the result of dccp_check_req()
+ *   is returned.
+ * - Otherwise, if the established hash holds a matching socket that is not
+ *   in DCCP_TIME_WAIT, it is returned with bh_lock_sock() held.
+ * - A matching DCCP_TIME_WAIT entry is released and NULL is returned.
+ * - With no match at all, @sk itself is returned.
+ */
+static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh = dccp_hdr(skb);
+ const struct iphdr *iph = skb->nh.iph;
+ struct sock *nsk;
+ struct request_sock **prev;
+ /* Find possible connection requests. */
+ struct request_sock *req = inet_csk_search_req(sk, &prev,
+ dh->dccph_sport,
+ iph->saddr, iph->daddr);
+ if (req != NULL)
+ return dccp_check_req(sk, skb, req, prev);
+
+ nsk = __inet_lookup_established(&dccp_hashinfo,
+ iph->saddr, dh->dccph_sport,
+ iph->daddr, ntohs(dh->dccph_dport),
+ inet_iif(skb));
+ if (nsk != NULL) {
+ if (nsk->sk_state != DCCP_TIME_WAIT) {
+ bh_lock_sock(nsk); /* the caller gets nsk locked */
+ return nsk;
+ }
+ inet_twsk_put((struct inet_timewait_sock *)nsk);
+ return NULL;
+ }
+
+ return sk;
+}
+
+/*
+ * Compute the DCCP checksum of @skb for the given address pair.
+ *
+ * A checksum coverage field of 0 covers the whole packet; otherwise
+ * (dccph_cscov + dccph_x) 32-bit words are covered, capped at the packet
+ * length. The partial sum over that span is folded together with the
+ * pseudo-header by csum_tcpudp_magic().
+ */
+int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr,
+		     const u32 daddr)
+{
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+	int checksum_len = skb->len;
+	u32 partial_sum;
+
+	if (dh->dccph_cscov != 0) {
+		const int covered = (dh->dccph_cscov + dh->dccph_x) *
+				    sizeof(u32);
+
+		checksum_len = covered < skb->len ? covered : skb->len;
+	}
+
+	partial_sum = csum_partial((unsigned char *)dh, checksum_len, 0);
+
+	return csum_tcpudp_magic(saddr, daddr, checksum_len,
+				 IPPROTO_DCCP, partial_sum);
+}
+
+/*
+ * Check the checksum of a received packet.
+ *
+ * The covered length is chosen exactly as in dccp_v4_checksum(): the whole
+ * packet when dccph_cscov is 0, otherwise (dccph_cscov + dccph_x) 32-bit
+ * words capped at the packet length. Returns 0 when the folded sum is zero
+ * (checksum correct) and -1 otherwise.
+ */
+static int dccp_v4_verify_checksum(struct sk_buff *skb,
+				   const u32 saddr, const u32 daddr)
+{
+	struct dccp_hdr *dh = dccp_hdr(skb);
+	int checksum_len = skb->len;
+	u32 sum;
+
+	if (dh->dccph_cscov != 0) {
+		const int partial = (dh->dccph_cscov + dh->dccph_x) *
+				    sizeof(u32);
+
+		if (partial < skb->len)
+			checksum_len = partial;
+	}
+
+	sum = csum_partial((unsigned char *)dh, checksum_len, 0);
+	if (csum_tcpudp_magic(saddr, daddr, checksum_len,
+			      IPPROTO_DCCP, sum) != 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Look up a route for a reply to @skb: addresses and ports of the received
+ * packet are swapped, and the interface it came in on is used as output
+ * interface. Returns the route's dst entry, or NULL (after bumping
+ * IPSTATS_MIB_OUTNOROUTES) when no route is found.
+ */
+static struct dst_entry* dccp_v4_route_skb(struct sock *sk,
+					   struct sk_buff *skb)
+{
+	const struct iphdr *iph = skb->nh.iph;
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+	struct rtable *rt;
+	struct flowi fl = {
+		.oif = ((struct rtable *)skb->dst)->rt_iif,
+		.nl_u = {
+			.ip4_u = {
+				.daddr = iph->saddr,
+				.saddr = iph->daddr,
+				.tos = RT_CONN_FLAGS(sk),
+			},
+		},
+		.proto = sk->sk_protocol,
+		.uli_u = {
+			.ports = {
+				.sport = dh->dccph_dport,
+				.dport = dh->dccph_sport,
+			},
+		},
+	};
+
+	if (ip_route_output_flow(&rt, &fl, sk, 0) != 0) {
+		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+		return NULL;
+	}
+
+	return &rt->u.dst;
+}
+/*
+ * Answer @rxskb with a Reset sent through the DCCP control socket
+ * (dccp_ctl_socket), i.e. without using a connection socket.
+ *
+ * Nothing is sent when @rxskb is itself a Reset, when its route type is not
+ * RTN_LOCAL, or when no route or skb can be obtained. The reset code is
+ * taken from DCCP_SKB_CB(rxskb)->dccpd_reset_code. The sequence number is
+ * the received acknowledgement number plus one, or 0 if the received packet
+ * carried none; the acknowledgement number is the received sequence number.
+ * DCCP_MIB_OUTSEGS and DCCP_MIB_OUTRSTS are bumped when the packet was
+ * handed to IP successfully.
+ */
+static void dccp_v4_ctl_send_reset(struct sk_buff *rxskb)
+{
+ int err;
+ struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
+ const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
+ sizeof(struct dccp_hdr_ext) +
+ sizeof(struct dccp_hdr_reset);
+ struct sk_buff *skb;
+ struct dst_entry *dst;
+ u64 seqno;
+
+ /* Never send a reset in response to a reset. */
+ if (rxdh->dccph_type == DCCP_PKT_RESET)
+ return;
+
+ if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL)
+ return;
+
+ dst = dccp_v4_route_skb(dccp_ctl_socket->sk, rxskb);
+ if (dst == NULL)
+ return;
+
+ skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC);
+ if (skb == NULL)
+ goto out;
+
+ /* Reserve space for headers. */
+ skb_reserve(skb, MAX_DCCP_HEADER);
+ skb->dst = dst_clone(dst); /* the skb gets its own reference; ours is dropped at out: */
+
+ skb->h.raw = skb_push(skb, dccp_hdr_reset_len);
+ dh = dccp_hdr(skb);
+ memset(dh, 0, dccp_hdr_reset_len);
+
+ /* Build DCCP header and checksum it. */
+ dh->dccph_type = DCCP_PKT_RESET;
+ dh->dccph_sport = rxdh->dccph_dport;
+ dh->dccph_dport = rxdh->dccph_sport;
+ dh->dccph_doff = dccp_hdr_reset_len / 4;
+ dh->dccph_x = 1;
+ dccp_hdr_reset(skb)->dccph_reset_code =
+ DCCP_SKB_CB(rxskb)->dccpd_reset_code;
+
+ /* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */
+ seqno = 0;
+ if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1);
+
+ dccp_hdr_set_seq(dh, seqno);
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
+ DCCP_SKB_CB(rxskb)->dccpd_seq);
+
+ dh->dccph_checksum = dccp_v4_checksum(skb, rxskb->nh.iph->saddr,
+ rxskb->nh.iph->daddr);
+
+ bh_lock_sock(dccp_ctl_socket->sk);
+ err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk,
+ rxskb->nh.iph->daddr,
+ rxskb->nh.iph->saddr, NULL);
+ bh_unlock_sock(dccp_ctl_socket->sk);
+
+ if (err == NET_XMIT_CN || err == 0) {
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+ }
+out:
+ dst_release(dst);
+}
+/*
+ * Process @skb on socket @sk; also installed as the protocol's backlog_rcv
+ * handler in dccp_v4_prot.
+ *
+ * DCCP_OPEN sockets go straight to dccp_rcv_established(). For DCCP_LISTEN
+ * sockets dccp_v4_hnd_req() selects the socket to use: NULL discards the
+ * packet, a different socket is handed to dccp_child_process(), and @sk
+ * itself falls through to dccp_rcv_state_process() like every other state.
+ *
+ * Always returns 0. When one of the processing functions reports failure a
+ * Reset is sent with dccp_v4_ctl_send_reset() and @skb is freed here.
+ */
+int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_hdr *dh = dccp_hdr(skb);
+
+ if (sk->sk_state == DCCP_OPEN) { /* Fast path */
+ if (dccp_rcv_established(sk, skb, dh, skb->len))
+ goto reset;
+ return 0;
+ }
+
+ /*
+ * Step 3: Process LISTEN state
+ * If S.state == LISTEN,
+ * If P.type == Request or P contains a valid Init Cookie
+ * option,
+ * * Must scan the packet's options to check for an Init
+ * Cookie. Only the Init Cookie is processed here,
+ * however; other options are processed in Step 8. This
+ * scan need only be performed if the endpoint uses Init
+ * Cookies *
+ * * Generate a new socket and switch to that socket *
+ * Set S := new socket for this port pair
+ * S.state = RESPOND
+ * Choose S.ISS (initial seqno) or set from Init Cookie
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+ * Continue with S.state == RESPOND
+ * * A Response packet will be generated in Step 11 *
+ * Otherwise,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ *
+ * NOTE: the check for the packet types is done in
+ * dccp_rcv_state_process
+ */
+ if (sk->sk_state == DCCP_LISTEN) {
+ struct sock *nsk = dccp_v4_hnd_req(sk, skb);
+
+ if (nsk == NULL)
+ goto discard;
+
+ if (nsk != sk) {
+ if (dccp_child_process(sk, nsk, skb))
+ goto reset;
+ return 0;
+ }
+ }
+
+ if (dccp_rcv_state_process(sk, skb, dh, skb->len))
+ goto reset;
+ return 0;
+
+reset:
+ dccp_v4_ctl_send_reset(skb);
+discard:
+ kfree_skb(skb);
+ return 0;
+}
+
+/*
+ * Step 1 header sanity checks on a received packet.
+ *
+ * Returns 0 if @skb carries a plausible DCCP header and 1 if it must be
+ * dropped: not addressed to this host, shorter than the generic header,
+ * unknown packet type, Data Offset too small for the packet type or too
+ * large for the packet, short sequence numbers on a type that requires
+ * long ones, or an incorrect checksum.
+ */
+static inline int dccp_invalid_packet(struct sk_buff *skb)
+{
+	const struct dccp_hdr *dh;
+
+	if (skb->pkt_type != PACKET_HOST)
+		return 1;
+
+	if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) {
+		LIMIT_NETDEBUG(KERN_WARNING "DCCP: pskb_may_pull failed\n");
+		return 1;
+	}
+
+	dh = dccp_hdr(skb);
+
+	/* If the packet type is not understood, drop packet and return */
+	if (dh->dccph_type >= DCCP_PKT_INVALID) {
+		LIMIT_NETDEBUG(KERN_WARNING "DCCP: invalid packet type\n");
+		return 1;
+	}
+
+	/*
+	 * If P.Data Offset is too small for packet type, or too large for
+	 * packet, drop packet and return
+	 */
+	if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
+		LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) "
+					    "too small 1\n",
+			       dh->dccph_doff);
+		return 1;
+	}
+
+	/*
+	 * Failing to pull dccph_doff words means the header claims more
+	 * bytes than the packet holds, so report the offset as too large
+	 * (this used to be logged as "too small 2").
+	 */
+	if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) {
+		LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) "
+					    "too large\n",
+			       dh->dccph_doff);
+		return 1;
+	}
+
+	/* Re-read the header pointer after the second pull. */
+	dh = dccp_hdr(skb);
+
+	/*
+	 * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
+	 * has short sequence numbers), drop packet and return
+	 */
+	if (dh->dccph_x == 0 &&
+	    dh->dccph_type != DCCP_PKT_DATA &&
+	    dh->dccph_type != DCCP_PKT_ACK &&
+	    dh->dccph_type != DCCP_PKT_DATAACK) {
+		LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.type (%s) not Data, Ack "
+					    "nor DataAck and P.X == 0\n",
+			       dccp_packet_name(dh->dccph_type));
+		return 1;
+	}
+
+	/* If the header checksum is incorrect, drop packet and return */
+	if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
+				    skb->nh.iph->daddr) < 0) {
+		LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is "
+					    "incorrect\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * This is called when real data arrives.
+ *
+ * Validates the header, fills in the sequence/type/ack fields of
+ * DCCP_SKB_CB(skb) and looks the packet up in dccp_hashinfo. With a usable
+ * socket the packet is processed by dccp_v4_do_rcv(), or put on the
+ * socket's backlog when the socket is owned by the user. With no socket,
+ * or a DCCP_TIME_WAIT one, a Reset(No Connection) is sent unless the packet
+ * is itself a Reset, and the packet is freed.
+ *
+ * Returns the result of dccp_v4_do_rcv() when it was called, 0 otherwise.
+ */
+int dccp_v4_rcv(struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh;
+ struct sock *sk;
+ int rc;
+
+ /* Step 1: Check header basics: */
+
+ if (dccp_invalid_packet(skb))
+ goto discard_it;
+
+ dh = dccp_hdr(skb);
+
+ DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
+ DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
+
+ dccp_pr_debug("%8.8s "
+ "src=%u.%u.%u.%u@%-5d "
+ "dst=%u.%u.%u.%u@%-5d seq=%llu",
+ dccp_packet_name(dh->dccph_type),
+ NIPQUAD(skb->nh.iph->saddr), ntohs(dh->dccph_sport),
+ NIPQUAD(skb->nh.iph->daddr), ntohs(dh->dccph_dport),
+ (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
+
+ if (dccp_packet_without_ack(skb)) {
+ DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
+ dccp_pr_debug_cat("\n");
+ } else {
+ DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
+ dccp_pr_debug_cat(", ack=%llu\n",
+ (unsigned long long)
+ DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ }
+
+ /* Step 2:
+ * Look up flow ID in table and get corresponding socket */
+ sk = __inet_lookup(&dccp_hashinfo,
+ skb->nh.iph->saddr, dh->dccph_sport,
+ skb->nh.iph->daddr, ntohs(dh->dccph_dport),
+ inet_iif(skb));
+
+ /*
+ * Step 2:
+ * If no socket ...
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+ if (sk == NULL) {
+ dccp_pr_debug("failed to look up flow ID in table and "
+ "get corresponding socket\n");
+ goto no_dccp_socket;
+ }
+
+ /*
+ * Step 2:
+ * ... or S.state == TIMEWAIT,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+
+ if (sk->sk_state == DCCP_TIME_WAIT) {
+ dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: "
+ "do_time_wait\n");
+ goto do_time_wait;
+ }
+
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ dccp_pr_debug("xfrm4_policy_check failed\n");
+ goto discard_and_relse;
+ }
+
+ if (sk_filter(sk, skb, 0)) {
+ dccp_pr_debug("sk_filter failed\n");
+ goto discard_and_relse;
+ }
+
+ skb->dev = NULL;
+
+ bh_lock_sock(sk);
+ rc = 0;
+ if (!sock_owned_by_user(sk))
+ rc = dccp_v4_do_rcv(sk, skb);
+ else
+ sk_add_backlog(sk, skb); /* later handled through the backlog_rcv hook, dccp_v4_do_rcv() */
+ bh_unlock_sock(sk);
+
+ sock_put(sk); /* drop the reference obtained by the lookup above */
+ return rc;
+
+no_dccp_socket:
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto discard_it;
+ /*
+ * Step 2:
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+ if (dh->dccph_type != DCCP_PKT_RESET) {
+ DCCP_SKB_CB(skb)->dccpd_reset_code =
+ DCCP_RESET_CODE_NO_CONNECTION;
+ dccp_v4_ctl_send_reset(skb);
+ }
+
+discard_it:
+ /* Discard frame. */
+ kfree_skb(skb);
+ return 0;
+
+discard_and_relse:
+ sock_put(sk);
+ goto discard_it;
+
+do_time_wait:
+ inet_twsk_put((struct inet_timewait_sock *)sk);
+ goto no_dccp_socket;
+}
+/*
+ * Protocol init hook (dccp_v4_prot.init): set up the DCCP-specific state of
+ * a freshly created socket - default options, epoch timestamp, the receive
+ * ack vector when ack vectors are enabled, both CCIDs, the transmit timers
+ * and the initial state, MSS cache, role and service code.
+ *
+ * The very first call skips CCID setup and only clears the static
+ * dccp_ctl_socket_init flag; judging by the flag's name that first socket
+ * is the control socket (NOTE(review): confirm against the module init
+ * code).
+ *
+ * Returns 0 on success or -ENOMEM when the ack vector or a CCID cannot be
+ * allocated; in the CCID case whatever was set up here is released again.
+ */
+static int dccp_v4_init_sock(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ static int dccp_ctl_socket_init = 1; /* 1 only until the first call has gone through the else branch below */
+
+ dccp_options_init(&dp->dccps_options);
+ do_gettimeofday(&dp->dccps_epoch);
+
+ if (dp->dccps_options.dccpo_send_ack_vector) {
+ dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(DCCP_MAX_ACKVEC_LEN,
+ GFP_KERNEL);
+ if (dp->dccps_hc_rx_ackvec == NULL)
+ return -ENOMEM;
+ }
+
+ /*
+ * FIXME: We're hardcoding the CCID, and doing this at this point makes
+ * the listening (master) sock get CCID control blocks, which is not
+ * necessary, but for now, to not mess with the test userspace apps,
+ * lets leave it here, later the real solution is to do this in a
+ * setsockopt(CCIDs-I-want/accept). -acme
+ */
+ if (likely(!dccp_ctl_socket_init)) {
+ dp->dccps_hc_rx_ccid = ccid_init(dp->dccps_options.dccpo_rx_ccid,
+ sk);
+ dp->dccps_hc_tx_ccid = ccid_init(dp->dccps_options.dccpo_tx_ccid,
+ sk);
+ if (dp->dccps_hc_rx_ccid == NULL ||
+ dp->dccps_hc_tx_ccid == NULL) {
+ ccid_exit(dp->dccps_hc_rx_ccid, sk);
+ ccid_exit(dp->dccps_hc_tx_ccid, sk);
+ if (dp->dccps_options.dccpo_send_ack_vector) {
+ dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+ dp->dccps_hc_rx_ackvec = NULL;
+ }
+ dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+ return -ENOMEM;
+ }
+ } else
+ dccp_ctl_socket_init = 0;
+
+ dccp_init_xmit_timers(sk);
+ inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT;
+ sk->sk_state = DCCP_CLOSED;
+ sk->sk_write_space = dccp_write_space;
+ dp->dccps_mss_cache = 536;
+ dp->dccps_role = DCCP_ROLE_UNDEFINED;
+ dp->dccps_service = DCCP_SERVICE_INVALID_VALUE;
+
+ return 0;
+}
+
+/*
+ * Protocol destroy hook: release the DCCP-specific resources still attached
+ * to @sk - the packet kept in sk_send_head, the bind bucket reference, the
+ * service list, the receive ack vector (when ack vectors are enabled) and
+ * both CCID control blocks. Always returns 0.
+ */
+static int dccp_v4_destroy_sock(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	/*
+	 * DCCP doesn't use sk_write_queue, just sk_send_head
+	 * for retransmissions
+	 */
+	if (sk->sk_send_head != NULL) {
+		kfree_skb(sk->sk_send_head);
+		sk->sk_send_head = NULL;
+	}
+
+	/* Clean up a referenced DCCP bind bucket. */
+	if (inet_csk(sk)->icsk_bind_hash != NULL)
+		inet_put_port(&dccp_hashinfo, sk);
+
+	/* kfree() accepts NULL, so the list pointer needs no guard. */
+	kfree(dp->dccps_service_list);
+	dp->dccps_service_list = NULL;
+
+	/* Shut down both half-connections before giving up the CCIDs. */
+	ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
+	ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
+
+	if (dp->dccps_options.dccpo_send_ack_vector) {
+		dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+		dp->dccps_hc_rx_ackvec = NULL;
+	}
+
+	ccid_exit(dp->dccps_hc_rx_ccid, sk);
+	ccid_exit(dp->dccps_hc_tx_ccid, sk);
+	dp->dccps_hc_tx_ccid = NULL;
+	dp->dccps_hc_rx_ccid = NULL;
+
+	return 0;
+}
+
+/* Request-sock destructor: free the IP options attached to @req, if any. */
+static void dccp_v4_reqsk_destructor(struct request_sock *req)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+
+	kfree(ireq->opt);
+}
+/*
+ * Operations for DCCP request socks over IPv4, installed through
+ * dccp_v4_prot.rsk_prot. dccp_check_req() reaches rtx_syn_ack and
+ * send_reset through req->rsk_ops.
+ */
+static struct request_sock_ops dccp_request_sock_ops = {
+ .family = PF_INET,
+ .obj_size = sizeof(struct dccp_request_sock),
+ .rtx_syn_ack = dccp_v4_send_response,
+ .send_ack = dccp_v4_reqsk_send_ack,
+ .destructor = dccp_v4_reqsk_destructor,
+ .send_reset = dccp_v4_ctl_send_reset,
+};
+/*
+ * Protocol descriptor for DCCP over IPv4: wires the socket operations to
+ * their DCCP implementations. dccp_v4_do_rcv() doubles as the backlog
+ * handler, and accept is served by the generic inet_csk_accept().
+ */
+struct proto dccp_v4_prot = {
+ .name = "DCCP",
+ .owner = THIS_MODULE,
+ .close = dccp_close,
+ .connect = dccp_v4_connect,
+ .disconnect = dccp_disconnect,
+ .ioctl = dccp_ioctl,
+ .init = dccp_v4_init_sock,
+ .setsockopt = dccp_setsockopt,
+ .getsockopt = dccp_getsockopt,
+ .sendmsg = dccp_sendmsg,
+ .recvmsg = dccp_recvmsg,
+ .backlog_rcv = dccp_v4_do_rcv,
+ .hash = dccp_v4_hash,
+ .unhash = dccp_v4_unhash,
+ .accept = inet_csk_accept,
+ .get_port = dccp_v4_get_port,
+ .shutdown = dccp_shutdown,
+ .destroy = dccp_v4_destroy_sock,
+ .orphan_count = &dccp_orphan_count,
+ .max_header = MAX_DCCP_HEADER,
+ .obj_size = sizeof(struct dccp_sock),
+ .rsk_prot = &dccp_request_sock_ops,
+ .twsk_obj_size = sizeof(struct inet_timewait_sock),
+};
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
new file mode 100644
index 00000000000..1393461898b
--- /dev/null
+++ b/net/dccp/minisocks.c
@@ -0,0 +1,268 @@
+/*
+ * net/dccp/minisocks.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+
+#include <net/sock.h>
+#include <net/xfrm.h>
+#include <net/inet_timewait_sock.h>
+
+#include "ackvec.h"
+#include "ccid.h"
+#include "dccp.h"
+/*
+ * Time-wait bookkeeping for DCCP, tied to dccp_hashinfo. dccp_time_wait()
+ * checks tw_count against sysctl_max_tw_buckets before allocating a new
+ * time-wait socket and schedules it on this structure. The timers and the
+ * work item are initialised to the generic inet_twdr_* handlers with this
+ * structure as their argument.
+ */
+struct inet_timewait_death_row dccp_death_row = {
+ .sysctl_max_tw_buckets = NR_FILE * 2,
+ .period = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
+ .death_lock = SPIN_LOCK_UNLOCKED,
+ .hashinfo = &dccp_hashinfo,
+ .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
+ (unsigned long)&dccp_death_row),
+ .twkill_work = __WORK_INITIALIZER(dccp_death_row.twkill_work,
+ inet_twdr_twkill_work,
+ &dccp_death_row),
+/* Short-time timewait calendar */
+
+ .twcal_hand = -1,
+ .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
+ (unsigned long)&dccp_death_row),
+};
+
+/*
+ * Move @sk into a time-wait state and finish it off with dccp_done().
+ *
+ * A time-wait socket is only allocated while the number of existing ones is
+ * below sysctl_max_tw_buckets. When one is obtained it takes over @sk's
+ * place in the hash tables and is scheduled to expire: after
+ * DCCP_TIMEWAIT_LEN if @state is DCCP_TIME_WAIT, otherwise after @timeo but
+ * never sooner than 3.5 times the current RTO. Without a time-wait socket
+ * only a rate-limited message is logged.
+ */
+void dccp_time_wait(struct sock *sk, int state, int timeo)
+{
+	struct inet_timewait_sock *tw;
+
+	tw = dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets ?
+		inet_twsk_alloc(sk, state) : NULL;
+
+	if (tw == NULL) {
+		/* Sorry, if we're out of memory, just CLOSE this
+		 * socket up.  We've got bigger problems than
+		 * non-graceful socket closings.
+		 */
+		LIMIT_NETDEBUG(KERN_INFO "DCCP: time wait bucket "
+					 "table overflow\n");
+	} else {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+		/* 4 * rto - rto / 2, i.e. 3.5 * rto */
+		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+
+		/* Linkage updates. */
+		__inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
+
+		/* Get the TIME_WAIT timeout firing. */
+		tw->tw_timeout = DCCP_TIMEWAIT_LEN;
+		if (state == DCCP_TIME_WAIT)
+			timeo = DCCP_TIMEWAIT_LEN;
+		else if (timeo < rto)
+			timeo = rto;
+
+		inet_twsk_schedule(tw, &dccp_death_row, timeo,
+				   DCCP_TIMEWAIT_LEN);
+		inet_twsk_put(tw);
+	}
+
+	dccp_done(sk);
+}
+
+/*
+ * Clone the listening socket @sk into the child socket for connection
+ * request @req and initialise the child's DCCP state: server role, service
+ * code, RTO, epoch, ack vector (when enabled), CCID half-connections,
+ * sequence-number state derived from the request's ISR/ISS, and timers.
+ *
+ * Returns the new socket, or NULL when the clone, the ack vector allocation
+ * or a CCID half-connection init fails (the partially set up child is then
+ * freed here). @skb is not used.
+ */
+struct sock *dccp_create_openreq_child(struct sock *sk,
+				       const struct request_sock *req,
+				       const struct sk_buff *skb)
+{
+	/*
+	 * Step 3: Process LISTEN state
+	 *
+	 * // Generate a new socket and switch to that socket
+	 * Set S := new socket for this port pair
+	 */
+	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
+
+	if (newsk != NULL) {
+		const struct dccp_request_sock *dreq = dccp_rsk(req);
+		/*
+		 * Must be the child's connection sock: taking it from the
+		 * listener (sk) would write the RTO below into the parent.
+		 */
+		struct inet_connection_sock *newicsk = inet_csk(newsk);
+		struct dccp_sock *newdp = dccp_sk(newsk);
+
+		newdp->dccps_role	   = DCCP_ROLE_SERVER;
+		newdp->dccps_hc_rx_ackvec  = NULL;
+		newdp->dccps_service_list  = NULL;
+		newdp->dccps_service	   = dreq->dreq_service;
+		newicsk->icsk_rto	   = DCCP_TIMEOUT_INIT;
+		do_gettimeofday(&newdp->dccps_epoch);
+
+		if (newdp->dccps_options.dccpo_send_ack_vector) {
+			newdp->dccps_hc_rx_ackvec =
+				dccp_ackvec_alloc(DCCP_MAX_ACKVEC_LEN,
+						  GFP_ATOMIC);
+			if (unlikely(newdp->dccps_hc_rx_ackvec == NULL))
+				goto out_free;
+		}
+
+		if (unlikely(ccid_hc_rx_init(newdp->dccps_hc_rx_ccid,
+					     newsk) != 0 ||
+			     ccid_hc_tx_init(newdp->dccps_hc_tx_ccid,
+					     newsk) != 0)) {
+			dccp_ackvec_free(newdp->dccps_hc_rx_ackvec);
+			ccid_hc_rx_exit(newdp->dccps_hc_rx_ccid, newsk);
+			ccid_hc_tx_exit(newdp->dccps_hc_tx_ccid, newsk);
+out_free:
+			/* It is still raw copy of parent, so invalidate
+			 * destructor and make plain sk_free() */
+			newsk->sk_destruct = NULL;
+			sk_free(newsk);
+			return NULL;
+		}
+
+		/*
+		 * XXX: We're using the same CCIDs set on the parent,
+		 * i.e. sk_clone copied the master sock and left the
+		 * CCID pointers for this child, that is why we do the
+		 * __ccid_get calls.
+		 */
+		__ccid_get(newdp->dccps_hc_rx_ccid);
+		__ccid_get(newdp->dccps_hc_tx_ccid);
+
+		/*
+		 * Step 3: Process LISTEN state
+		 *
+		 *	Choose S.ISS (initial seqno) or set from Init Cookie
+		 *	Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init
+		 *	Cookie
+		 */
+
+		/* See dccp_v4_conn_request */
+		newdp->dccps_options.dccpo_sequence_window = req->rcv_wnd;
+
+		newdp->dccps_gar = newdp->dccps_isr = dreq->dreq_isr;
+		dccp_update_gsr(newsk, dreq->dreq_isr);
+
+		newdp->dccps_iss = dreq->dreq_iss;
+		dccp_update_gss(newsk, dreq->dreq_iss);
+
+		/*
+		 * SWL and AWL are initially adjusted so that they are not less than
+		 * the initial Sequence Numbers received and sent, respectively:
+		 *	SWL := max(GSR + 1 - floor(W/4), ISR),
+		 *	AWL := max(GSS - W' + 1, ISS).
+		 * These adjustments MUST be applied only at the beginning of the
+		 * connection.
+		 */
+		dccp_set_seqno(&newdp->dccps_swl,
+			       max48(newdp->dccps_swl, newdp->dccps_isr));
+		dccp_set_seqno(&newdp->dccps_awl,
+			       max48(newdp->dccps_awl, newdp->dccps_iss));
+
+		dccp_init_xmit_timers(newsk);
+
+		DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS);
+	}
+	return newsk;
+}
+
+/*
+ * Process an incoming packet for RESPOND sockets represented
+ * as a request_sock.
+ *
+ * A Request whose sequence number is after the recorded ISR is treated as a
+ * retransmission: the request's ISS is advanced by one, the new ISR is
+ * recorded and another Response is sent. Every Request then ends here with
+ * NULL. An Ack or DataAck whose acknowledgement number equals the request's
+ * ISS creates the child socket; the request is unlinked from the request
+ * queue and handed, with the child, to inet_csk_reqsk_queue_add(), and the
+ * child is returned. In all other cases a Reset is sent (unless the packet
+ * is itself a Reset), the request is dropped and NULL is returned.
+ */
+struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct request_sock **prev)
+{
+ struct sock *child = NULL;
+
+ /* Check for retransmitted REQUEST */
+ if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
+ if (after48(DCCP_SKB_CB(skb)->dccpd_seq,
+ dccp_rsk(req)->dreq_isr)) {
+ struct dccp_request_sock *dreq = dccp_rsk(req);
+
+ dccp_pr_debug("Retransmitted REQUEST\n");
+ /* Send another RESPONSE packet */
+ dccp_set_seqno(&dreq->dreq_iss, dreq->dreq_iss + 1);
+ dccp_set_seqno(&dreq->dreq_isr,
+ DCCP_SKB_CB(skb)->dccpd_seq);
+ req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+ }
+ /* Network Duplicate, discard packet */
+ return NULL;
+ }
+
+ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
+
+ if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK &&
+ dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK)
+ goto drop;
+
+ /* Invalid ACK */
+ if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dccp_rsk(req)->dreq_iss) {
+ dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
+ "dreq_iss=%llu\n",
+ (unsigned long long)
+ DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ (unsigned long long)
+ dccp_rsk(req)->dreq_iss);
+ goto drop;
+ }
+
+ child = dccp_v4_request_recv_sock(sk, skb, req, NULL);
+ if (child == NULL)
+ goto listen_overflow;
+
+ /* FIXME: deal with options */
+
+ inet_csk_reqsk_queue_unlink(sk, req, prev);
+ inet_csk_reqsk_queue_removed(sk, req);
+ inet_csk_reqsk_queue_add(sk, req, child);
+out:
+ return child;
+listen_overflow:
+ dccp_pr_debug("listen_overflow!\n");
+ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
+drop:
+ if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
+ req->rsk_ops->send_reset(skb);
+
+ inet_csk_reqsk_queue_drop(sk, req, prev);
+ goto out; /* child is NULL on both paths that reach this point */
+}
+
+/*
+ * Deliver @skb to @child, a socket found while handling a packet that
+ * arrived on the listening socket @parent.
+ *
+ * If the child is not owned by the user the packet is processed right away
+ * and, when that moves the child out of DCCP_RESPOND, the parent is woken
+ * up through sk_data_ready. Otherwise the packet goes onto the child's
+ * backlog. In both cases the child is unlocked and its reference dropped
+ * before returning.
+ *
+ * Returns the result of dccp_rcv_state_process(), or 0 when the packet was
+ * backlogged.
+ */
+int dccp_child_process(struct sock *parent, struct sock *child,
+		       struct sk_buff *skb)
+{
+	const int old_state = child->sk_state;
+	int rc = 0;
+
+	if (sock_owned_by_user(child)) {
+		/* Alas, it is possible again, because we do lookup
+		 * in main socket hash table and lock on listening
+		 * socket does not protect us more.
+		 */
+		sk_add_backlog(child, skb);
+	} else {
+		rc = dccp_rcv_state_process(child, skb, dccp_hdr(skb),
+					    skb->len);
+
+		/* Wakeup parent, send SIGIO */
+		if (old_state == DCCP_RESPOND && child->sk_state != old_state)
+			parent->sk_data_ready(parent, 0);
+	}
+
+	bh_unlock_sock(child);
+	sock_put(child);
+	return rc;
+}
diff --git a/net/dccp/options.c b/net/dccp/options.c
new file mode 100644
index 00000000000..0a76426c9ae
--- /dev/null
+++ b/net/dccp/options.c
@@ -0,0 +1,462 @@
+/*
+ * net/dccp/options.c
+ *
+ * An implementation of the DCCP protocol
+ * Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include "ackvec.h"
+#include "ccid.h"
+#include "dccp.h"
+
+/*
+ * Default option values for a new connection; dccp_options_init() copies
+ * this table into each connection's struct dccp_options.
+ * NOTE(review): the original comment said these "may be changed with
+ * sysctl", but the table is declared const here -- confirm how (or whether)
+ * a sysctl is meant to override them.
+ */
+static const struct dccp_options dccpo_default_values = {
+ .dccpo_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW,
+ .dccpo_rx_ccid = DCCPF_INITIAL_CCID,
+ .dccpo_tx_ccid = DCCPF_INITIAL_CCID,
+ .dccpo_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR,
+ .dccpo_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT,
+};
+
+void dccp_options_init(struct dccp_options *dccpo)
+{
+	/* Seed the caller's option block with the compiled-in defaults. */
+	memcpy(dccpo, &dccpo_default_values, sizeof(dccpo_default_values));
+}
+
+/*
+ * Decode a variable-length, big-endian (most significant byte first)
+ * unsigned value from an option payload.
+ *
+ * @bf:  first byte of the encoded value
+ * @len: number of encoded bytes; at most four bytes are consumed
+ *
+ * Returns the decoded value, or 0 when @len is 0.
+ *
+ * Each byte is widened to u32 before shifting: a plain "*bf << 24" shifts a
+ * promoted signed int, so any byte >= 0x80 would overflow into the sign bit.
+ */
+static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len)
+{
+	u32 value = 0;
+
+	if (len > 3)
+		value += (u32)*bf++ << 24;
+	if (len > 2)
+		value += (u32)*bf++ << 16;
+	if (len > 1)
+		value += (u32)*bf++ << 8;
+	if (len > 0)
+		value += *bf;
+
+	return value;
+}
+
+/*
+ * Load a big-endian 32-bit value from an option payload. Option values
+ * start at arbitrary byte offsets inside the header, so the bytes are
+ * copied out instead of dereferencing a casted (possibly misaligned)
+ * pointer.
+ */
+static inline u32 dccp_opt_load_be32(const unsigned char *p)
+{
+	u32 raw;
+
+	memcpy(&raw, p, sizeof(raw));
+	return ntohl(raw);
+}
+
+/* 16-bit counterpart of dccp_opt_load_be32(). */
+static inline u16 dccp_opt_load_be16(const unsigned char *p)
+{
+	u16 raw;
+
+	memcpy(&raw, p, sizeof(raw));
+	return ntohs(raw);
+}
+
+/*
+ * dccp_parse_options - walk the option area of a received packet
+ * @sk:  socket the packet was received on
+ * @skb: the received packet
+ *
+ * The option area runs from the end of the fixed header
+ * (dccp_hdr_len(skb)) up to dccph_doff * 4 bytes from the start of the
+ * DCCP header. The results are stored in dp->dccps_options_received, which
+ * is zeroed first. Options in the 128..191 and 192..255 ranges are handed
+ * to the RX and TX CCID respectively.
+ *
+ * Returns 0 on success. On a malformed option it bumps
+ * DCCP_MIB_INVALIDOPT, sets the skb's reset code to
+ * DCCP_RESET_CODE_OPTION_ERROR and returns -1.
+ */
+int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+#ifdef CONFIG_IP_DCCP_DEBUG
+	const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
+					"CLIENT rx opt: " : "server rx opt: ";
+#endif
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+	const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
+	unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
+	unsigned char *opt_ptr = options;
+	const unsigned char *opt_end = (unsigned char *)dh +
+					(dh->dccph_doff * 4);
+	struct dccp_options_received *opt_recv = &dp->dccps_options_received;
+	unsigned char opt, len;
+	unsigned char *value;
+	u32 elapsed_time;
+
+	memset(opt_recv, 0, sizeof(*opt_recv));
+
+	while (opt_ptr != opt_end) {
+		opt   = *opt_ptr++;
+		len   = 0;
+		value = NULL;
+
+		/* Check if this isn't a single byte option */
+		if (opt > DCCPO_MAX_RESERVED) {
+			if (opt_ptr == opt_end)
+				goto out_invalid_option;
+
+			len = *opt_ptr++;
+			if (len < 3)
+				goto out_invalid_option;
+			/*
+			 * Remove the type and len fields, leaving
+			 * just the value size
+			 */
+			len	-= 2;
+			value	= opt_ptr;
+			opt_ptr += len;
+
+			/* The value must not run past the option area. */
+			if (opt_ptr > opt_end)
+				goto out_invalid_option;
+		}
+
+		switch (opt) {
+		case DCCPO_PADDING:
+			break;
+		case DCCPO_NDP_COUNT:
+			if (len > 3)
+				goto out_invalid_option;
+
+			opt_recv->dccpor_ndp = dccp_decode_value_var(value, len);
+			dccp_pr_debug("%sNDP count=%d\n", debug_prefix,
+				      opt_recv->dccpor_ndp);
+			break;
+		case DCCPO_ACK_VECTOR_0:
+		case DCCPO_ACK_VECTOR_1:
+			/* Ack vectors are skipped on pure DATA packets. */
+			if (pkt_type == DCCP_PKT_DATA)
+				continue;
+
+			if (dp->dccps_options.dccpo_send_ack_vector &&
+			    dccp_ackvec_parse(sk, skb, opt, value, len))
+				goto out_invalid_option;
+			break;
+		case DCCPO_TIMESTAMP:
+			if (len != 4)
+				goto out_invalid_option;
+
+			opt_recv->dccpor_timestamp = dccp_opt_load_be32(value);
+
+			/* Remember what to echo back and when it arrived. */
+			dp->dccps_timestamp_echo = opt_recv->dccpor_timestamp;
+			dccp_timestamp(sk, &dp->dccps_timestamp_time);
+
+			dccp_pr_debug("%sTIMESTAMP=%u, ackno=%llu\n",
+				      debug_prefix, opt_recv->dccpor_timestamp,
+				      (unsigned long long)
+				      DCCP_SKB_CB(skb)->dccpd_ack_seq);
+			break;
+		case DCCPO_TIMESTAMP_ECHO:
+			if (len != 4 && len != 6 && len != 8)
+				goto out_invalid_option;
+
+			opt_recv->dccpor_timestamp_echo =
+						dccp_opt_load_be32(value);
+
+			dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, ackno=%llu, ",
+				      debug_prefix,
+				      opt_recv->dccpor_timestamp_echo,
+				      len + 2,
+				      (unsigned long long)
+				      DCCP_SKB_CB(skb)->dccpd_ack_seq);
+
+			/* A 4 byte value carries no elapsed time. */
+			if (len == 4)
+				break;
+
+			if (len == 6)
+				elapsed_time = dccp_opt_load_be16(value + 4);
+			else
+				elapsed_time = dccp_opt_load_be32(value + 4);
+
+			/* Give precedence to the biggest ELAPSED_TIME */
+			if (elapsed_time > opt_recv->dccpor_elapsed_time)
+				opt_recv->dccpor_elapsed_time = elapsed_time;
+			break;
+		case DCCPO_ELAPSED_TIME:
+			if (len != 2 && len != 4)
+				goto out_invalid_option;
+
+			if (pkt_type == DCCP_PKT_DATA)
+				continue;
+
+			if (len == 2)
+				elapsed_time = dccp_opt_load_be16(value);
+			else
+				elapsed_time = dccp_opt_load_be32(value);
+
+			if (elapsed_time > opt_recv->dccpor_elapsed_time)
+				opt_recv->dccpor_elapsed_time = elapsed_time;
+
+			dccp_pr_debug("%sELAPSED_TIME=%d\n", debug_prefix,
+				      elapsed_time);
+			break;
+			/*
+			 * From draft-ietf-dccp-spec-11.txt:
+			 *
+			 *	Option numbers 128 through 191 are for
+			 *	options sent from the HC-Sender to the
+			 *	HC-Receiver; option numbers 192 through 255
+			 *	are for options sent from the HC-Receiver to
+			 *	the HC-Sender.
+			 */
+		case 128 ... 191: {
+			/* Offset of the value within the option area. */
+			const u16 idx = value - options;
+
+			if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
+						     opt, len, idx,
+						     value) != 0)
+				goto out_invalid_option;
+		}
+			break;
+		case 192 ... 255: {
+			const u16 idx = value - options;
+
+			if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
+						     opt, len, idx,
+						     value) != 0)
+				goto out_invalid_option;
+		}
+			break;
+		default:
+			pr_info("DCCP(%p): option %d(len=%d) not "
+				"implemented, ignoring\n",
+				sk, opt, len);
+			break;
+		}
+	}
+
+	return 0;
+
+out_invalid_option:
+	DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
+	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR;
+	pr_info("DCCP(%p): invalid option %d, len=%d\n", sk, opt, len);
+	return -1;
+}
+
+/*
+ * Write the low @len bytes of @value to @to, most significant byte first.
+ * No more than four bytes are ever written, whatever @len says.
+ */
+static void dccp_encode_value_var(const u32 value, unsigned char *to,
+				  const unsigned int len)
+{
+	unsigned int remaining = len > 4 ? 4 : len;
+
+	/* 'remaining' doubles as the index of the byte being emitted. */
+	while (remaining-- > 0)
+		*to++ = (value >> (8 * remaining)) & 0xFF;
+}
+
+/* Number of bytes (1, 2 or 3) used to encode the NDP count @ndp. */
+static inline int dccp_ndp_len(const int ndp)
+{
+	if (likely(ndp <= 0xFF))
+		return 1;
+	if (ndp <= 0xFFFF)
+		return 2;
+	return 3;
+}
+
+/*
+ * Prepend a generic "type, length, value" option to @skb.
+ *
+ * @option: option type byte
+ * @value:  payload, copied verbatim
+ * @len:    payload length in bytes; the length byte written to the packet
+ *          also counts the type and length bytes themselves
+ *
+ * If the option would push the accumulated option length past
+ * DCCP_MAX_OPT_LEN, a rate-limited message is logged and the packet is
+ * left untouched.
+ */
+void dccp_insert_option(struct sock *sk, struct sk_buff *skb,
+			const unsigned char option,
+			const void *value, const unsigned char len)
+{
+	const int opt_size = len + 2;	/* type + length + payload */
+	unsigned char *opt;
+
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + opt_size > DCCP_MAX_OPT_LEN) {
+		LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert "
+			       "%d option!\n", option);
+		return;
+	}
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += opt_size;
+
+	opt = skb_push(skb, opt_size);
+	opt[0] = option;
+	opt[1] = opt_size;
+	memcpy(opt + 2, value, len);
+}
+
+EXPORT_SYMBOL_GPL(dccp_insert_option);
+
+/*
+ * Insert the NDP count option and update the socket's running count.
+ *
+ * The count sent is the value held before this packet. The counter is then
+ * incremented for a non-data packet and reset to zero otherwise. Nothing is
+ * inserted when the count is not positive or the option does not fit.
+ */
+static void dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	const int count = dp->dccps_ndp_count;
+	unsigned char *opt;
+	int value_len, opt_len;
+
+	if (dccp_non_data_packet(skb))
+		++dp->dccps_ndp_count;
+	else
+		dp->dccps_ndp_count = 0;
+
+	if (count <= 0)
+		return;
+
+	value_len = dccp_ndp_len(count);
+	opt_len	  = value_len + 2;
+
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + opt_len > DCCP_MAX_OPT_LEN)
+		return;
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += opt_len;
+
+	opt = skb_push(skb, opt_len);
+	*opt++ = DCCPO_NDP_COUNT;
+	*opt++ = opt_len;
+	dccp_encode_value_var(count, opt, value_len);
+}
+
+/*
+ * Bytes needed to encode @elapsed_time: 0 when it is zero, 2 when it fits
+ * in 16 bits, 4 otherwise.
+ */
+static inline int dccp_elapsed_time_len(const u32 elapsed_time)
+{
+	if (elapsed_time == 0)
+		return 0;
+	if (elapsed_time <= 0xFFFF)
+		return 2;
+	return 4;
+}
+
+/*
+ * Prepend an Elapsed Time option carrying @elapsed_time to @skb.
+ *
+ * The value is written in network byte order using 2 or 4 bytes. Nothing
+ * is inserted for a zero elapsed time, or when the option would exceed
+ * DCCP_MAX_OPT_LEN (a rate-limited message is logged in that case).
+ */
+void dccp_insert_option_elapsed_time(struct sock *sk,
+				     struct sk_buff *skb,
+				     u32 elapsed_time)
+{
+#ifdef CONFIG_IP_DCCP_DEBUG
+	struct dccp_sock *dp = dccp_sk(sk);
+	const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
+					"CLIENT TX opt: " : "server TX opt: ";
+#endif
+	const int value_len = dccp_elapsed_time_len(elapsed_time);
+	const int opt_len = value_len + 2;
+	unsigned char *opt;
+
+	if (value_len == 0)
+		return;
+
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + opt_len > DCCP_MAX_OPT_LEN) {
+		LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to "
+					 "insert elapsed time!\n");
+		return;
+	}
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += opt_len;
+
+	opt = skb_push(skb, opt_len);
+	opt[0] = DCCPO_ELAPSED_TIME;
+	opt[1] = opt_len;
+
+	if (value_len == 2) {
+		const u16 be16 = htons((u16)elapsed_time);
+
+		memcpy(opt + 2, &be16, 2);
+	} else {
+		const u32 be32 = htonl(elapsed_time);
+
+		memcpy(opt + 2, &be32, 4);
+	}
+
+	dccp_pr_debug("%sELAPSED_TIME=%u, len=%d, seqno=%llu\n",
+		      debug_prefix, elapsed_time,
+		      opt_len,
+		      (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
+}
+
+EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time);
+
+/*
+ * Store in @tv the current time of day relative to the socket's
+ * dccps_epoch, with tv_usec normalised to be non-negative.
+ */
+void dccp_timestamp(const struct sock *sk, struct timeval *tv)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+
+	do_gettimeofday(tv);
+	tv->tv_usec -= dp->dccps_epoch.tv_usec;
+	tv->tv_sec  -= dp->dccps_epoch.tv_sec;
+
+	/* Borrow whole seconds until the microsecond part is >= 0. */
+	for (; tv->tv_usec < 0; tv->tv_usec += USEC_PER_SEC)
+		--tv->tv_sec;
+}
+
+EXPORT_SYMBOL_GPL(dccp_timestamp);
+
+/*
+ * Prepend a Timestamp option holding the socket-relative time in units
+ * of 10 microseconds, in network byte order.
+ */
+void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb)
+{
+	struct timeval rel;
+	u32 ticks;
+
+	dccp_timestamp(sk, &rel);
+
+	/*
+	 * Truncation to 32 bits is intentional: the option is a 32 bit
+	 * counter of 10 usec ticks, which wraps roughly every 11.9 hours.
+	 */
+	ticks = timeval_usecs(&rel) / 10;
+	ticks = htonl(ticks);
+
+	dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &ticks, sizeof(ticks));
+}
+
+EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp);
+
+/*
+ * Prepend a Timestamp Echo option to @skb.
+ *
+ * The option carries the last received timestamp (dccps_timestamp_echo)
+ * followed, when non-zero, by the time elapsed since it was recorded in
+ * units of 10 usec (2 or 4 bytes). After a successful insertion the stored
+ * echo value and its arrival time are cleared. If the option does not fit,
+ * a rate-limited message is logged and nothing is changed.
+ */
+static void dccp_insert_option_timestamp_echo(struct sock *sk,
+					      struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+#ifdef CONFIG_IP_DCCP_DEBUG
+	const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
+					"CLIENT TX opt: " : "server TX opt: ";
+#endif
+	struct timeval now;
+	u32 echo_be, elapsed;
+	int opt_len, elapsed_len;
+	unsigned char *opt;
+
+	dccp_timestamp(sk, &now);
+	elapsed	    = timeval_delta(&now, &dp->dccps_timestamp_time) / 10;
+	elapsed_len = dccp_elapsed_time_len(elapsed);
+	/* type + length + 4 byte echo, plus the optional elapsed time */
+	opt_len	    = 6 + elapsed_len;
+
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + opt_len > DCCP_MAX_OPT_LEN) {
+		LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert "
+					 "timestamp echo!\n");
+		return;
+	}
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += opt_len;
+
+	opt = skb_push(skb, opt_len);
+	opt[0] = DCCPO_TIMESTAMP_ECHO;
+	opt[1] = opt_len;
+
+	echo_be = htonl(dp->dccps_timestamp_echo);
+	memcpy(opt + 2, &echo_be, 4);
+
+	switch (elapsed_len) {
+	case 2: {
+		const u16 be16 = htons((u16)elapsed);
+
+		memcpy(opt + 6, &be16, 2);
+		break;
+	}
+	case 4: {
+		const u32 be32 = htonl(elapsed);
+
+		memcpy(opt + 6, &be32, 4);
+		break;
+	}
+	default:
+		/* zero elapsed time: the echo is sent on its own */
+		break;
+	}
+
+	dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, seqno=%llu\n",
+		      debug_prefix, dp->dccps_timestamp_echo,
+		      opt_len,
+		      (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
+
+	/* The echo has been consumed; forget it. */
+	dp->dccps_timestamp_echo = 0;
+	dp->dccps_timestamp_time.tv_sec = 0;
+	dp->dccps_timestamp_time.tv_usec = 0;
+}
+
+/*
+ * Insert every option that applies to @skb and pad the option area to a
+ * multiple of four bytes.
+ *
+ * The per-packet option length (dccpd_opt_len) is reset to zero first.
+ * Options are pushed in this order: NDP count, ack vector and timestamp
+ * echo (only on packets that carry an ack), then whatever the RX and TX
+ * CCIDs asked to insert.
+ */
+void dccp_insert_options(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+	int rem;
+
+	dcb->dccpd_opt_len = 0;
+
+	if (dp->dccps_options.dccpo_send_ndp_count)
+		dccp_insert_option_ndp(sk, skb);
+
+	if (!dccp_packet_without_ack(skb)) {
+		if (dp->dccps_options.dccpo_send_ack_vector &&
+		    dccp_ackvec_pending(dp->dccps_hc_rx_ackvec))
+			dccp_insert_option_ackvec(sk, skb);
+		if (dp->dccps_timestamp_echo != 0)
+			dccp_insert_option_timestamp_echo(sk, skb);
+	}
+
+	/* The CCID insertion requests are one-shot flags. */
+	if (dp->dccps_hc_rx_insert_options) {
+		ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb);
+		dp->dccps_hc_rx_insert_options = 0;
+	}
+	if (dp->dccps_hc_tx_insert_options) {
+		ccid_hc_tx_insert_options(dp->dccps_hc_tx_ccid, sk, skb);
+		dp->dccps_hc_tx_insert_options = 0;
+	}
+
+	/* XXX: insert other options when appropriate */
+
+	/* The length of all options has to be a multiple of 4 */
+	rem = dcb->dccpd_opt_len % 4;
+	if (rem != 0) {
+		const int pad = 4 - rem;
+
+		memset(skb_push(skb, pad), 0, pad);
+		dcb->dccpd_opt_len += pad;
+	}
+}
diff --git a/net/dccp/output.c b/net/dccp/output.c
new file mode 100644
index 00000000000..74ff8702587
--- /dev/null
+++ b/net/dccp/output.c
@@ -0,0 +1,527 @@
+/*
+ * net/dccp/output.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/sock.h>
+
+#include "ackvec.h"
+#include "ccid.h"
+#include "dccp.h"
+
+/* An ack has just been sent: clear the socket's ICSK_TIME_DACK timer. */
+static inline void dccp_event_ack_sent(struct sock *sk)
+{
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+}
+
+/*
+ * Make @sk the write-owner of @skb and remember it in sk->sk_send_head.
+ * Warns if a send head is already set; the old pointer is overwritten
+ * regardless.
+ */
+static inline void dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
+{
+ skb_set_owner_w(skb, sk);
+ WARN_ON(sk->sk_send_head);
+ sk->sk_send_head = skb;
+}
+
+/*
+ * All SKBs seen here are completely headerless. It is our
+ * job to build the DCCP header and pass the packet down to
+ * IP, so that it can do the same and then hand the packet
+ * off to the device.
+ */
+static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+	const struct inet_sock *inet;
+	struct dccp_sock *dp;
+	struct dccp_skb_cb *dcb;
+	struct dccp_hdr *dh;
+	int hdr_size, err;
+	int set_ack = 1;
+	u64 ackno;
+
+	/* Callers may pass the result of a failed clone/copy. */
+	if (unlikely(skb == NULL))
+		return -ENOBUFS;
+
+	inet = inet_sk(sk);
+	dp   = dccp_sk(sk);
+	dcb  = DCCP_SKB_CB(skb);
+	/* XXX For now we're using only 48 bits sequence numbers */
+	hdr_size = sizeof(*dh) +
+		   sizeof(struct dccp_hdr_ext) +
+		   dccp_packet_hdr_len(dcb->dccpd_type);
+	ackno = dp->dccps_gsr;
+
+	dccp_inc_seqno(&dp->dccps_gss);
+
+	switch (dcb->dccpd_type) {
+	case DCCP_PKT_DATA:
+		/* Pure data packets carry no acknowledgement number. */
+		set_ack = 0;
+		/* fall through */
+	case DCCP_PKT_DATAACK:
+		break;
+
+	case DCCP_PKT_SYNC:
+	case DCCP_PKT_SYNCACK:
+		/* The caller stored the number to acknowledge in dccpd_seq. */
+		ackno = dcb->dccpd_seq;
+		/* fall through */
+	default:
+		/*
+		 * Only data packets should come through with skb->sk
+		 * set.
+		 */
+		WARN_ON(skb->sk);
+		skb_set_owner_w(skb, sk);
+		break;
+	}
+
+	dcb->dccpd_seq = dp->dccps_gss;
+	dccp_insert_options(sk, skb);
+
+	skb->h.raw = skb_push(skb, hdr_size);
+	dh = dccp_hdr(skb);
+
+	/* Build DCCP header and checksum it. */
+	memset(dh, 0, hdr_size);
+	dh->dccph_type	= dcb->dccpd_type;
+	dh->dccph_sport	= inet->sport;
+	dh->dccph_dport	= inet->dport;
+	dh->dccph_doff	= (hdr_size + dcb->dccpd_opt_len) / 4;
+	dh->dccph_ccval	= dcb->dccpd_ccval;
+	/* XXX For now we're using only 48 bits sequence numbers */
+	dh->dccph_x	= 1;
+
+	dp->dccps_awh = dp->dccps_gss;
+	dccp_hdr_set_seq(dh, dp->dccps_gss);
+	if (set_ack)
+		dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno);
+
+	/* Fill in the type-specific part of the header. */
+	switch (dcb->dccpd_type) {
+	case DCCP_PKT_REQUEST:
+		dccp_hdr_request(skb)->dccph_req_service =
+						dp->dccps_service;
+		break;
+	case DCCP_PKT_RESET:
+		dccp_hdr_reset(skb)->dccph_reset_code =
+						dcb->dccpd_reset_code;
+		break;
+	}
+
+	dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr,
+					      inet->daddr);
+
+	if (set_ack)
+		dccp_event_ack_sent(sk);
+
+	DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	err = ip_queue_xmit(skb, 0);
+	if (err <= 0)
+		return err;
+
+	/* NET_XMIT_CN is special. It does not guarantee,
+	 * that this packet is lost. It tells that device
+	 * is about to start to drop packets or already
+	 * drops some packets of the same priority and
+	 * invokes us to send less aggressively.
+	 */
+	return err == NET_XMIT_CN ? 0 : err;
+}
+
+/*
+ * Recompute the cached maximum segment size for path MTU @pmtu.
+ *
+ * Stores @pmtu in dccps_pmtu_cookie and the result in dccps_mss_cache,
+ * and returns the result.
+ */
+unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	/*
+	 * FIXME: this should come from the CCID infrastructure, where, say,
+	 * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets
+	 * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED
+	 * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding.
+	 * The integer division rounds the 42 byte sum down to 40.
+	 */
+	const int opt_estimate = ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
+	int mss;
+
+	/*
+	 * FIXME: we really should be using the af_specific thing to support
+	 *	  IPv6.
+	 * mss = pmtu - tp->af_specific->net_header_len -
+	 *	 sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext);
+	 */
+	mss = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) -
+	      sizeof(struct dccp_hdr_ext);
+
+	/* Now subtract optional transport overhead */
+	mss -= dp->dccps_ext_header_len;
+	mss -= opt_estimate;
+
+	/* And store cached results */
+	dp->dccps_mss_cache   = mss;
+	dp->dccps_pmtu_cookie = pmtu;
+
+	return mss;
+}
+
+/*
+ * Wake up anything sleeping on the socket's wait queue and, if the
+ * socket is writeable, send the asynchronous POLL_OUT notification.
+ * Runs under sk_callback_lock held for reading.
+ */
+void dccp_write_space(struct sock *sk)
+{
+	read_lock(&sk->sk_callback_lock);
+
+	if (sk->sk_sleep != NULL && waitqueue_active(sk->sk_sleep))
+		wake_up_interruptible(sk->sk_sleep);
+
+	/* Should agree with poll, otherwise some programs break */
+	if (sock_writeable(sk))
+		sk_wake_async(sk, 2, POLL_OUT);
+
+	read_unlock(&sk->sk_callback_lock);
+}
+
+/**
+ * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet
+ * @sk: socket to wait for
+ * @skb: packet the TX CCID is asked about (only its length is passed on)
+ * @timeo: for how long; adjusted after every sleep
+ *
+ * Loops asking the TX CCID whether @skb may be sent. A positive answer is
+ * treated as a delay in milliseconds and slept on with the socket lock
+ * released; an answer <= 0 ends the loop and is returned as is.
+ *
+ * Returns -EPIPE if the socket has an error or is shut down for sending,
+ * -EAGAIN if the timeout is exhausted or shorter than the requested delay,
+ * and sock_intr_errno(*timeo) if a signal is pending.
+ */
+static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb,
+ long *timeo)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ DEFINE_WAIT(wait);
+ long delay;
+ int rc;
+
+ while (1) {
+ prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ goto do_error;
+ if (!*timeo)
+ goto do_nonblock;
+ if (signal_pending(current))
+ goto do_interrupted;
+
+ rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb,
+ skb->len);
+ if (rc <= 0)
+ break;
+ /* rc > 0: the CCID wants us to wait rc milliseconds */
+ delay = msecs_to_jiffies(rc);
+ if (delay > *timeo || delay < 0)
+ goto do_nonblock;
+
+ sk->sk_write_pending++;
+ release_sock(sk);
+ /*
+ * NOTE(review): schedule_timeout() is documented to return the
+ * jiffies *remaining*, so this subtracts the unslept part rather
+ * than the time actually slept -- confirm the intended accounting.
+ */
+ *timeo -= schedule_timeout(delay);
+ lock_sock(sk);
+ sk->sk_write_pending--;
+ }
+out:
+ finish_wait(sk->sk_sleep, &wait);
+ return rc;
+
+do_error:
+ rc = -EPIPE;
+ goto out;
+do_nonblock:
+ rc = -EAGAIN;
+ goto out;
+do_interrupted:
+ rc = sock_intr_errno(*timeo);
+ goto out;
+}
+
+/*
+ * Send the data packet @skb once the TX CCID allows it.
+ *
+ * If the CCID asks for a delay, dccp_wait_for_ccid() is used to wait, for
+ * at most *@timeo. On any non-zero result the skb is freed and that result
+ * is returned. Otherwise the packet goes out as DATAACK (in PARTOPEN or
+ * when an ack is pending) or DATA, the CCID is told about it, and the
+ * result of dccp_transmit_skb() is returned.
+ */
+int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_skb_cb *dcb;
+	int payload_len;
+	int err;
+
+	err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb, skb->len);
+	if (err > 0)
+		err = dccp_wait_for_ccid(sk, skb, timeo);
+
+	if (err != 0) {
+		kfree_skb(skb);
+		return err;
+	}
+
+	dcb = DCCP_SKB_CB(skb);
+	/* Read the length now; the skb is handed off below. */
+	payload_len = skb->len;
+
+	if (sk->sk_state == DCCP_PARTOPEN) {
+		/* See 8.1.5.  Handshake Completion */
+		inet_csk_schedule_ack(sk);
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+					  inet_csk(sk)->icsk_rto,
+					  DCCP_RTO_MAX);
+		dcb->dccpd_type = DCCP_PKT_DATAACK;
+	} else if (dccp_ack_pending(sk)) {
+		dcb->dccpd_type = DCCP_PKT_DATAACK;
+	} else {
+		dcb->dccpd_type = DCCP_PKT_DATA;
+	}
+
+	err = dccp_transmit_skb(sk, skb);
+	ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, payload_len);
+
+	return err;
+}
+
+/*
+ * Retransmit @skb, sending a copy so the original is kept.
+ *
+ * Returns -EHOSTUNREACH if the header cannot be rebuilt, otherwise the
+ * result of dccp_transmit_skb() (which reports -ENOBUFS when the copy
+ * could not be allocated).
+ */
+int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_buff *copy;
+
+	if (inet_sk_rebuild_header(sk) != 0)
+		return -EHOSTUNREACH; /* Routing failure or similar. */
+
+	if (skb_cloned(skb))
+		copy = pskb_copy(skb, GFP_ATOMIC);
+	else
+		copy = skb_clone(skb, GFP_ATOMIC);
+
+	return dccp_transmit_skb(sk, copy);
+}
+
+/*
+ * Build a DCCP-Response packet for the connection request @req.
+ *
+ * The skb is allocated with sock_wmalloc() on @sk, takes a reference on
+ * @dst, and gets options, a 48 bit sequence number header (seq = dreq_iss,
+ * ack = dreq_isr), the service code and a checksum over the request's
+ * local/remote addresses. The packet is only built here, not transmitted.
+ *
+ * Returns the new skb, or NULL if the allocation fails.
+ */
+struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
+ struct request_sock *req)
+{
+ struct dccp_hdr *dh;
+ struct dccp_request_sock *dreq;
+ const int dccp_header_size = sizeof(struct dccp_hdr) +
+ sizeof(struct dccp_hdr_ext) +
+ sizeof(struct dccp_hdr_response);
+ struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN +
+ dccp_header_size, 1,
+ GFP_ATOMIC);
+ if (skb == NULL)
+ return NULL;
+
+ /* Reserve space for headers. */
+ skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size);
+
+ skb->dst = dst_clone(dst);
+ skb->csum = 0;
+
+ dreq = dccp_rsk(req);
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
+ DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss;
+ /* Options are pushed first; the header is then pushed in front of them. */
+ dccp_insert_options(sk, skb);
+
+ skb->h.raw = skb_push(skb, dccp_header_size);
+
+ dh = dccp_hdr(skb);
+ memset(dh, 0, dccp_header_size);
+
+ dh->dccph_sport = inet_sk(sk)->sport;
+ dh->dccph_dport = inet_rsk(req)->rmt_port;
+ /* Data offset is expressed in 32 bit words and covers header + options. */
+ dh->dccph_doff = (dccp_header_size +
+ DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
+ dh->dccph_type = DCCP_PKT_RESPONSE;
+ dh->dccph_x = 1;
+ dccp_hdr_set_seq(dh, dreq->dreq_iss);
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_isr);
+ dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service;
+
+ /* Checksum last, once every header field is in place. */
+ dh->dccph_checksum = dccp_v4_checksum(skb, inet_rsk(req)->loc_addr,
+ inet_rsk(req)->rmt_addr);
+
+ DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+ return skb;
+}
+
+/*
+ * Build a DCCP-Reset packet carrying reset code @code for socket @sk.
+ *
+ * The socket's greatest sequence number sent (dccps_gss) is advanced and
+ * used as the packet's sequence number; dccps_gsr is acknowledged. The skb
+ * is allocated with sock_wmalloc() on @sk and takes a reference on @dst.
+ * The packet is only built here, not transmitted.
+ *
+ * Returns the new skb, or NULL if the allocation fails.
+ */
+struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
+ const enum dccp_reset_codes code)
+
+{
+ struct dccp_hdr *dh;
+ struct dccp_sock *dp = dccp_sk(sk);
+ const int dccp_header_size = sizeof(struct dccp_hdr) +
+ sizeof(struct dccp_hdr_ext) +
+ sizeof(struct dccp_hdr_reset);
+ struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN +
+ dccp_header_size, 1,
+ GFP_ATOMIC);
+ if (skb == NULL)
+ return NULL;
+
+ /* Reserve space for headers. */
+ skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size);
+
+ skb->dst = dst_clone(dst);
+ skb->csum = 0;
+
+ dccp_inc_seqno(&dp->dccps_gss);
+
+ DCCP_SKB_CB(skb)->dccpd_reset_code = code;
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET;
+ DCCP_SKB_CB(skb)->dccpd_seq = dp->dccps_gss;
+ /* Options are pushed first; the header is then pushed in front of them. */
+ dccp_insert_options(sk, skb);
+
+ skb->h.raw = skb_push(skb, dccp_header_size);
+
+ dh = dccp_hdr(skb);
+ memset(dh, 0, dccp_header_size);
+
+ dh->dccph_sport = inet_sk(sk)->sport;
+ dh->dccph_dport = inet_sk(sk)->dport;
+ /* Data offset is expressed in 32 bit words and covers header + options. */
+ dh->dccph_doff = (dccp_header_size +
+ DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
+ dh->dccph_type = DCCP_PKT_RESET;
+ dh->dccph_x = 1;
+ dccp_hdr_set_seq(dh, dp->dccps_gss);
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dp->dccps_gsr);
+
+ dccp_hdr_reset(skb)->dccph_reset_code = code;
+
+ /* Checksum last, once every header field is in place. */
+ dh->dccph_checksum = dccp_v4_checksum(skb, inet_sk(sk)->saddr,
+ inet_sk(sk)->daddr);
+
+ DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+ return skb;
+}
+
+/*
+ * Do all connect socket setups that can be done AF independent.
+ */
+static inline void dccp_connect_init(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct dst_entry *route = __sk_dst_get(sk);
+
+	/* Start from a clean error / done state. */
+	sk->sk_err = 0;
+	sock_reset_flag(sk, SOCK_DONE);
+
+	/* Size the MSS cache from the MTU of the cached route. */
+	dccp_sync_mss(sk, dst_mtu(route));
+
+	/*
+	 * FIXME: set dp->{dccps_swh,dccps_swl}, with
+	 * something like dccp_inc_seq
+	 */
+
+	icsk->icsk_retransmits = 0;
+}
+
+/*
+ * Start an active open: build a DCCP-Request, keep it as the socket's
+ * send head for retransmission, transmit a clone of it and arm the
+ * retransmit timer.
+ *
+ * Returns 0, or -ENOBUFS if the request skb cannot be allocated.
+ */
+int dccp_connect(struct sock *sk)
+{
+ struct sk_buff *skb;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ dccp_connect_init(sk);
+
+ skb = alloc_skb(MAX_DCCP_HEADER + 15, sk->sk_allocation);
+ if (unlikely(skb == NULL))
+ return -ENOBUFS;
+
+ /* Reserve space for headers. */
+ skb_reserve(skb, MAX_DCCP_HEADER);
+
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
+ skb->csum = 0;
+
+ dccp_skb_entail(sk, skb);
+ /*
+ * The result is deliberately not checked: a failed clone or transmit
+ * is left to the retransmit timer armed below.
+ * NOTE(review): the clone uses GFP_KERNEL while the original skb was
+ * allocated with sk->sk_allocation -- confirm this is intended.
+ */
+ dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
+ DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
+
+ /* Timer for repeating the REQUEST until an answer. */
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ icsk->icsk_rto, DCCP_RTO_MAX);
+ return 0;
+}
+
+/*
+ * Send a pure DCCP-Ack right away. Does nothing on a closed socket. If no
+ * skb can be allocated, the ack is rescheduled through the delayed-ack
+ * timer instead.
+ */
+void dccp_send_ack(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	/* If we have been reset, we may not send again. */
+	if (sk->sk_state == DCCP_CLOSED)
+		return;
+
+	skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC);
+	if (skb == NULL) {
+		/* Out of memory: retry later from the delack timer. */
+		inet_csk_schedule_ack(sk);
+		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+					  TCP_DELACK_MAX,
+					  DCCP_RTO_MAX);
+		return;
+	}
+
+	/* Reserve space for headers */
+	skb_reserve(skb, MAX_DCCP_HEADER);
+	skb->csum = 0;
+	DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK;
+	dccp_transmit_skb(sk, skb);
+}
+
+EXPORT_SYMBOL_GPL(dccp_send_ack);
+
+/*
+ * Schedule a delayed ack roughly two seconds from now. If a delack timer
+ * is already pending, the candidate deadline is compared against it (see
+ * below); if the pending timer was blocked, the ack is sent immediately
+ * instead.
+ */
+void dccp_send_delayed_ack(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ /*
+ * FIXME: tune this timer. elapsed time fixes the skew, so no problem
+ * with using 2s, and active senders also piggyback the ACK into a
+ * DATAACK packet, so this is really for quiescent senders.
+ */
+ unsigned long timeout = jiffies + 2 * HZ;
+
+ /* Use new timeout only if there wasn't an older one earlier. */
+ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
+ /* If delack timer was blocked or is about to expire,
+ * send ACK now.
+ *
+ * FIXME: check the "about to expire" part
+ */
+ if (icsk->icsk_ack.blocked) {
+ dccp_send_ack(sk);
+ return;
+ }
+
+ /* Keep the already scheduled deadline unless the new one is earlier. */
+ if (!time_before(timeout, icsk->icsk_ack.timeout))
+ timeout = icsk->icsk_ack.timeout;
+ }
+ icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+ icsk->icsk_ack.timeout = timeout;
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
+}
+
+/*
+ * Send a packet of type @pkt_type (Sync or SyncAck) that acknowledges
+ * sequence number @seq. Silently gives up if no skb can be allocated.
+ */
+void dccp_send_sync(struct sock *sk, const u64 seq,
+		    const enum dccp_pkt_type pkt_type)
+{
+	struct sk_buff *skb;
+
+	/*
+	 * We are not putting this on the write queue, so
+	 * dccp_transmit_skb() will set the ownership to this
+	 * sock.
+	 */
+	skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC);
+	if (skb == NULL) {
+		/* FIXME: how to make sure the sync is sent? */
+		return;
+	}
+
+	/* Reserve space for headers and prepare control bits. */
+	skb_reserve(skb, MAX_DCCP_HEADER);
+	skb->csum = 0;
+	DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
+	/* dccp_transmit_skb() reads dccpd_seq as the number to acknowledge. */
+	DCCP_SKB_CB(skb)->dccpd_seq = seq;
+
+	dccp_transmit_skb(sk, skb);
+}
+
+/*
+ * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This
+ * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under
+ * any circumstances.
+ */
+void dccp_send_close(struct sock *sk, const int active)
+{
+	const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC;
+	struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, prio);
+
+	/* On allocation failure the close packet is simply not sent. */
+	if (skb == NULL)
+		return;
+
+	/* Reserve space for headers and prepare control bits. */
+	skb_reserve(skb, sk->sk_prot->max_header);
+	skb->csum = 0;
+
+	/* Clients send CLOSE; any other role sends CLOSEREQ. */
+	if (dccp_sk(sk)->dccps_role == DCCP_ROLE_CLIENT)
+		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
+	else
+		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ;
+
+	if (!active) {
+		dccp_transmit_skb(sk, skb);
+		return;
+	}
+
+	/* Active close: keep the original as send head, transmit a clone. */
+	dccp_skb_entail(sk, skb);
+	dccp_transmit_skb(sk, skb_clone(skb, prio));
+}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
new file mode 100644
index 00000000000..a021c3422f6
--- /dev/null
+++ b/net/dccp/proto.c
@@ -0,0 +1,910 @@
+/*
+ * net/dccp/proto.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <net/checksum.h>
+
+#include <net/inet_common.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/sock.h>
+#include <net/xfrm.h>
+
+#include <asm/semaphore.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/delay.h>
+#include <linux/poll.h>
+#include <linux/dccp.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
+
+atomic_t dccp_orphan_count = ATOMIC_INIT(0);
+
+/* Protocol hooks: dccp_v4_rcv handles packets, dccp_v4_err handles errors. */
+static struct net_protocol dccp_protocol = {
+ .handler = dccp_v4_rcv,
+ .err_handler = dccp_v4_err,
+};
+
+/*
+ * Map a DCCP packet type to a printable name.
+ *
+ * Returns "INVALID" for anything outside 0..DCCP_NR_PKT_TYPES-1 (negative
+ * values included, which previously indexed before the table) and for any
+ * slot the designated initializers leave unset. Never returns NULL.
+ */
+const char *dccp_packet_name(const int type)
+{
+	static const char *dccp_packet_names[] = {
+		[DCCP_PKT_REQUEST]  = "REQUEST",
+		[DCCP_PKT_RESPONSE] = "RESPONSE",
+		[DCCP_PKT_DATA]	    = "DATA",
+		[DCCP_PKT_ACK]	    = "ACK",
+		[DCCP_PKT_DATAACK]  = "DATAACK",
+		[DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
+		[DCCP_PKT_CLOSE]    = "CLOSE",
+		[DCCP_PKT_RESET]    = "RESET",
+		[DCCP_PKT_SYNC]	    = "SYNC",
+		[DCCP_PKT_SYNCACK]  = "SYNCACK",
+	};
+	const char *name;
+
+	if (type < 0 || type >= DCCP_NR_PKT_TYPES)
+		return "INVALID";
+
+	name = dccp_packet_names[type];
+	return name != NULL ? name : "INVALID";
+}
+
+EXPORT_SYMBOL_GPL(dccp_packet_name);
+
+/*
+ * Map a DCCP socket state to a printable name.
+ *
+ * Returns "INVALID STATE!" for anything outside 0..DCCP_MAX_STATES-1
+ * (negative values included, which previously indexed before the table)
+ * and for any state number the designated initializers leave unset.
+ * Never returns NULL.
+ */
+const char *dccp_state_name(const int state)
+{
+	static const char *dccp_state_names[] = {
+		[DCCP_OPEN]	  = "OPEN",
+		[DCCP_REQUESTING] = "REQUESTING",
+		[DCCP_PARTOPEN]	  = "PARTOPEN",
+		[DCCP_LISTEN]	  = "LISTEN",
+		[DCCP_RESPOND]	  = "RESPOND",
+		[DCCP_CLOSING]	  = "CLOSING",
+		[DCCP_TIME_WAIT]  = "TIME_WAIT",
+		[DCCP_CLOSED]	  = "CLOSED",
+	};
+	const char *name;
+
+	if (state < 0 || state >= DCCP_MAX_STATES)
+		return "INVALID STATE!";
+
+	name = dccp_state_names[state];
+	return name != NULL ? name : "INVALID STATE!";
+}
+
+EXPORT_SYMBOL_GPL(dccp_state_name);
+
+/*
+ * Put @sk into listening mode.
+ *
+ * The role is switched to DCCP_ROLE_LISTEN before the service check, so
+ * it stays changed even when -EPROTO is returned for a missing service
+ * code. Otherwise returns the result of inet_csk_listen_start().
+ */
+static inline int dccp_listen_start(struct sock *sk)
+{
+	dccp_sk(sk)->dccps_role = DCCP_ROLE_LISTEN;
+
+	/*
+	 * Apps need to use setsockopt(DCCP_SOCKOPT_SERVICE)
+	 * before calling listen()
+	 */
+	if (dccp_service_not_initialized(sk))
+		return -EPROTO;
+
+	return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
+}
+
+/*
+ * Abort the connection on @sk and return the socket to a reusable,
+ * closed state: timers are cleared, queued receive data and the pending
+ * send head are freed, the destination port and cached route are reset.
+ * @flags is not used. Always returns 0 (err is never changed).
+ */
+int dccp_disconnect(struct sock *sk, int flags)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ int err = 0;
+ const int old_state = sk->sk_state;
+
+ if (old_state != DCCP_CLOSED)
+ dccp_set_state(sk, DCCP_CLOSED);
+
+ /* ABORT function of RFC793 */
+ if (old_state == DCCP_LISTEN) {
+ inet_csk_listen_stop(sk);
+ /* FIXME: do the active reset thing */
+ } else if (old_state == DCCP_REQUESTING)
+ sk->sk_err = ECONNRESET;
+
+ dccp_clear_xmit_timers(sk);
+ __skb_queue_purge(&sk->sk_receive_queue);
+ if (sk->sk_send_head != NULL) {
+ __kfree_skb(sk->sk_send_head);
+ sk->sk_send_head = NULL;
+ }
+
+ inet->dport = 0;
+
+ /* Keep the source address only if the user explicitly bound it. */
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ inet_reset_saddr(sk);
+
+ sk->sk_shutdown = 0;
+ sock_reset_flag(sk, SOCK_DONE);
+
+ icsk->icsk_backoff = 0;
+ inet_csk_delack_init(sk);
+ __sk_dst_reset(sk);
+
+ /* A socket that still has a local port must still be in a bind bucket. */
+ BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
+
+ sk->sk_error_report(sk);
+ return err;
+}
+
+/*
+ * Wait for a DCCP event.
+ *
+ * Note that we don't need to lock the socket, as the upper poll layers
+ * take care of normal races (between the test and the event) and we don't
+ * go look at any of the socket buffers directly.
+ */
+/*
+ * Returns the poll event mask for the socket. Listening sockets are
+ * delegated to inet_csk_listen_poll(). Otherwise the mask reports
+ * errors, hang-up, readability (receive memory in use or receive side
+ * shut down) and writability (enough stream write space).
+ */
+static unsigned int dccp_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ unsigned int mask;
+ struct sock *sk = sock->sk;
+
+ poll_wait(file, sk->sk_sleep, wait);
+ if (sk->sk_state == DCCP_LISTEN)
+ return inet_csk_listen_poll(sk);
+
+ /* Socket is not locked. We are protected from async events
+ by poll logic and correct handling of state changes
+ made by another threads is impossible in any case.
+ */
+
+ mask = 0;
+ if (sk->sk_err)
+ mask = POLLERR;
+
+ if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
+ mask |= POLLHUP;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLIN | POLLRDNORM;
+
+ /* Connected? */
+ /* i.e. any state other than REQUESTING and RESPOND */
+ if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
+ if (atomic_read(&sk->sk_rmem_alloc) > 0)
+ mask |= POLLIN | POLLRDNORM;
+
+ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+ if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+ mask |= POLLOUT | POLLWRNORM;
+ } else { /* send SIGIO later */
+ set_bit(SOCK_ASYNC_NOSPACE,
+ &sk->sk_socket->flags);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+ /* Race breaker. If space is freed after
+ * wspace test but before the flags are set,
+ * IO signal will be lost.
+ */
+ if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+ mask |= POLLOUT | POLLWRNORM;
+ }
+ }
+ }
+ return mask;
+}
+
+/*
+ * No DCCP-specific ioctls are implemented: every command is answered
+ * with -ENOIOCTLCMD after a debug trace.
+ */
+int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ dccp_pr_debug("entry\n");
+ return -ENOIOCTLCMD;
+}
+
+/*
+ * Handle setsockopt(DCCP_SOCKOPT_SERVICE).
+ *
+ * @service: primary service code (the first u32 of the user buffer,
+ *           already fetched by the caller)
+ * @optval:  user buffer; any u32 values after the first form the list of
+ *           additional service codes
+ * @optlen:  size of the user buffer in bytes
+ *
+ * Returns 0 on success, -EINVAL for an invalid primary service or an
+ * oversized buffer, -ENOMEM if the list cannot be allocated, and -EFAULT
+ * if the list cannot be copied in or contains the invalid service value.
+ * Any previously installed list is freed.
+ */
+static int dccp_setsockopt_service(struct sock *sk, const u32 service,
+				   char __user *optval, int optlen)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_service_list *list = NULL;
+
+	if (service == DCCP_SERVICE_INVALID_VALUE)
+		return -EINVAL;
+	if (optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
+		return -EINVAL;
+
+	if (optlen > sizeof(service)) {
+		list = kmalloc(optlen, GFP_KERNEL);
+		if (list == NULL)
+			return -ENOMEM;
+
+		/* The first u32 is the primary service, not a list entry. */
+		list->dccpsl_nr = optlen / sizeof(u32) - 1;
+		if (copy_from_user(list->dccpsl_list,
+				   optval + sizeof(service),
+				   optlen - sizeof(service)) ||
+		    dccp_list_has_service(list, DCCP_SERVICE_INVALID_VALUE)) {
+			kfree(list);
+			return -EFAULT;
+		}
+	}
+
+	lock_sock(sk);
+	dp->dccps_service = service;
+
+	/* kfree() accepts NULL, so no guard is needed. */
+	kfree(dp->dccps_service_list);
+	dp->dccps_service_list = list;
+	release_sock(sk);
+
+	return 0;
+}
+
+/*
+ * setsockopt() entry point for DCCP sockets.
+ *
+ * Levels other than SOL_DCCP are forwarded to ip_setsockopt(). For SOL_DCCP
+ * the option value must be at least one int long; DCCP_SOCKOPT_SERVICE is
+ * handed to dccp_setsockopt_service(), DCCP_SOCKOPT_PACKET_SIZE is stored
+ * under the socket lock, and anything else yields -ENOPROTOOPT.
+ */
+int dccp_setsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int optlen)
+{
+	int val;
+	int err = 0;
+
+	if (level != SOL_DCCP)
+		return ip_setsockopt(sk, level, optname, optval, optlen);
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	if (optname == DCCP_SOCKOPT_SERVICE)
+		return dccp_setsockopt_service(sk, val, optval, optlen);
+
+	lock_sock(sk);
+	if (optname == DCCP_SOCKOPT_PACKET_SIZE)
+		dccp_sk(sk)->dccps_packet_size = val;
+	else
+		err = -ENOPROTOOPT;
+	release_sock(sk);
+
+	return err;
+}
+
+/*
+ * Copy the socket's service code, followed by its optional service list,
+ * to user space.
+ *
+ * @len is the size of the user buffer. On success the number of bytes
+ * written is stored through @optlen and 0 is returned. Returns -ENOENT if
+ * no service has been set, -EINVAL if the buffer is too small, and -EFAULT
+ * if a user-space access fails.
+ */
+static int dccp_getsockopt_service(struct sock *sk, int len,
+				   u32 __user *optval,
+				   int __user *optlen)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+	const struct dccp_service_list *list;
+	int list_bytes = 0;
+	int needed = sizeof(u32);
+	int rc;
+
+	lock_sock(sk);
+
+	if (dccp_service_not_initialized(sk)) {
+		rc = -ENOENT;
+		goto unlock;
+	}
+
+	list = dp->dccps_service_list;
+	if (list != NULL) {
+		list_bytes = list->dccpsl_nr * sizeof(u32);
+		needed += list_bytes;
+	}
+
+	if (needed > len) {
+		rc = -EINVAL;
+		goto unlock;
+	}
+
+	/* Length first, then the service code, then the list (if any). */
+	rc = -EFAULT;
+	if (put_user(needed, optlen))
+		goto unlock;
+	if (put_user(dp->dccps_service, optval))
+		goto unlock;
+	if (list != NULL &&
+	    copy_to_user(optval + 1, list->dccpsl_list, list_bytes))
+		goto unlock;
+	rc = 0;
+unlock:
+	release_sock(sk);
+	return rc;
+}
+
+/*
+ * getsockopt() entry point for DCCP sockets.
+ *
+ * Levels other than SOL_DCCP are forwarded to ip_getsockopt(). For SOL_DCCP
+ * the user buffer must hold at least one int. DCCP_SOCKOPT_SERVICE and the
+ * option ranges 128-191 / 192-255 are delegated to the service helper and
+ * to the RX / TX CCID respectively; DCCP_SOCKOPT_PACKET_SIZE is answered
+ * here. Unknown options yield -ENOPROTOOPT.
+ */
+int dccp_getsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int __user *optlen)
+{
+	struct dccp_sock *dp;
+	int val, len;
+
+	if (level != SOL_DCCP)
+		return ip_getsockopt(sk, level, optname, optval, optlen);
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	if (len < sizeof(int))
+		return -EINVAL;
+
+	dp = dccp_sk(sk);
+
+	/* Delegated options return straight out of the switch. */
+	switch (optname) {
+	case DCCP_SOCKOPT_SERVICE:
+		return dccp_getsockopt_service(sk, len,
+					       (u32 __user *)optval, optlen);
+	case 128 ... 191:
+		return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
+					     len, (u32 __user *)optval, optlen);
+	case 192 ... 255:
+		return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname,
+					     len, (u32 __user *)optval, optlen);
+	case DCCP_SOCKOPT_PACKET_SIZE:
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	/* Only DCCP_SOCKOPT_PACKET_SIZE reaches this point. */
+	val = dp->dccps_packet_size;
+	len = sizeof(dp->dccps_packet_size);
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Send one datagram of @len bytes taken from @msg->msg_iov.
+ *
+ * Fails with -EMSGSIZE when @len exceeds dccps_mss_cache. Otherwise it
+ * waits (bounded by the send timeout) for the connection to come up when the
+ * socket is not in OPEN, PARTOPEN or CLOSING, allocates an skb with room for
+ * sk_prot->max_header, copies the payload in and hands the skb to
+ * dccp_write_xmit().
+ *
+ * Returns @len on success, otherwise the non-zero error code of the step
+ * that failed.
+ */
+int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len)
+{
+ const struct dccp_sock *dp = dccp_sk(sk);
+ const int flags = msg->msg_flags;
+ const int noblock = flags & MSG_DONTWAIT;
+ struct sk_buff *skb;
+ int rc, size;
+ long timeo;
+
+ if (len > dp->dccps_mss_cache)
+ return -EMSGSIZE;
+
+ lock_sock(sk);
+ timeo = sock_sndtimeo(sk, noblock);
+
+ /*
+ * We have to use sk_stream_wait_connect here to set sk_write_pending,
+ * so that the trick in dccp_rcv_request_sent_state_process works.
+ */
+ /* Wait for a connection to finish. */
+ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING))
+ if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
+ goto out_release;
+
+ size = sk->sk_prot->max_header + len;
+ /* The socket lock is dropped around the skb allocation. */
+ release_sock(sk);
+ skb = sock_alloc_send_skb(sk, size, noblock, &rc);
+ lock_sock(sk);
+ if (skb == NULL)
+ goto out_release;
+
+ skb_reserve(skb, sk->sk_prot->max_header);
+ rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+ if (rc != 0)
+ goto out_discard;
+
+ rc = dccp_write_xmit(sk, skb, &timeo);
+ /*
+ * XXX we don't use sk_write_queue, so just discard the packet.
+ * Current plan however is to _use_ sk_write_queue with
+ * an algorithm similar to tcp_sendmsg, where the main difference
+ * is that in DCCP we have to respect packet boundaries, so
+ * no coalescing of skbs.
+ *
+ * This bug was _quickly_ found & fixed by just looking at an OSTRA
+ * generated callgraph 8) -acme
+ *
+ * NOTE(review): a non-zero rc from dccp_write_xmit() falls through to
+ * out_release without freeing the skb here - confirm that
+ * dccp_write_xmit() disposes of the skb on its error paths.
+ */
+out_release:
+ release_sock(sk);
+ return rc ? : len;
+out_discard:
+ kfree_skb(skb);
+ goto out_release;
+}
+
+/*
+ * Receive one datagram into @msg->msg_iov.
+ *
+ * Looks at the head of sk_receive_queue: DATA and DATAACK packets are copied
+ * to the caller (at most @len bytes, MSG_TRUNC is set when the packet is
+ * longer), RESET and CLOSE packets end the read with a length of 0, and any
+ * other packet type is dropped. With an empty queue the socket status is
+ * checked and, if nothing ends the read, the caller sleeps in sk_wait_data()
+ * until the receive timeout expires.
+ *
+ * Returns the number of bytes copied, 0 at end of connection, or a negative
+ * errno. Note that the result travels in @len, a size_t, and is converted
+ * to int on return.
+ */
+int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len, int nonblock, int flags, int *addr_len)
+{
+ const struct dccp_hdr *dh;
+ long timeo;
+
+ lock_sock(sk);
+
+ if (sk->sk_state == DCCP_LISTEN) {
+ len = -ENOTCONN;
+ goto out;
+ }
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+
+ do {
+ /* Peek only: the skb is unlinked later by sk_eat_skb(). */
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+
+ if (skb == NULL)
+ goto verify_sock_status;
+
+ dh = dccp_hdr(skb);
+
+ if (dh->dccph_type == DCCP_PKT_DATA ||
+ dh->dccph_type == DCCP_PKT_DATAACK)
+ goto found_ok_skb;
+
+ if (dh->dccph_type == DCCP_PKT_RESET ||
+ dh->dccph_type == DCCP_PKT_CLOSE) {
+ dccp_pr_debug("found fin ok!\n");
+ len = 0;
+ goto found_fin_ok;
+ }
+ /* Any other packet type carries nothing for the reader: drop it. */
+ dccp_pr_debug("packet_type=%s\n",
+ dccp_packet_name(dh->dccph_type));
+ sk_eat_skb(sk, skb);
+verify_sock_status:
+ if (sock_flag(sk, SOCK_DONE)) {
+ len = 0;
+ break;
+ }
+
+ if (sk->sk_err) {
+ len = sock_error(sk);
+ break;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ len = 0;
+ break;
+ }
+
+ if (sk->sk_state == DCCP_CLOSED) {
+ if (!sock_flag(sk, SOCK_DONE)) {
+ /* This occurs when user tries to read
+ * from never connected socket.
+ */
+ len = -ENOTCONN;
+ break;
+ }
+ len = 0;
+ break;
+ }
+
+ /* A zero timeout means the caller asked not to block. */
+ if (!timeo) {
+ len = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ len = sock_intr_errno(timeo);
+ break;
+ }
+
+ sk_wait_data(sk, &timeo);
+ continue;
+ found_ok_skb:
+ /* Clamp to the packet size, or flag truncation of the packet. */
+ if (len > skb->len)
+ len = skb->len;
+ else if (len < skb->len)
+ msg->msg_flags |= MSG_TRUNC;
+
+ if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
+ /* Exception. Bailout! */
+ len = -EFAULT;
+ break;
+ }
+ found_fin_ok:
+ /* With MSG_PEEK the packet stays queued for the next read. */
+ if (!(flags & MSG_PEEK))
+ sk_eat_skb(sk, skb);
+ break;
+ } while (1);
+out:
+ release_sock(sk);
+ return len;
+}
+
+/*
+ * listen() handler for DCCP sockets.
+ *
+ * Only an unconnected SOCK_DCCP socket whose protocol state is CLOSED or
+ * LISTEN is accepted; anything else returns -EINVAL. A socket that is not
+ * yet listening is moved to the listening state with dccp_listen_start();
+ * one that already listens only has its backlog updated.
+ */
+static int inet_dccp_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	unsigned char state_now;
+	int rc = -EINVAL;
+
+	lock_sock(sk);
+
+	if (sock->state != SS_UNCONNECTED)
+		goto unlock;
+	if (sock->type != SOCK_DCCP)
+		goto unlock;
+
+	state_now = sk->sk_state;
+	if (((1 << state_now) & (DCCPF_CLOSED | DCCPF_LISTEN)) == 0)
+		goto unlock;
+
+	/*
+	 * FIXME: here it probably should be sk->sk_prot->listen_start
+	 * see tcp_listen_start
+	 */
+	if (state_now != DCCP_LISTEN) {
+		rc = dccp_listen_start(sk);
+		if (rc)
+			goto unlock;
+	}
+
+	/* Already listening (or just started): only the backlog changes. */
+	sk->sk_max_ack_backlog = backlog;
+	rc = 0;
+
+unlock:
+	release_sock(sk);
+	return rc;
+}
+
+/*
+ * Close-time transition table, indexed by the current sk_state. Each entry
+ * is the state to move to, optionally OR-ed with DCCP_ACTION_FIN;
+ * dccp_close_state() separates the two with DCCP_STATE_MASK.
+ */
+static const unsigned char dccp_new_state[] = {
+ /* current state: new state: action: */
+ [0] = DCCP_CLOSED,
+ [DCCP_OPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
+ [DCCP_REQUESTING] = DCCP_CLOSED,
+ [DCCP_PARTOPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
+ [DCCP_LISTEN] = DCCP_CLOSED,
+ [DCCP_RESPOND] = DCCP_CLOSED,
+ [DCCP_CLOSING] = DCCP_CLOSED,
+ [DCCP_TIME_WAIT] = DCCP_CLOSED,
+ [DCCP_CLOSED] = DCCP_CLOSED,
+};
+
+/*
+ * Move the socket to the state dccp_new_state[] prescribes for a close.
+ *
+ * Returns non-zero (DCCP_ACTION_FIN) when the table entry for the state the
+ * socket was in carries the DCCP_ACTION_FIN flag, 0 otherwise.
+ */
+static int dccp_close_state(struct sock *sk)
+{
+	const int entry = dccp_new_state[sk->sk_state];
+	const int target = entry & DCCP_STATE_MASK;
+	const int action = entry & DCCP_ACTION_FIN;
+
+	if (target == sk->sk_state)
+		return action;
+
+	dccp_set_state(sk, target);
+	return action;
+}
+
+/*
+ * Close a DCCP socket on behalf of its owner.
+ *
+ * Marks both directions shut down, stops a listening socket or else flushes
+ * the receive queue and starts the protocol close (or disconnects at once
+ * for a zero-linger socket), waits up to @timeout, then orphans the socket
+ * and destroys it if it has already reached DCCP_CLOSED.
+ */
+void dccp_close(struct sock *sk, long timeout)
+{
+ struct sk_buff *skb;
+
+ lock_sock(sk);
+
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ if (sk->sk_state == DCCP_LISTEN) {
+ dccp_set_state(sk, DCCP_CLOSED);
+
+ /* Special case. */
+ inet_csk_listen_stop(sk);
+
+ goto adjudge_to_death;
+ }
+
+ /*
+ * We need to flush the recv. buffs. We do this only on the
+ * descriptor close, not protocol-sourced closes, because the
+ * reader process may not have drained the data yet!
+ */
+ /* FIXME: check for unread data */
+ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ __kfree_skb(skb);
+ }
+
+ if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+ /* Check zero linger _after_ checking for unread data. */
+ sk->sk_prot->disconnect(sk, 0);
+ } else if (dccp_close_state(sk)) {
+ /* The state table asked for a CLOSE to be sent to the peer. */
+ dccp_send_close(sk, 1);
+ }
+
+ sk_stream_wait_close(sk, timeout);
+
+adjudge_to_death:
+ /*
+ * It is the last release_sock in its life. It will remove backlog.
+ */
+ release_sock(sk);
+ /*
+ * Now socket is owned by kernel and we acquire BH lock
+ * to finish close. No need to check for user refs.
+ */
+ local_bh_disable();
+ bh_lock_sock(sk);
+ BUG_TRAP(!sock_owned_by_user(sk));
+
+ /* Hold a reference across the orphaning; dropped by sock_put() below. */
+ sock_hold(sk);
+ sock_orphan(sk);
+
+ /*
+ * The last release_sock may have processed the CLOSE or RESET
+ * packet moving sock to CLOSED state, if not we have to fire
+ * the CLOSE/CLOSEREQ retransmission timer, see "8.3. Termination"
+ * in draft-ietf-dccp-spec-11. -acme
+ */
+ if (sk->sk_state == DCCP_CLOSING) {
+ /* FIXME: should start at 2 * RTT */
+ /* Timer for repeating the CLOSE/CLOSEREQ until an answer. */
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ DCCP_RTO_MAX);
+#if 0
+ /* Yeah, we should use sk->sk_prot->orphan_count, etc */
+ dccp_set_state(sk, DCCP_CLOSED);
+#endif
+ }
+
+ atomic_inc(sk->sk_prot->orphan_count);
+ if (sk->sk_state == DCCP_CLOSED)
+ inet_csk_destroy_sock(sk);
+
+ /* Otherwise, socket is reprieved until protocol close. */
+
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ sock_put(sk);
+}
+
+/*
+ * Protocol-level shutdown hook. Currently a stub: it only emits a debug
+ * trace and ignores @how.
+ */
+void dccp_shutdown(struct sock *sk, int how)
+{
+ dccp_pr_debug("entry\n");
+}
+
+/*
+ * Socket-layer operations for PF_INET DCCP sockets. Most entries reuse the
+ * generic inet_* / sock_common_* / sock_no_* handlers; only poll and listen
+ * point at DCCP-specific functions (dccp_poll, inet_dccp_listen).
+ */
+static struct proto_ops inet_dccp_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = inet_accept,
+ .getname = inet_getname,
+ /* FIXME: work on tcp_poll to rename it to inet_csk_poll */
+ .poll = dccp_poll,
+ .ioctl = inet_ioctl,
+ /* FIXME: work on inet_listen to rename it to sock_common_listen */
+ .listen = inet_dccp_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+extern struct net_proto_family inet_family_ops;
+
+/*
+ * Protocol switch entry tying (SOCK_DCCP, IPPROTO_DCCP) to dccp_v4_prot and
+ * inet_dccp_ops. Registered in dccp_init() and unregistered in dccp_fini().
+ */
+static struct inet_protosw dccp_v4_protosw = {
+ .type = SOCK_DCCP,
+ .protocol = IPPROTO_DCCP,
+ .prot = &dccp_v4_prot,
+ .ops = &inet_dccp_ops,
+ .capability = -1,
+ .no_check = 0,
+ .flags = 0,
+};
+
+/*
+ * This is the global socket data structure used for responding to
+ * the Out-of-the-blue (OOTB) packets. A control sock will be created
+ * for this socket at the initialization time.
+ */
+struct socket *dccp_ctl_socket;
+
+static char dccp_ctl_socket_err_msg[] __initdata =
+ KERN_ERR "DCCP: Failed to create the control socket.\n";
+
+/*
+ * Create the kernel-internal control socket (dccp_ctl_socket).
+ *
+ * On success the socket is switched to GFP_ATOMIC allocations, its unicast
+ * TTL is set to -1 and it is unhashed so that it never receives packets.
+ * Returns the result of sock_create_kern(); an error is also logged.
+ */
+static int __init dccp_ctl_sock_init(void)
+{
+	struct sock *ctl_sk;
+	int rc = sock_create_kern(PF_INET, SOCK_DCCP, IPPROTO_DCCP,
+				  &dccp_ctl_socket);
+
+	if (rc < 0) {
+		printk(dccp_ctl_socket_err_msg);
+		return rc;
+	}
+
+	ctl_sk = dccp_ctl_socket->sk;
+	ctl_sk->sk_allocation = GFP_ATOMIC;
+	inet_sk(ctl_sk)->uc_ttl = -1;
+
+	/* Unhash it so that IP input processing does not even
+	 * see it, we do not wish this socket to see incoming
+	 * packets.
+	 */
+	ctl_sk->sk_prot->unhash(ctl_sk);
+
+	return rc;
+}
+
+#ifdef CONFIG_IP_DCCP_UNLOAD_HACK
+/*
+ * Release the control socket, if one exists, and clear the global pointer
+ * so that a second call is harmless.
+ */
+void dccp_ctl_sock_exit(void)
+{
+	if (dccp_ctl_socket == NULL)
+		return;
+
+	sock_release(dccp_ctl_socket);
+	dccp_ctl_socket = NULL;
+}
+
+EXPORT_SYMBOL_GPL(dccp_ctl_sock_exit);
+#endif
+
+/*
+ * Allocate the two per-CPU dccp_mib statistics blocks.
+ *
+ * Returns 0 on success or -ENOMEM; on failure nothing stays allocated and
+ * dccp_statistics[0] is reset to NULL.
+ */
+static int __init init_dccp_v4_mibs(void)
+{
+	dccp_statistics[0] = alloc_percpu(struct dccp_mib);
+	if (dccp_statistics[0] == NULL)
+		return -ENOMEM;
+
+	dccp_statistics[1] = alloc_percpu(struct dccp_mib);
+	if (dccp_statistics[1] == NULL) {
+		/* Undo the first allocation before reporting failure. */
+		free_percpu(dccp_statistics[0]);
+		dccp_statistics[0] = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Read-only module parameter. When non-zero, dccp_init() derives the size
+ * of the established hash table from it instead of from num_physpages.
+ */
+static int thash_entries;
+module_param(thash_entries, int, 0444);
+MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+int dccp_debug;
+module_param(dccp_debug, int, 0444);
+MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
+#endif
+
+/*
+ * Module initialisation.
+ *
+ * Registers dccp_v4_prot, creates the bind-bucket slab cache, sizes and
+ * allocates the established and bind hash tables, allocates the MIB
+ * counters, hooks DCCP into the IPv4 protocol and protosw tables and finally
+ * creates the control socket.
+ *
+ * Returns 0 on success. On any failure everything set up so far is undone
+ * in reverse order and a negative errno is returned: -ENOBUFS when the slab
+ * cache or a hash table cannot be allocated, the error from
+ * init_dccp_v4_mibs(), -EAGAIN when the protocol cannot be added, or the
+ * error from proto_register() / dccp_ctl_sock_init().
+ */
+static int __init dccp_init(void)
+{
+	unsigned long goal;
+	int ehash_order, bhash_order, i;
+	int rc = proto_register(&dccp_v4_prot, 1);
+
+	if (rc)
+		goto out;
+
+	/*
+	 * rc is 0 here; give the allocation failures below a real error
+	 * code so that a failed load is not reported as success.
+	 */
+	rc = -ENOBUFS;
+	dccp_hashinfo.bind_bucket_cachep =
+		kmem_cache_create("dccp_bind_bucket",
+				  sizeof(struct inet_bind_bucket), 0,
+				  SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!dccp_hashinfo.bind_bucket_cachep)
+		goto out_proto_unregister;
+
+	/*
+	 * Size and allocate the main established and bind bucket
+	 * hash tables.
+	 *
+	 * The methodology is similar to that of the buffer cache.
+	 */
+	if (num_physpages >= (128 * 1024))
+		goal = num_physpages >> (21 - PAGE_SHIFT);
+	else
+		goal = num_physpages >> (23 - PAGE_SHIFT);
+
+	/* An explicit thash_entries module parameter overrides the guess. */
+	if (thash_entries)
+		goal = (thash_entries *
+			sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
+	for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
+		;
+	/* Try progressively smaller orders until an allocation succeeds. */
+	do {
+		dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE /
+					sizeof(struct inet_ehash_bucket);
+		/* Only half of the buckets are counted in ehash_size. */
+		dccp_hashinfo.ehash_size >>= 1;
+		/* Round down to a power of two. */
+		while (dccp_hashinfo.ehash_size &
+		       (dccp_hashinfo.ehash_size - 1))
+			dccp_hashinfo.ehash_size--;
+		dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
+			__get_free_pages(GFP_ATOMIC, ehash_order);
+	} while (!dccp_hashinfo.ehash && --ehash_order > 0);
+
+	if (!dccp_hashinfo.ehash) {
+		printk(KERN_CRIT "Failed to allocate DCCP "
+				 "established hash table\n");
+		goto out_free_bind_bucket_cachep;
+	}
+
+	/* Both halves of the table (2 * ehash_size buckets) are set up. */
+	for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
+		rwlock_init(&dccp_hashinfo.ehash[i].lock);
+		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
+	}
+
+	bhash_order = ehash_order;
+
+	do {
+		dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
+					sizeof(struct inet_bind_hashbucket);
+		/* Cap the bind table at 64K buckets where possible. */
+		if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
+		    bhash_order > 0)
+			continue;
+		dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
+			__get_free_pages(GFP_ATOMIC, bhash_order);
+	} while (!dccp_hashinfo.bhash && --bhash_order >= 0);
+
+	if (!dccp_hashinfo.bhash) {
+		printk(KERN_CRIT "Failed to allocate DCCP bind hash table\n");
+		goto out_free_dccp_ehash;
+	}
+
+	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
+		spin_lock_init(&dccp_hashinfo.bhash[i].lock);
+		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
+	}
+
+	/* Propagate the helper's error code instead of discarding it. */
+	rc = init_dccp_v4_mibs();
+	if (rc)
+		goto out_free_dccp_bhash;
+
+	rc = -EAGAIN;
+	if (inet_add_protocol(&dccp_protocol, IPPROTO_DCCP))
+		goto out_free_dccp_v4_mibs;
+
+	inet_register_protosw(&dccp_v4_protosw);
+
+	rc = dccp_ctl_sock_init();
+	if (rc)
+		goto out_unregister_protosw;
+out:
+	return rc;
+out_unregister_protosw:
+	inet_unregister_protosw(&dccp_v4_protosw);
+	inet_del_protocol(&dccp_protocol, IPPROTO_DCCP);
+out_free_dccp_v4_mibs:
+	free_percpu(dccp_statistics[0]);
+	free_percpu(dccp_statistics[1]);
+	dccp_statistics[0] = dccp_statistics[1] = NULL;
+out_free_dccp_bhash:
+	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
+	dccp_hashinfo.bhash = NULL;
+out_free_dccp_ehash:
+	free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
+	dccp_hashinfo.ehash = NULL;
+out_free_bind_bucket_cachep:
+	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
+	dccp_hashinfo.bind_bucket_cachep = NULL;
+out_proto_unregister:
+	proto_unregister(&dccp_v4_prot);
+	goto out;
+}
+
+static const char dccp_del_proto_err_msg[] __exitdata =
+ KERN_ERR "can't remove dccp net_protocol\n";
+
+/*
+ * Module exit: unregister the protosw entry and the IPv4 protocol handler,
+ * then free the MIB counters, both hash tables and the bind-bucket cache,
+ * and finally unregister dccp_v4_prot.
+ */
+static void __exit dccp_fini(void)
+{
+ inet_unregister_protosw(&dccp_v4_protosw);
+
+ if (inet_del_protocol(&dccp_protocol, IPPROTO_DCCP) < 0)
+ printk(dccp_del_proto_err_msg);
+
+ free_percpu(dccp_statistics[0]);
+ free_percpu(dccp_statistics[1]);
+ free_pages((unsigned long)dccp_hashinfo.bhash,
+ get_order(dccp_hashinfo.bhash_size *
+ sizeof(struct inet_bind_hashbucket)));
+ /*
+ * NOTE(review): dccp_init() halves ehash_size after sizing the
+ * allocation, so the order computed here may be smaller than the
+ * order the table was allocated with - confirm.
+ */
+ free_pages((unsigned long)dccp_hashinfo.ehash,
+ get_order(dccp_hashinfo.ehash_size *
+ sizeof(struct inet_ehash_bucket)));
+ kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
+ proto_unregister(&dccp_v4_prot);
+}
+
+module_init(dccp_init);
+module_exit(dccp_fini);
+
+/*
+ * __stringify doesn't like enums, so use the SOCK_DCCP (6) and IPPROTO_DCCP
+ * (33) values directly. Also cover the case where the protocol is not
+ * specified, i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
+ */
+MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-33-type-6");
+MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-0-type-6");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
+MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
new file mode 100644
index 00000000000..aa34b576e22
--- /dev/null
+++ b/net/dccp/timer.c
@@ -0,0 +1,255 @@
+/*
+ * net/dccp/timer.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+
+#include "dccp.h"
+
+static void dccp_write_timer(unsigned long data);
+static void dccp_keepalive_timer(unsigned long data);
+static void dccp_delack_timer(unsigned long data);
+
+/*
+ * Install the DCCP handlers (dccp_write_timer, dccp_delack_timer and
+ * dccp_keepalive_timer) as the connection-socket timers of @sk.
+ */
+void dccp_init_xmit_timers(struct sock *sk)
+{
+ inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
+ &dccp_keepalive_timer);
+}
+
+/*
+ * Abort a connection whose retransmissions have been exhausted: record the
+ * pending soft error (or ETIMEDOUT if there is none) as the socket error,
+ * report it, send a RESET with code "aborted", finish the socket and count
+ * the event in DCCP_MIB_ABORTONTIMEOUT.
+ */
+static void dccp_write_err(struct sock *sk)
+{
+	const int soft_err = sk->sk_err_soft;
+
+	sk->sk_err = soft_err ? soft_err : ETIMEDOUT;
+	sk->sk_error_report(sk);
+
+	dccp_v4_send_reset(sk, DCCP_RESET_CODE_ABORTED);
+	dccp_done(sk);
+	DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT);
+}
+
+/*
+ * A write timeout has occurred. Process the after effects.
+ *
+ * Picks the retry limit for the current state (icsk_syn_retries or 3 while
+ * REQUESTING/PARTOPEN, 15 otherwise), gives negative advice on the cached
+ * route where appropriate, and aborts the connection with dccp_write_err()
+ * once icsk_retransmits has reached the limit.
+ *
+ * Returns 1 if the connection was aborted, 0 if retransmission may go on.
+ */
+static int dccp_write_timeout(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ int retry_until;
+
+ if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
+ if (icsk->icsk_retransmits != 0)
+ dst_negative_advice(&sk->sk_dst_cache);
+ retry_until = icsk->icsk_syn_retries ? :
+ /* FIXME! */ 3 /* FIXME! sysctl_tcp_syn_retries */;
+ } else {
+ if (icsk->icsk_retransmits >=
+ /* FIXME! sysctl_tcp_retries1 */ 5 /* FIXME! */) {
+ /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu
+ black hole detection. :-(
+
+ This is the place to do it. It is not done. I do not want
+ to do it. It is disgusting. It does not work in any
+ case. Let me cite the same draft, which requires
+ us to implement this:
+
+ "The one security concern raised by this memo is that ICMP black holes
+ are often caused by over-zealous security administrators who block
+ all ICMP messages. It is vitally important that those who design and
+ deploy security systems understand the impact of strict filtering on
+ upper-layer protocols. The safest web site in the world is worthless
+ if most TCP implementations cannot transfer data from it. It would
+ be far nicer to have all of the black holes fixed rather than fixing
+ all of the TCP implementations."
+
+ Golden words :-).
+ */
+
+ dst_negative_advice(&sk->sk_dst_cache);
+ }
+
+ retry_until = /* FIXME! */ 15 /* FIXME! sysctl_tcp_retries2 */;
+ /*
+ * FIXME: see tcp_write_timeout and tcp_out_of_resources
+ */
+ }
+
+ if (icsk->icsk_retransmits >= retry_until) {
+ /* Has it gone just too far? */
+ dccp_write_err(sk);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Delayed-ACK timer handler; @data is the struct sock pointer.
+ *
+ * This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff.
+ * If the socket is owned by a user context the timer is re-armed for
+ * TCP_DELACK_MIN later. Otherwise, once the ACK timeout has really expired,
+ * the ACK timeout (ATO) is adjusted and a pending ACK is sent.
+ * The socket reference is dropped with sock_put() on every path.
+ */
+static void dccp_delack_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+ icsk->icsk_ack.blocked = 1;
+ NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
+ sk_reset_timer(sk, &icsk->icsk_delack_timer,
+ jiffies + TCP_DELACK_MIN);
+ goto out;
+ }
+
+ /* Nothing to do for a closed socket or when no ACK timer is pending. */
+ if (sk->sk_state == DCCP_CLOSED ||
+ !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+ goto out;
+ /* Fired early: re-arm for the recorded expiry time. */
+ if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_delack_timer,
+ icsk->icsk_ack.timeout);
+ goto out;
+ }
+
+ icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
+
+ if (inet_csk_ack_scheduled(sk)) {
+ if (!icsk->icsk_ack.pingpong) {
+ /* Delayed ACK missed: inflate ATO. */
+ icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
+ icsk->icsk_rto);
+ } else {
+ /* Delayed ACK missed: leave pingpong mode and
+ * deflate ATO.
+ */
+ icsk->icsk_ack.pingpong = 0;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ }
+ dccp_send_ack(sk);
+ NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
+ }
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/*
+ * The DCCP retransmit timer.
+ *
+ * Called from dccp_write_timer() for ICSK_TIME_RETRANS. Gives up through
+ * dccp_write_timeout() when the retry limit is reached; otherwise
+ * retransmits sk_send_head, doubles the RTO (capped at DCCP_RTO_MAX) and
+ * re-arms the timer. A failed retransmission re-arms the timer without
+ * backing off.
+ */
+static void dccp_retransmit_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ /*
+ * sk->sk_send_head has to have one skb with
+ * DCCP_SKB_CB(skb)->dccpd_type set to one of the retransmittable DCCP
+ * packet types (REQUEST, RESPONSE, the ACK in the 3way handshake
+ * (PARTOPEN timer), etc).
+ */
+ BUG_TRAP(sk->sk_send_head != NULL);
+
+ /*
+ * More than 4MSL (8 minutes) has passed, a RESET(aborted) was
+ * sent, no need to retransmit, this sock is dead.
+ */
+ if (dccp_write_timeout(sk))
+ goto out;
+
+ /*
+ * We want to know the number of packets retransmitted, not the
+ * total number of retransmissions of clones of original packets.
+ */
+ if (icsk->icsk_retransmits == 0)
+ DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS);
+
+ if (dccp_retransmit_skb(sk, sk->sk_send_head) < 0) {
+ /*
+ * Retransmission failed because of local congestion,
+ * do not backoff.
+ */
+ if (icsk->icsk_retransmits == 0)
+ icsk->icsk_retransmits = 1;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ min(icsk->icsk_rto,
+ TCP_RESOURCE_PROBE_INTERVAL),
+ DCCP_RTO_MAX);
+ goto out;
+ }
+
+ icsk->icsk_backoff++;
+ icsk->icsk_retransmits++;
+
+ /* Exponential backoff of the retransmission timeout. */
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
+ DCCP_RTO_MAX);
+ /* After several retransmissions drop the cached route. */
+ if (icsk->icsk_retransmits > 3 /* FIXME: sysctl_dccp_retries1 */)
+ __sk_dst_reset(sk);
+out:;
+}
+
+/*
+ * Retransmit-timer callback; @data is the struct sock pointer.
+ *
+ * When the socket is owned by a user context the timer is retried HZ/20
+ * jiffies later. When it fired before icsk_timeout it is re-armed for that
+ * time. Otherwise the pending event is consumed and, if it is
+ * ICSK_TIME_RETRANS, handed to dccp_retransmit_timer(). The socket
+ * reference is dropped with sock_put() on every path.
+ */
+static void dccp_write_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int pending_event = 0;
+
+	bh_lock_sock(sk);
+
+	if (sock_owned_by_user(sk)) {
+		/* Try again later */
+		sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
+			       jiffies + (HZ / 20));
+	} else if (sk->sk_state != DCCP_CLOSED && icsk->icsk_pending) {
+		if (time_after(icsk->icsk_timeout, jiffies)) {
+			/* Fired early: wait for the real deadline. */
+			sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
+				       icsk->icsk_timeout);
+		} else {
+			pending_event = icsk->icsk_pending;
+			icsk->icsk_pending = 0;
+
+			if (pending_event == ICSK_TIME_RETRANS)
+				dccp_retransmit_timer(sk);
+		}
+	}
+
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/*
+ * Timer for listening sockets: prunes the request-socket queue of @sk via
+ * inet_csk_reqsk_queue_prune(). Called from dccp_keepalive_timer() when the
+ * socket is in DCCP_LISTEN.
+ */
+static void dccp_response_timer(struct sock *sk)
+{
+ inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT,
+ DCCP_RTO_MAX);
+}
+
+/*
+ * Keepalive-timer callback; @data is the struct sock pointer.
+ *
+ * Only acts when the socket is not owned by a user context (otherwise the
+ * timer is retried HZ/20 jiffies later). For a listening socket it runs
+ * dccp_response_timer(); other states need no work here. The socket
+ * reference is dropped with sock_put() on every path.
+ */
+static void dccp_keepalive_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+
+	bh_lock_sock(sk);
+
+	if (sock_owned_by_user(sk))
+		inet_csk_reset_keepalive_timer(sk, HZ / 20);	/* Try again later. */
+	else if (sk->sk_state == DCCP_LISTEN)
+		dccp_response_timer(sk);
+
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 96a02800cd2..3f25cadccdd 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -118,7 +118,7 @@ Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat
#include <linux/netfilter.h>
#include <linux/seq_file.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/flow.h>
#include <asm/system.h>
#include <asm/ioctls.h>
@@ -452,7 +452,7 @@ static struct proto dn_proto = {
.obj_size = sizeof(struct dn_sock),
};
-static struct sock *dn_alloc_sock(struct socket *sock, int gfp)
+static struct sock *dn_alloc_sock(struct socket *sock, gfp_t gfp)
{
struct dn_scp *scp;
struct sock *sk = sk_alloc(PF_DECnet, gfp, &dn_proto, 1);
@@ -719,22 +719,9 @@ static int dn_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (saddr->sdn_flags & ~SDF_WILD)
return -EINVAL;
-#if 1
if (!capable(CAP_NET_BIND_SERVICE) && (saddr->sdn_objnum ||
(saddr->sdn_flags & SDF_WILD)))
return -EACCES;
-#else
- /*
- * Maybe put the default actions in the default security ops for
- * dn_prot_sock ? Would be nice if the capable call would go there
- * too.
- */
- if (security_dn_prot_sock(saddr) &&
- !capable(CAP_NET_BIND_SERVICE) ||
- saddr->sdn_objnum || (saddr->sdn_flags & SDF_WILD))
- return -EACCES;
-#endif
-
if (!(saddr->sdn_flags & SDF_WILD)) {
if (dn_ntohs(saddr->sdn_nodeaddrl)) {
@@ -804,7 +791,7 @@ static int dn_auto_bind(struct socket *sock)
return rv;
}
-static int dn_confirm_accept(struct sock *sk, long *timeo, int allocation)
+static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation)
{
struct dn_scp *scp = DN_SK(sk);
DEFINE_WAIT(wait);
@@ -1763,7 +1750,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
nskb = skb->next;
if (skb->len == 0) {
- skb_unlink(skb);
+ skb_unlink(skb, queue);
kfree_skb(skb);
/*
* N.B. Don't refer to skb or cb after this point
@@ -1876,17 +1863,27 @@ static inline unsigned int dn_current_mss(struct sock *sk, int flags)
return mss_now;
}
-static int dn_error(struct sock *sk, int flags, int err)
+/*
+ * N.B. We get the timeout wrong here, but then we always did get it
+ * wrong before and this is another step along the road to correcting
+ * it. It ought to get updated each time we pass through the routine,
+ * but in practise it probably doesn't matter too much for now.
+ */
+static inline struct sk_buff *dn_alloc_send_pskb(struct sock *sk,
+ unsigned long datalen, int noblock,
+ int *errcode)
{
- if (err == -EPIPE)
- err = sock_error(sk) ? : -EPIPE;
- if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
- send_sig(SIGPIPE, current, 0);
- return err;
+ struct sk_buff *skb = sock_alloc_send_skb(sk, datalen,
+ noblock, errcode);
+ if (skb) {
+ skb->protocol = __constant_htons(ETH_P_DNA_RT);
+ skb->pkt_type = PACKET_OUTGOING;
+ }
+ return skb;
}
static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t size)
+ struct msghdr *msg, size_t size)
{
struct sock *sk = sock->sk;
struct dn_scp *scp = DN_SK(sk);
@@ -1901,7 +1898,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
struct dn_skb_cb *cb;
size_t len;
unsigned char fctype;
- long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ long timeo;
if (flags & ~(MSG_TRYHARD|MSG_OOB|MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|MSG_MORE|MSG_CMSG_COMPAT))
return -EOPNOTSUPP;
@@ -1909,18 +1906,21 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
if (addr_len && (addr_len != sizeof(struct sockaddr_dn)))
return -EINVAL;
+ lock_sock(sk);
+ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
/*
* The only difference between stream sockets and sequenced packet
* sockets is that the stream sockets always behave as if MSG_EOR
* has been set.
*/
if (sock->type == SOCK_STREAM) {
- if (flags & MSG_EOR)
- return -EINVAL;
+ if (flags & MSG_EOR) {
+ err = -EINVAL;
+ goto out;
+ }
flags |= MSG_EOR;
}
- lock_sock(sk);
err = dn_check_state(sk, addr, addr_len, &timeo, flags);
if (err)
@@ -1989,8 +1989,12 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
/*
* Get a suitably sized skb.
+ * 64 is a bit of a hack really, but its larger than any
+ * link-layer headers and has served us well as a good
+ * guess as to their real length.
*/
- skb = dn_alloc_send_skb(sk, &len, flags & MSG_DONTWAIT, timeo, &err);
+ skb = dn_alloc_send_pskb(sk, len + 64 + DN_MAX_NSP_DATA_HEADER,
+ flags & MSG_DONTWAIT, &err);
if (err)
break;
@@ -2000,7 +2004,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
cb = DN_SKB_CB(skb);
- skb_reserve(skb, DN_MAX_NSP_DATA_HEADER);
+ skb_reserve(skb, 64 + DN_MAX_NSP_DATA_HEADER);
if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
err = -EFAULT;
@@ -2045,7 +2049,7 @@ out:
return sent ? sent : err;
out_err:
- err = dn_error(sk, flags, err);
+ err = sk_stream_error(sk, flags, err);
release_sock(sk);
return err;
}
@@ -2073,7 +2077,7 @@ static struct notifier_block dn_dev_notifier = {
.notifier_call = dn_device_event,
};
-extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *);
+extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);
static struct packet_type dn_dix_packet_type = {
.type = __constant_htons(ETH_P_DNA_RT),
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 00233ecbc9c..5610bb16dbf 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -752,16 +752,16 @@ static void rtmsg_ifa(int event, struct dn_ifaddr *ifa)
skb = alloc_skb(size, GFP_KERNEL);
if (!skb) {
- netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, ENOBUFS);
+ netlink_set_err(rtnl, 0, RTNLGRP_DECnet_IFADDR, ENOBUFS);
return;
}
if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) {
kfree_skb(skb);
- netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, EINVAL);
+ netlink_set_err(rtnl, 0, RTNLGRP_DECnet_IFADDR, EINVAL);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_IFADDR;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_DECnet_IFADDR, GFP_KERNEL);
+ NETLINK_CB(skb).dst_group = RTNLGRP_DECnet_IFADDR;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_DECnet_IFADDR, GFP_KERNEL);
}
static int dn_dev_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index f32dba9e26f..8d0cc3cf3e4 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -148,12 +148,12 @@ static int dn_neigh_construct(struct neighbour *neigh)
__neigh_parms_put(neigh->parms);
neigh->parms = neigh_parms_clone(parms);
- rcu_read_unlock();
if (dn_db->use_long)
neigh->ops = &dn_long_ops;
else
neigh->ops = &dn_short_ops;
+ rcu_read_unlock();
if (dn->flags & DN_NDFLAG_P3)
neigh->ops = &dn_phase3_ops;
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 202dbde9850..369f25b60f3 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -60,7 +60,7 @@
#include <linux/inet.h>
#include <linux/route.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/system.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index 8cce1fdbda9..c96c767b1f7 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -117,7 +117,7 @@ try_again:
* The eventual aim is for each socket to have a cached header size
* for its outgoing packets, and to set hdr from this when sk != NULL.
*/
-struct sk_buff *dn_alloc_skb(struct sock *sk, int size, int pri)
+struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri)
{
struct sk_buff *skb;
int hdr = 64;
@@ -137,69 +137,6 @@ struct sk_buff *dn_alloc_skb(struct sock *sk, int size, int pri)
}
/*
- * Wrapper for the above, for allocs of data skbs. We try and get the
- * whole size thats been asked for (plus 11 bytes of header). If this
- * fails, then we try for any size over 16 bytes for SOCK_STREAMS.
- */
-struct sk_buff *dn_alloc_send_skb(struct sock *sk, size_t *size, int noblock, long timeo, int *err)
-{
- int space;
- int len;
- struct sk_buff *skb = NULL;
-
- *err = 0;
-
- while(skb == NULL) {
- if (signal_pending(current)) {
- *err = sock_intr_errno(timeo);
- break;
- }
-
- if (sk->sk_shutdown & SEND_SHUTDOWN) {
- *err = EINVAL;
- break;
- }
-
- if (sk->sk_err)
- break;
-
- len = *size + 11;
- space = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
-
- if (space < len) {
- if ((sk->sk_socket->type == SOCK_STREAM) &&
- (space >= (16 + 11)))
- len = space;
- }
-
- if (space < len) {
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- if (noblock) {
- *err = EWOULDBLOCK;
- break;
- }
-
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
- SOCK_SLEEP_PRE(sk)
-
- if ((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) <
- len)
- schedule();
-
- SOCK_SLEEP_POST(sk)
- continue;
- }
-
- if ((skb = dn_alloc_skb(sk, len, sk->sk_allocation)) == NULL)
- continue;
-
- *size = len - 11;
- }
-
- return skb;
-}
-
-/*
* Calculate persist timer based upon the smoothed round
* trip time and the variance. Backoff according to the
* nsp_backoff[] array.
@@ -273,7 +210,8 @@ static void dn_nsp_rtt(struct sock *sk, long rtt)
*
* Returns: The number of times the packet has been sent previously
*/
-static inline unsigned dn_nsp_clone_and_send(struct sk_buff *skb, int gfp)
+static inline unsigned dn_nsp_clone_and_send(struct sk_buff *skb,
+ gfp_t gfp)
{
struct dn_skb_cb *cb = DN_SKB_CB(skb);
struct sk_buff *skb2;
@@ -413,7 +351,8 @@ static unsigned short *dn_nsp_mk_data_header(struct sock *sk, struct sk_buff *sk
return ptr;
}
-void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb, int gfp, int oth)
+void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb,
+ gfp_t gfp, int oth)
{
struct dn_scp *scp = DN_SK(sk);
struct dn_skb_cb *cb = DN_SKB_CB(skb);
@@ -479,7 +418,7 @@ int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff
xmit_count = cb2->xmit_count;
segnum = cb2->segnum;
/* Remove and drop ack'ed packet */
- skb_unlink(ack);
+ skb_unlink(ack, q);
kfree_skb(ack);
ack = NULL;
@@ -580,7 +519,7 @@ static int dn_nsp_retrans_conn_conf(struct sock *sk)
return 0;
}
-void dn_send_conn_conf(struct sock *sk, int gfp)
+void dn_send_conn_conf(struct sock *sk, gfp_t gfp)
{
struct dn_scp *scp = DN_SK(sk);
struct sk_buff *skb = NULL;
@@ -612,7 +551,8 @@ void dn_send_conn_conf(struct sock *sk, int gfp)
static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg,
- unsigned short reason, int gfp, struct dst_entry *dst,
+ unsigned short reason, gfp_t gfp,
+ struct dst_entry *dst,
int ddl, unsigned char *dd, __u16 rem, __u16 loc)
{
struct sk_buff *skb = NULL;
@@ -654,7 +594,7 @@ static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg,
void dn_nsp_send_disc(struct sock *sk, unsigned char msgflg,
- unsigned short reason, int gfp)
+ unsigned short reason, gfp_t gfp)
{
struct dn_scp *scp = DN_SK(sk);
int ddl = 0;
@@ -675,7 +615,7 @@ void dn_nsp_return_disc(struct sk_buff *skb, unsigned char msgflg,
{
struct dn_skb_cb *cb = DN_SKB_CB(skb);
int ddl = 0;
- int gfp = GFP_ATOMIC;
+ gfp_t gfp = GFP_ATOMIC;
dn_nsp_do_disc(NULL, msgflg, reason, gfp, skb->dst, ddl,
NULL, cb->src_port, cb->dst_port);
@@ -687,7 +627,7 @@ void dn_nsp_send_link(struct sock *sk, unsigned char lsflags, char fcval)
struct dn_scp *scp = DN_SK(sk);
struct sk_buff *skb;
unsigned char *ptr;
- int gfp = GFP_ATOMIC;
+ gfp_t gfp = GFP_ATOMIC;
if ((skb = dn_alloc_skb(sk, DN_MAX_NSP_DATA_HEADER + 2, gfp)) == NULL)
return;
@@ -722,7 +662,7 @@ void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg)
unsigned char menuver;
struct dn_skb_cb *cb;
unsigned char type = 1;
- int allocation = (msgflg == NSP_CI) ? sk->sk_allocation : GFP_ATOMIC;
+ gfp_t allocation = (msgflg == NSP_CI) ? sk->sk_allocation : GFP_ATOMIC;
struct sk_buff *skb = dn_alloc_skb(sk, 200, allocation);
if (!skb)
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 2399fa8a3f8..3407f190afe 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -117,8 +117,7 @@ static struct dn_rt_hash_bucket *dn_rt_hash_table;
static unsigned dn_rt_hash_mask;
static struct timer_list dn_route_timer;
-static struct timer_list dn_rt_flush_timer =
- TIMER_INITIALIZER(dn_run_flush, 0, 0);
+static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush, 0, 0);
int decnet_dst_gc_interval = 2;
static struct dst_ops dn_dst_ops = {
@@ -572,7 +571,7 @@ static int dn_route_ptp_hello(struct sk_buff *skb)
return NET_RX_SUCCESS;
}
-int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct dn_skb_cb *cb;
unsigned char flags = 0;
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 28ba5777a25..eeba56f9932 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -79,7 +79,7 @@ for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_n
static DEFINE_RWLOCK(dn_fib_tables_lock);
struct dn_fib_table *dn_fib_tables[RT_TABLE_MAX + 1];
-static kmem_cache_t *dn_hash_kmem;
+static kmem_cache_t *dn_hash_kmem __read_mostly;
static int dn_fib_hash_zombies;
static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz)
@@ -349,10 +349,10 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, int tb_id,
kfree_skb(skb);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_ROUTE;
+ NETLINK_CB(skb).dst_group = RTNLGRP_DECnet_ROUTE;
if (nlh->nlmsg_flags & NLM_F_ECHO)
atomic_inc(&skb->users);
- netlink_broadcast(rtnl, skb, pid, RTMGRP_DECnet_ROUTE, GFP_KERNEL);
+ netlink_broadcast(rtnl, skb, pid, RTNLGRP_DECnet_ROUTE, GFP_KERNEL);
if (nlh->nlmsg_flags & NLM_F_ECHO)
netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
}
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index 284a9998e53..1ab94c6e22e 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -19,6 +19,7 @@
#include <linux/netfilter.h>
#include <linux/spinlock.h>
#include <linux/netlink.h>
+#include <linux/netfilter_decnet.h>
#include <net/sock.h>
#include <net/flow.h>
@@ -71,10 +72,10 @@ static void dnrmg_send_peer(struct sk_buff *skb)
switch(flags & DN_RT_CNTL_MSK) {
case DN_RT_PKT_L1RT:
- group = DNRMG_L1_GROUP;
+ group = DNRNG_NLGRP_L1;
break;
case DN_RT_PKT_L2RT:
- group = DNRMG_L2_GROUP;
+ group = DNRNG_NLGRP_L2;
break;
default:
return;
@@ -83,7 +84,7 @@ static void dnrmg_send_peer(struct sk_buff *skb)
skb2 = dnrmg_build_message(skb, &status);
if (skb2 == NULL)
return;
- NETLINK_CB(skb2).dst_groups = group;
+ NETLINK_CB(skb2).dst_group = group;
netlink_broadcast(dnrmg, skb2, 0, group, GFP_ATOMIC);
}
@@ -138,7 +139,8 @@ static int __init init(void)
{
int rv = 0;
- dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, dnrmg_receive_user_sk);
+ dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, DNRNG_NLGRP_MAX,
+ dnrmg_receive_user_sk, THIS_MODULE);
if (dnrmg == NULL) {
printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket");
return -ENOMEM;
@@ -162,6 +164,7 @@ static void __exit fini(void)
MODULE_DESCRIPTION("DECnet Routing Message Grabulator");
MODULE_AUTHOR("Steven Whitehouse <steve@chygwyn.com>");
MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG);
module_init(init);
module_exit(fini);
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index de691e119e1..34fdac51df9 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -159,7 +159,7 @@ static int econet_recvmsg(struct kiocb *iocb, struct socket *sock,
err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
if (err)
goto out_free;
- sk->sk_stamp = skb->stamp;
+ skb_get_timestamp(skb, &sk->sk_stamp);
if (msg->msg_name)
memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
@@ -406,7 +406,7 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
unsigned long network = 0;
rcu_read_lock();
- idev = __in_dev_get(dev);
+ idev = __in_dev_get_rcu(dev);
if (idev) {
if (idev->ifa_list)
network = ntohl(idev->ifa_list->ifa_address) &
@@ -869,7 +869,7 @@ static void aun_tx_ack(unsigned long seq, int result)
foundit:
tx_result(skb->sk, eb->cookie, result);
- skb_unlink(skb);
+ skb_unlink(skb, &aun_queue);
spin_unlock_irqrestore(&aun_queue_lock, flags);
kfree_skb(skb);
}
@@ -947,7 +947,7 @@ static void ab_cleanup(unsigned long h)
{
tx_result(skb->sk, eb->cookie,
ECTYPE_TRANSMIT_NOT_PRESENT);
- skb_unlink(skb);
+ skb_unlink(skb, &aun_queue);
kfree_skb(skb);
}
skb = newskb;
@@ -1009,7 +1009,7 @@ release:
* Receive an Econet frame from a device.
*/
-static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct ec_framehdr *hdr;
struct sock *sk;
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index f6dbfb99b14..e2457736727 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -62,8 +62,6 @@
#include <asm/system.h>
#include <asm/checksum.h>
-extern int __init netdev_boot_setup(char *str);
-
__setup("ether=", netdev_boot_setup);
/*
@@ -160,17 +158,15 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
struct ethhdr *eth;
unsigned char *rawp;
- skb->mac.raw=skb->data;
+ skb->mac.raw = skb->data;
skb_pull(skb,ETH_HLEN);
eth = eth_hdr(skb);
- skb->input_dev = dev;
- if(*eth->h_dest&1)
- {
- if(memcmp(eth->h_dest,dev->broadcast, ETH_ALEN)==0)
- skb->pkt_type=PACKET_BROADCAST;
+ if (*eth->h_dest&1) {
+ if (!compare_ether_addr(eth->h_dest, dev->broadcast))
+ skb->pkt_type = PACKET_BROADCAST;
else
- skb->pkt_type=PACKET_MULTICAST;
+ skb->pkt_type = PACKET_MULTICAST;
}
/*
@@ -181,10 +177,9 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
* seems to set IFF_PROMISC.
*/
- else if(1 /*dev->flags&IFF_PROMISC*/)
- {
- if(memcmp(eth->h_dest,dev->dev_addr, ETH_ALEN))
- skb->pkt_type=PACKET_OTHERHOST;
+ else if(1 /*dev->flags&IFF_PROMISC*/) {
+ if (unlikely(compare_ether_addr(eth->h_dest, dev->dev_addr)))
+ skb->pkt_type = PACKET_OTHERHOST;
}
if (ntohs(eth->h_proto) >= 1536)
diff --git a/net/ethernet/sysctl_net_ether.c b/net/ethernet/sysctl_net_ether.c
index b81a6d53234..66b39fc342d 100644
--- a/net/ethernet/sysctl_net_ether.c
+++ b/net/ethernet/sysctl_net_ether.c
@@ -7,6 +7,7 @@
#include <linux/mm.h>
#include <linux/sysctl.h>
+#include <linux/if_ether.h>
ctl_table ether_table[] = {
{0}
diff --git a/net/ieee80211/Kconfig b/net/ieee80211/Kconfig
new file mode 100644
index 00000000000..91b16fbf91f
--- /dev/null
+++ b/net/ieee80211/Kconfig
@@ -0,0 +1,68 @@
+config IEEE80211
+ tristate "Generic IEEE 802.11 Networking Stack"
+ ---help---
+ This option enables the hardware independent IEEE 802.11
+ networking stack.
+
+config IEEE80211_DEBUG
+ bool "Enable full debugging output"
+ depends on IEEE80211
+ ---help---
+ This option will enable debug tracing output for the
+ ieee80211 network stack.
+
+ This will result in the kernel module being ~70k larger. You
+ can control which debug output is sent to the kernel log by
+ setting the value in
+
+ /proc/net/ieee80211/debug_level
+
+ For example:
+
+ % echo 0x00000FF0 > /proc/net/ieee80211/debug_level
+
+ For a list of values you can assign to debug_level, you
+ can look at the bit mask values in <net/ieee80211.h>
+
+ If you are not trying to debug or develop the ieee80211
+ subsystem, you most likely want to say N here.
+
+config IEEE80211_CRYPT_WEP
+ tristate "IEEE 802.11 WEP encryption (802.1x)"
+ depends on IEEE80211
+ select CRYPTO
+ select CRYPTO_ARC4
+ select CRC32
+ ---help---
+ Include software based cipher suites in support of IEEE
+ 802.11's WEP. This is needed for WEP as well as 802.1x.
+
+ This can be compiled as a module and it will be called
+ "ieee80211_crypt_wep".
+
+config IEEE80211_CRYPT_CCMP
+ tristate "IEEE 802.11i CCMP support"
+ depends on IEEE80211
+ select CRYPTO
+ select CRYPTO_AES
+ ---help---
+ Include software based cipher suites in support of IEEE 802.11i
+ (aka TGi, WPA, WPA2, WPA-PSK, etc.) for use with CCMP enabled
+ networks.
+
+ This can be compiled as a module and it will be called
+ "ieee80211_crypt_ccmp".
+
+config IEEE80211_CRYPT_TKIP
+ tristate "IEEE 802.11i TKIP encryption"
+ depends on IEEE80211
+ select CRYPTO
+ select CRYPTO_MICHAEL_MIC
+ ---help---
+ Include software based cipher suites in support of IEEE 802.11i
+ (aka TGi, WPA, WPA2, WPA-PSK, etc.) for use with TKIP enabled
+ networks.
+
+ This can be compiled as a module and it will be called
+ "ieee80211_crypt_tkip".
+
diff --git a/net/ieee80211/Makefile b/net/ieee80211/Makefile
new file mode 100644
index 00000000000..f988417121d
--- /dev/null
+++ b/net/ieee80211/Makefile
@@ -0,0 +1,12 @@
+obj-$(CONFIG_IEEE80211) += ieee80211.o
+obj-$(CONFIG_IEEE80211) += ieee80211_crypt.o
+obj-$(CONFIG_IEEE80211_CRYPT_WEP) += ieee80211_crypt_wep.o
+obj-$(CONFIG_IEEE80211_CRYPT_CCMP) += ieee80211_crypt_ccmp.o
+obj-$(CONFIG_IEEE80211_CRYPT_TKIP) += ieee80211_crypt_tkip.o
+ieee80211-objs := \
+ ieee80211_module.o \
+ ieee80211_tx.o \
+ ieee80211_rx.o \
+ ieee80211_wx.o \
+ ieee80211_geo.o
+
diff --git a/net/ieee80211/ieee80211_crypt.c b/net/ieee80211/ieee80211_crypt.c
new file mode 100644
index 00000000000..f3b6aa3be63
--- /dev/null
+++ b/net/ieee80211/ieee80211_crypt.c
@@ -0,0 +1,279 @@
+/*
+ * Host AP crypto routines
+ *
+ * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Portions Copyright (C) 2004, Intel Corporation <jketreno@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/string.h>
+#include <asm/errno.h>
+
+#include <net/ieee80211.h>
+
+MODULE_AUTHOR("Jouni Malinen");
+MODULE_DESCRIPTION("HostAP crypto");
+MODULE_LICENSE("GPL");
+
+struct ieee80211_crypto_alg {
+ struct list_head list;
+ struct ieee80211_crypto_ops *ops;
+};
+
+struct ieee80211_crypto {
+ struct list_head algs;
+ spinlock_t lock;
+};
+
+static struct ieee80211_crypto *hcrypt;
+
+void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force)
+{
+ struct list_head *ptr, *n;
+ struct ieee80211_crypt_data *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ieee->lock, flags);
+
+ if (list_empty(&ieee->crypt_deinit_list))
+ goto unlock;
+
+ for (ptr = ieee->crypt_deinit_list.next, n = ptr->next;
+ ptr != &ieee->crypt_deinit_list; ptr = n, n = ptr->next) {
+ entry = list_entry(ptr, struct ieee80211_crypt_data, list);
+
+ if (atomic_read(&entry->refcnt) != 0 && !force)
+ continue;
+
+ list_del(ptr);
+
+ if (entry->ops) {
+ entry->ops->deinit(entry->priv);
+ module_put(entry->ops->owner);
+ }
+ kfree(entry);
+ }
+ unlock:
+ spin_unlock_irqrestore(&ieee->lock, flags);
+}
+
+/* After this, crypt_deinit_list won't accept new members */
+void ieee80211_crypt_quiescing(struct ieee80211_device *ieee)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ieee->lock, flags);
+ ieee->crypt_quiesced = 1;
+ spin_unlock_irqrestore(&ieee->lock, flags);
+}
+
+void ieee80211_crypt_deinit_handler(unsigned long data)
+{
+ struct ieee80211_device *ieee = (struct ieee80211_device *)data;
+ unsigned long flags;
+
+ ieee80211_crypt_deinit_entries(ieee, 0);
+
+ spin_lock_irqsave(&ieee->lock, flags);
+ if (!list_empty(&ieee->crypt_deinit_list) && !ieee->crypt_quiesced) {
+ printk(KERN_DEBUG "%s: entries remaining in delayed crypt "
+ "deletion list\n", ieee->dev->name);
+ ieee->crypt_deinit_timer.expires = jiffies + HZ;
+ add_timer(&ieee->crypt_deinit_timer);
+ }
+ spin_unlock_irqrestore(&ieee->lock, flags);
+}
+
+void ieee80211_crypt_delayed_deinit(struct ieee80211_device *ieee,
+ struct ieee80211_crypt_data **crypt)
+{
+ struct ieee80211_crypt_data *tmp;
+ unsigned long flags;
+
+ if (*crypt == NULL)
+ return;
+
+ tmp = *crypt;
+ *crypt = NULL;
+
+ /* must not run ops->deinit() while there may be pending encrypt or
+ * decrypt operations. Use a list of delayed deinits to avoid needing
+ * locking. */
+
+ spin_lock_irqsave(&ieee->lock, flags);
+ if (!ieee->crypt_quiesced) {
+ list_add(&tmp->list, &ieee->crypt_deinit_list);
+ if (!timer_pending(&ieee->crypt_deinit_timer)) {
+ ieee->crypt_deinit_timer.expires = jiffies + HZ;
+ add_timer(&ieee->crypt_deinit_timer);
+ }
+ }
+ spin_unlock_irqrestore(&ieee->lock, flags);
+}
+
+int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops)
+{
+ unsigned long flags;
+ struct ieee80211_crypto_alg *alg;
+
+ if (hcrypt == NULL)
+ return -1;
+
+ alg = kmalloc(sizeof(*alg), GFP_KERNEL);
+ if (alg == NULL)
+ return -ENOMEM;
+
+ memset(alg, 0, sizeof(*alg));
+ alg->ops = ops;
+
+ spin_lock_irqsave(&hcrypt->lock, flags);
+ list_add(&alg->list, &hcrypt->algs);
+ spin_unlock_irqrestore(&hcrypt->lock, flags);
+
+ printk(KERN_DEBUG "ieee80211_crypt: registered algorithm '%s'\n",
+ ops->name);
+
+ return 0;
+}
+
+int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops)
+{
+ unsigned long flags;
+ struct list_head *ptr;
+ struct ieee80211_crypto_alg *del_alg = NULL;
+
+ if (hcrypt == NULL)
+ return -1;
+
+ spin_lock_irqsave(&hcrypt->lock, flags);
+ for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) {
+ struct ieee80211_crypto_alg *alg =
+ (struct ieee80211_crypto_alg *)ptr;
+ if (alg->ops == ops) {
+ list_del(&alg->list);
+ del_alg = alg;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&hcrypt->lock, flags);
+
+ if (del_alg) {
+ printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm "
+ "'%s'\n", ops->name);
+ kfree(del_alg);
+ }
+
+ return del_alg ? 0 : -1;
+}
+
+struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name)
+{
+ unsigned long flags;
+ struct list_head *ptr;
+ struct ieee80211_crypto_alg *found_alg = NULL;
+
+ if (hcrypt == NULL)
+ return NULL;
+
+ spin_lock_irqsave(&hcrypt->lock, flags);
+ for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) {
+ struct ieee80211_crypto_alg *alg =
+ (struct ieee80211_crypto_alg *)ptr;
+ if (strcmp(alg->ops->name, name) == 0) {
+ found_alg = alg;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&hcrypt->lock, flags);
+
+ if (found_alg)
+ return found_alg->ops;
+ else
+ return NULL;
+}
+
+static void *ieee80211_crypt_null_init(int keyidx)
+{
+ return (void *)1;
+}
+static void ieee80211_crypt_null_deinit(void *priv)
+{
+}
+
+static struct ieee80211_crypto_ops ieee80211_crypt_null = {
+ .name = "NULL",
+ .init = ieee80211_crypt_null_init,
+ .deinit = ieee80211_crypt_null_deinit,
+ .encrypt_mpdu = NULL,
+ .decrypt_mpdu = NULL,
+ .encrypt_msdu = NULL,
+ .decrypt_msdu = NULL,
+ .set_key = NULL,
+ .get_key = NULL,
+ .extra_mpdu_prefix_len = 0,
+ .extra_mpdu_postfix_len = 0,
+ .owner = THIS_MODULE,
+};
+
+static int __init ieee80211_crypto_init(void)
+{
+ int ret = -ENOMEM;
+
+ hcrypt = kmalloc(sizeof(*hcrypt), GFP_KERNEL);
+ if (!hcrypt)
+ goto out;
+
+ memset(hcrypt, 0, sizeof(*hcrypt));
+ INIT_LIST_HEAD(&hcrypt->algs);
+ spin_lock_init(&hcrypt->lock);
+
+ ret = ieee80211_register_crypto_ops(&ieee80211_crypt_null);
+ if (ret < 0) {
+ kfree(hcrypt);
+ hcrypt = NULL;
+ }
+ out:
+ return ret;
+}
+
+static void __exit ieee80211_crypto_deinit(void)
+{
+ struct list_head *ptr, *n;
+
+ if (hcrypt == NULL)
+ return;
+
+ for (ptr = hcrypt->algs.next, n = ptr->next; ptr != &hcrypt->algs;
+ ptr = n, n = ptr->next) {
+ struct ieee80211_crypto_alg *alg =
+ (struct ieee80211_crypto_alg *)ptr;
+ list_del(ptr);
+ printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm "
+ "'%s' (deinit)\n", alg->ops->name);
+ kfree(alg);
+ }
+
+ kfree(hcrypt);
+}
+
+EXPORT_SYMBOL(ieee80211_crypt_deinit_entries);
+EXPORT_SYMBOL(ieee80211_crypt_deinit_handler);
+EXPORT_SYMBOL(ieee80211_crypt_delayed_deinit);
+EXPORT_SYMBOL(ieee80211_crypt_quiescing);
+
+EXPORT_SYMBOL(ieee80211_register_crypto_ops);
+EXPORT_SYMBOL(ieee80211_unregister_crypto_ops);
+EXPORT_SYMBOL(ieee80211_get_crypto_ops);
+
+module_init(ieee80211_crypto_init);
+module_exit(ieee80211_crypto_deinit);
diff --git a/net/ieee80211/ieee80211_crypt_ccmp.c b/net/ieee80211/ieee80211_crypt_ccmp.c
new file mode 100644
index 00000000000..05a853c1301
--- /dev/null
+++ b/net/ieee80211/ieee80211_crypt_ccmp.c
@@ -0,0 +1,472 @@
+/*
+ * Host AP crypt: host-based CCMP encryption implementation for Host AP driver
+ *
+ * Copyright (c) 2003-2004, Jouni Malinen <jkmaline@cc.hut.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ */
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <asm/string.h>
+#include <linux/wireless.h>
+
+#include <net/ieee80211.h>
+
+#include <linux/crypto.h>
+#include <asm/scatterlist.h>
+
+MODULE_AUTHOR("Jouni Malinen");
+MODULE_DESCRIPTION("Host AP crypt: CCMP");
+MODULE_LICENSE("GPL");
+
+#define AES_BLOCK_LEN 16
+#define CCMP_HDR_LEN 8
+#define CCMP_MIC_LEN 8
+#define CCMP_TK_LEN 16
+#define CCMP_PN_LEN 6
+
+struct ieee80211_ccmp_data {
+ u8 key[CCMP_TK_LEN];
+ int key_set;
+
+ u8 tx_pn[CCMP_PN_LEN];
+ u8 rx_pn[CCMP_PN_LEN];
+
+ u32 dot11RSNAStatsCCMPFormatErrors;
+ u32 dot11RSNAStatsCCMPReplays;
+ u32 dot11RSNAStatsCCMPDecryptErrors;
+
+ int key_idx;
+
+ struct crypto_tfm *tfm;
+
+ /* scratch buffers for virt_to_page() (crypto API) */
+ u8 tx_b0[AES_BLOCK_LEN], tx_b[AES_BLOCK_LEN],
+ tx_e[AES_BLOCK_LEN], tx_s0[AES_BLOCK_LEN];
+ u8 rx_b0[AES_BLOCK_LEN], rx_b[AES_BLOCK_LEN], rx_a[AES_BLOCK_LEN];
+};
+
+static void ieee80211_ccmp_aes_encrypt(struct crypto_tfm *tfm,
+ const u8 pt[16], u8 ct[16])
+{
+ struct scatterlist src, dst;
+
+ src.page = virt_to_page(pt);
+ src.offset = offset_in_page(pt);
+ src.length = AES_BLOCK_LEN;
+
+ dst.page = virt_to_page(ct);
+ dst.offset = offset_in_page(ct);
+ dst.length = AES_BLOCK_LEN;
+
+ crypto_cipher_encrypt(tfm, &dst, &src, AES_BLOCK_LEN);
+}
+
+static void *ieee80211_ccmp_init(int key_idx)
+{
+ struct ieee80211_ccmp_data *priv;
+
+ priv = kmalloc(sizeof(*priv), GFP_ATOMIC);
+ if (priv == NULL)
+ goto fail;
+ memset(priv, 0, sizeof(*priv));
+ priv->key_idx = key_idx;
+
+ priv->tfm = crypto_alloc_tfm("aes", 0);
+ if (priv->tfm == NULL) {
+ printk(KERN_DEBUG "ieee80211_crypt_ccmp: could not allocate "
+ "crypto API aes\n");
+ goto fail;
+ }
+
+ return priv;
+
+ fail:
+ if (priv) {
+ if (priv->tfm)
+ crypto_free_tfm(priv->tfm);
+ kfree(priv);
+ }
+
+ return NULL;
+}
+
+static void ieee80211_ccmp_deinit(void *priv)
+{
+ struct ieee80211_ccmp_data *_priv = priv;
+ if (_priv && _priv->tfm)
+ crypto_free_tfm(_priv->tfm);
+ kfree(priv);
+}
+
+static inline void xor_block(u8 * b, u8 * a, size_t len)
+{
+ int i;
+ for (i = 0; i < len; i++)
+ b[i] ^= a[i];
+}
+
+static void ccmp_init_blocks(struct crypto_tfm *tfm,
+ struct ieee80211_hdr_4addr *hdr,
+ u8 * pn, size_t dlen, u8 * b0, u8 * auth, u8 * s0)
+{
+ u8 *pos, qc = 0;
+ size_t aad_len;
+ u16 fc;
+ int a4_included, qc_included;
+ u8 aad[2 * AES_BLOCK_LEN];
+
+ fc = le16_to_cpu(hdr->frame_ctl);
+ a4_included = ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
+ (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS));
+ qc_included = ((WLAN_FC_GET_TYPE(fc) == IEEE80211_FTYPE_DATA) &&
+ (WLAN_FC_GET_STYPE(fc) & 0x08));
+ aad_len = 22;
+ if (a4_included)
+ aad_len += 6;
+ if (qc_included) {
+ pos = (u8 *) & hdr->addr4;
+ if (a4_included)
+ pos += 6;
+ qc = *pos & 0x0f;
+ aad_len += 2;
+ }
+
+ /* CCM Initial Block:
+ * Flag (Include authentication header, M=3 (8-octet MIC),
+ * L=1 (2-octet Dlen))
+ * Nonce: 0x00 | A2 | PN
+ * Dlen */
+ b0[0] = 0x59;
+ b0[1] = qc;
+ memcpy(b0 + 2, hdr->addr2, ETH_ALEN);
+ memcpy(b0 + 8, pn, CCMP_PN_LEN);
+ b0[14] = (dlen >> 8) & 0xff;
+ b0[15] = dlen & 0xff;
+
+ /* AAD:
+ * FC with bits 4..6 and 11..13 masked to zero; 14 is always one
+ * A1 | A2 | A3
+ * SC with bits 4..15 (seq#) masked to zero
+ * A4 (if present)
+ * QC (if present)
+ */
+ pos = (u8 *) hdr;
+ aad[0] = 0; /* aad_len >> 8 */
+ aad[1] = aad_len & 0xff;
+ aad[2] = pos[0] & 0x8f;
+ aad[3] = pos[1] & 0xc7;
+ memcpy(aad + 4, hdr->addr1, 3 * ETH_ALEN);
+ pos = (u8 *) & hdr->seq_ctl;
+ aad[22] = pos[0] & 0x0f;
+ aad[23] = 0; /* all bits masked */
+ memset(aad + 24, 0, 8);
+ if (a4_included)
+ memcpy(aad + 24, hdr->addr4, ETH_ALEN);
+ if (qc_included) {
+ aad[a4_included ? 30 : 24] = qc;
+ /* rest of QC masked */
+ }
+
+ /* Start with the first block and AAD */
+ ieee80211_ccmp_aes_encrypt(tfm, b0, auth);
+ xor_block(auth, aad, AES_BLOCK_LEN);
+ ieee80211_ccmp_aes_encrypt(tfm, auth, auth);
+ xor_block(auth, &aad[AES_BLOCK_LEN], AES_BLOCK_LEN);
+ ieee80211_ccmp_aes_encrypt(tfm, auth, auth);
+ b0[0] &= 0x07;
+ b0[14] = b0[15] = 0;
+ ieee80211_ccmp_aes_encrypt(tfm, b0, s0);
+}
+
+static int ieee80211_ccmp_hdr(struct sk_buff *skb, int hdr_len, void *priv)
+{
+ struct ieee80211_ccmp_data *key = priv;
+ int i;
+ u8 *pos;
+
+ if (skb_headroom(skb) < CCMP_HDR_LEN || skb->len < hdr_len)
+ return -1;
+
+ pos = skb_push(skb, CCMP_HDR_LEN);
+ memmove(pos, pos + CCMP_HDR_LEN, hdr_len);
+ pos += hdr_len;
+
+ i = CCMP_PN_LEN - 1;
+ while (i >= 0) {
+ key->tx_pn[i]++;
+ if (key->tx_pn[i] != 0)
+ break;
+ i--;
+ }
+
+ *pos++ = key->tx_pn[5];
+ *pos++ = key->tx_pn[4];
+ *pos++ = 0;
+ *pos++ = (key->key_idx << 6) | (1 << 5) /* Ext IV included */ ;
+ *pos++ = key->tx_pn[3];
+ *pos++ = key->tx_pn[2];
+ *pos++ = key->tx_pn[1];
+ *pos++ = key->tx_pn[0];
+
+ return CCMP_HDR_LEN;
+}
+
+static int ieee80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+ struct ieee80211_ccmp_data *key = priv;
+ int data_len, i, blocks, last, len;
+ u8 *pos, *mic;
+ struct ieee80211_hdr_4addr *hdr;
+ u8 *b0 = key->tx_b0;
+ u8 *b = key->tx_b;
+ u8 *e = key->tx_e;
+ u8 *s0 = key->tx_s0;
+
+ if (skb_tailroom(skb) < CCMP_MIC_LEN || skb->len < hdr_len)
+ return -1;
+
+ data_len = skb->len - hdr_len;
+ len = ieee80211_ccmp_hdr(skb, hdr_len, priv);
+ if (len < 0)
+ return -1;
+
+ pos = skb->data + hdr_len + CCMP_HDR_LEN;
+ mic = skb_put(skb, CCMP_MIC_LEN);
+ hdr = (struct ieee80211_hdr_4addr *)skb->data;
+ ccmp_init_blocks(key->tfm, hdr, key->tx_pn, data_len, b0, b, s0);
+
+ blocks = (data_len + AES_BLOCK_LEN - 1) / AES_BLOCK_LEN;
+ last = data_len % AES_BLOCK_LEN;
+
+ for (i = 1; i <= blocks; i++) {
+ len = (i == blocks && last) ? last : AES_BLOCK_LEN;
+ /* Authentication */
+ xor_block(b, pos, len);
+ ieee80211_ccmp_aes_encrypt(key->tfm, b, b);
+ /* Encryption, with counter */
+ b0[14] = (i >> 8) & 0xff;
+ b0[15] = i & 0xff;
+ ieee80211_ccmp_aes_encrypt(key->tfm, b0, e);
+ xor_block(pos, e, len);
+ pos += len;
+ }
+
+ for (i = 0; i < CCMP_MIC_LEN; i++)
+ mic[i] = b[i] ^ s0[i];
+
+ return 0;
+}
+
+static int ieee80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+ struct ieee80211_ccmp_data *key = priv;
+ u8 keyidx, *pos;
+ struct ieee80211_hdr_4addr *hdr;
+ u8 *b0 = key->rx_b0;
+ u8 *b = key->rx_b;
+ u8 *a = key->rx_a;
+ u8 pn[6];
+ int i, blocks, last, len;
+ size_t data_len = skb->len - hdr_len - CCMP_HDR_LEN - CCMP_MIC_LEN;
+ u8 *mic = skb->data + skb->len - CCMP_MIC_LEN;
+
+ if (skb->len < hdr_len + CCMP_HDR_LEN + CCMP_MIC_LEN) {
+ key->dot11RSNAStatsCCMPFormatErrors++;
+ return -1;
+ }
+
+ hdr = (struct ieee80211_hdr_4addr *)skb->data;
+ pos = skb->data + hdr_len;
+ keyidx = pos[3];
+ if (!(keyidx & (1 << 5))) {
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "CCMP: received packet without ExtIV"
+ " flag from " MAC_FMT "\n", MAC_ARG(hdr->addr2));
+ }
+ key->dot11RSNAStatsCCMPFormatErrors++;
+ return -2;
+ }
+ keyidx >>= 6;
+ if (key->key_idx != keyidx) {
+ printk(KERN_DEBUG "CCMP: RX tkey->key_idx=%d frame "
+ "keyidx=%d priv=%p\n", key->key_idx, keyidx, priv);
+ return -6;
+ }
+ if (!key->key_set) {
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "CCMP: received packet from " MAC_FMT
+ " with keyid=%d that does not have a configured"
+ " key\n", MAC_ARG(hdr->addr2), keyidx);
+ }
+ return -3;
+ }
+
+ pn[0] = pos[7];
+ pn[1] = pos[6];
+ pn[2] = pos[5];
+ pn[3] = pos[4];
+ pn[4] = pos[1];
+ pn[5] = pos[0];
+ pos += 8;
+
+ if (memcmp(pn, key->rx_pn, CCMP_PN_LEN) <= 0) {
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "CCMP: replay detected: STA=" MAC_FMT
+ " previous PN %02x%02x%02x%02x%02x%02x "
+ "received PN %02x%02x%02x%02x%02x%02x\n",
+ MAC_ARG(hdr->addr2), MAC_ARG(key->rx_pn),
+ MAC_ARG(pn));
+ }
+ key->dot11RSNAStatsCCMPReplays++;
+ return -4;
+ }
+
+ ccmp_init_blocks(key->tfm, hdr, pn, data_len, b0, a, b);
+ xor_block(mic, b, CCMP_MIC_LEN);
+
+ blocks = (data_len + AES_BLOCK_LEN - 1) / AES_BLOCK_LEN;
+ last = data_len % AES_BLOCK_LEN;
+
+ for (i = 1; i <= blocks; i++) {
+ len = (i == blocks && last) ? last : AES_BLOCK_LEN;
+ /* Decrypt, with counter */
+ b0[14] = (i >> 8) & 0xff;
+ b0[15] = i & 0xff;
+ ieee80211_ccmp_aes_encrypt(key->tfm, b0, b);
+ xor_block(pos, b, len);
+ /* Authentication */
+ xor_block(a, pos, len);
+ ieee80211_ccmp_aes_encrypt(key->tfm, a, a);
+ pos += len;
+ }
+
+ if (memcmp(mic, a, CCMP_MIC_LEN) != 0) {
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "CCMP: decrypt failed: STA="
+ MAC_FMT "\n", MAC_ARG(hdr->addr2));
+ }
+ key->dot11RSNAStatsCCMPDecryptErrors++;
+ return -5;
+ }
+
+ memcpy(key->rx_pn, pn, CCMP_PN_LEN);
+
+ /* Remove hdr and MIC */
+ memmove(skb->data + CCMP_HDR_LEN, skb->data, hdr_len);
+ skb_pull(skb, CCMP_HDR_LEN);
+ skb_trim(skb, skb->len - CCMP_MIC_LEN);
+
+ return keyidx;
+}
+
+static int ieee80211_ccmp_set_key(void *key, int len, u8 * seq, void *priv)
+{
+ struct ieee80211_ccmp_data *data = priv;
+ int keyidx;
+ struct crypto_tfm *tfm = data->tfm;
+
+ keyidx = data->key_idx;
+ memset(data, 0, sizeof(*data));
+ data->key_idx = keyidx;
+ data->tfm = tfm;
+ if (len == CCMP_TK_LEN) {
+ memcpy(data->key, key, CCMP_TK_LEN);
+ data->key_set = 1;
+ if (seq) {
+ data->rx_pn[0] = seq[5];
+ data->rx_pn[1] = seq[4];
+ data->rx_pn[2] = seq[3];
+ data->rx_pn[3] = seq[2];
+ data->rx_pn[4] = seq[1];
+ data->rx_pn[5] = seq[0];
+ }
+ crypto_cipher_setkey(data->tfm, data->key, CCMP_TK_LEN);
+ } else if (len == 0)
+ data->key_set = 0;
+ else
+ return -1;
+
+ return 0;
+}
+
+static int ieee80211_ccmp_get_key(void *key, int len, u8 * seq, void *priv)
+{
+ struct ieee80211_ccmp_data *data = priv;
+
+ if (len < CCMP_TK_LEN)
+ return -1;
+
+ if (!data->key_set)
+ return 0;
+ memcpy(key, data->key, CCMP_TK_LEN);
+
+ if (seq) {
+ seq[0] = data->tx_pn[5];
+ seq[1] = data->tx_pn[4];
+ seq[2] = data->tx_pn[3];
+ seq[3] = data->tx_pn[2];
+ seq[4] = data->tx_pn[1];
+ seq[5] = data->tx_pn[0];
+ }
+
+ return CCMP_TK_LEN;
+}
+
+static char *ieee80211_ccmp_print_stats(char *p, void *priv)
+{
+ struct ieee80211_ccmp_data *ccmp = priv;
+ p += sprintf(p, "key[%d] alg=CCMP key_set=%d "
+ "tx_pn=%02x%02x%02x%02x%02x%02x "
+ "rx_pn=%02x%02x%02x%02x%02x%02x "
+ "format_errors=%d replays=%d decrypt_errors=%d\n",
+ ccmp->key_idx, ccmp->key_set,
+ MAC_ARG(ccmp->tx_pn), MAC_ARG(ccmp->rx_pn),
+ ccmp->dot11RSNAStatsCCMPFormatErrors,
+ ccmp->dot11RSNAStatsCCMPReplays,
+ ccmp->dot11RSNAStatsCCMPDecryptErrors);
+
+ return p;
+}
+
+static struct ieee80211_crypto_ops ieee80211_crypt_ccmp = {
+ .name = "CCMP",
+ .init = ieee80211_ccmp_init,
+ .deinit = ieee80211_ccmp_deinit,
+ .build_iv = ieee80211_ccmp_hdr,
+ .encrypt_mpdu = ieee80211_ccmp_encrypt,
+ .decrypt_mpdu = ieee80211_ccmp_decrypt,
+ .encrypt_msdu = NULL,
+ .decrypt_msdu = NULL,
+ .set_key = ieee80211_ccmp_set_key,
+ .get_key = ieee80211_ccmp_get_key,
+ .print_stats = ieee80211_ccmp_print_stats,
+ .extra_mpdu_prefix_len = CCMP_HDR_LEN,
+ .extra_mpdu_postfix_len = CCMP_MIC_LEN,
+ .owner = THIS_MODULE,
+};
+
+static int __init ieee80211_crypto_ccmp_init(void)
+{
+ return ieee80211_register_crypto_ops(&ieee80211_crypt_ccmp);
+}
+
+static void __exit ieee80211_crypto_ccmp_exit(void)
+{
+ ieee80211_unregister_crypto_ops(&ieee80211_crypt_ccmp);
+}
+
+module_init(ieee80211_crypto_ccmp_init);
+module_exit(ieee80211_crypto_ccmp_exit);
diff --git a/net/ieee80211/ieee80211_crypt_tkip.c b/net/ieee80211/ieee80211_crypt_tkip.c
new file mode 100644
index 00000000000..2e34f29b795
--- /dev/null
+++ b/net/ieee80211/ieee80211_crypt_tkip.c
@@ -0,0 +1,725 @@
+/*
+ * Host AP crypt: host-based TKIP encryption implementation for Host AP driver
+ *
+ * Copyright (c) 2003-2004, Jouni Malinen <jkmaline@cc.hut.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ */
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <asm/string.h>
+
+#include <net/ieee80211.h>
+
+#include <linux/crypto.h>
+#include <asm/scatterlist.h>
+#include <linux/crc32.h>
+
+/* Module metadata for the TKIP crypto module. */
+MODULE_AUTHOR("Jouni Malinen");
+MODULE_DESCRIPTION("Host AP crypt: TKIP");
+MODULE_LICENSE("GPL");
+
+/* Per-key TKIP state; allocated by ieee80211_tkip_init(). */
+struct ieee80211_tkip_data {
+#define TKIP_KEY_LEN 32
+ /* key[0..15] feeds the phase 1/2 key mixing; &key[16] is the Michael
+  * key used by ieee80211_michael_mic_add() and &key[24] the one used
+  * by ieee80211_michael_mic_verify(). */
+ u8 key[TKIP_KEY_LEN];
+ int key_set; /* set to 1 by set_key when a TKIP_KEY_LEN key is installed */
+
+ /* TX sequence counter: tx_iv16 is incremented after every encrypted
+  * frame, tx_iv32 whenever tx_iv16 wraps to 0. */
+ u32 tx_iv32;
+ u16 tx_iv16;
+ u16 tx_ttak[5]; /* cached phase 1 output for the current tx_iv32 */
+ int tx_phase1_done; /* non-zero while tx_ttak is valid */
+
+ /* Last accepted RX counter values, used for the replay check in
+  * ieee80211_tkip_decrypt(). */
+ u32 rx_iv32;
+ u16 rx_iv16;
+ u16 rx_ttak[5]; /* cached phase 1 output for RX */
+ int rx_phase1_done; /* non-zero while rx_ttak is valid */
+ /* Counter values of the most recently decrypted frame; copied into
+  * rx_iv32/rx_iv16 by ieee80211_michael_mic_verify() on success. */
+ u32 rx_iv32_new;
+ u16 rx_iv16_new;
+
+ /* Error counters reported by ieee80211_tkip_print_stats() */
+ u32 dot11RSNAStatsTKIPReplays;
+ u32 dot11RSNAStatsTKIPICVErrors;
+ u32 dot11RSNAStatsTKIPLocalMICFailures;
+
+ int key_idx; /* compared against the key index bits of received frames */
+
+ /* crypto API transforms allocated in ieee80211_tkip_init() */
+ struct crypto_tfm *tfm_arc4;
+ struct crypto_tfm *tfm_michael;
+
+ /* scratch buffers for virt_to_page() (crypto API) */
+ u8 rx_hdr[16], tx_hdr[16];
+
+ /* read/written through the get_flags/set_flags ops; tested against
+  * IEEE80211_CRYPTO_TKIP_COUNTERMEASURES on the data paths */
+ unsigned long flags;
+};
+
+/* Replace the per-key flag word with @flags and return the previous value. */
+static unsigned long ieee80211_tkip_set_flags(unsigned long flags, void *priv)
+{
+	struct ieee80211_tkip_data *tkey = priv;
+	unsigned long previous;
+
+	previous = tkey->flags;
+	tkey->flags = flags;
+
+	return previous;
+}
+
+/* Return the current per-key flag word. */
+static unsigned long ieee80211_tkip_get_flags(void *priv)
+{
+	return ((struct ieee80211_tkip_data *)priv)->flags;
+}
+
+/*
+ * Allocate and zero a TKIP state block for key index @key_idx and
+ * allocate the "arc4" and "michael_mic" crypto API transforms it uses.
+ *
+ * Returns the new state block, or NULL if any allocation fails (anything
+ * allocated up to that point is released again).
+ */
+static void *ieee80211_tkip_init(int key_idx)
+{
+	struct ieee80211_tkip_data *tkey;
+
+	tkey = kmalloc(sizeof(*tkey), GFP_ATOMIC);
+	if (!tkey)
+		return NULL;
+	memset(tkey, 0, sizeof(*tkey));
+
+	tkey->key_idx = key_idx;
+
+	tkey->tfm_arc4 = crypto_alloc_tfm("arc4", 0);
+	if (!tkey->tfm_arc4) {
+		printk(KERN_DEBUG "ieee80211_crypt_tkip: could not allocate "
+		       "crypto API arc4\n");
+		goto err_free_state;
+	}
+
+	tkey->tfm_michael = crypto_alloc_tfm("michael_mic", 0);
+	if (!tkey->tfm_michael) {
+		printk(KERN_DEBUG "ieee80211_crypt_tkip: could not allocate "
+		       "crypto API michael_mic\n");
+		goto err_free_arc4;
+	}
+
+	return tkey;
+
+      err_free_arc4:
+	crypto_free_tfm(tkey->tfm_arc4);
+      err_free_state:
+	kfree(tkey);
+	return NULL;
+}
+
+/* Release both crypto transforms (when present) and the state block itself. */
+static void ieee80211_tkip_deinit(void *priv)
+{
+	struct ieee80211_tkip_data *tkey = priv;
+
+	if (tkey != NULL) {
+		if (tkey->tfm_michael)
+			crypto_free_tfm(tkey->tfm_michael);
+		if (tkey->tfm_arc4)
+			crypto_free_tfm(tkey->tfm_arc4);
+	}
+	kfree(priv);
+}
+
+/* Rotate a 16-bit value right by one bit. */
+static inline u16 RotR1(u16 val)
+{
+	return (u16) ((val << 15) | (val >> 1));
+}
+
+/* Low byte of a 16-bit value. */
+static inline u8 Lo8(u16 val)
+{
+	return (u8) val;
+}
+
+/* High byte of a 16-bit value. */
+static inline u8 Hi8(u16 val)
+{
+	return (u8) (val >> 8);
+}
+
+/* Low 16 bits of a 32-bit value. */
+static inline u16 Lo16(u32 val)
+{
+	return (u16) val;
+}
+
+/* High 16 bits of a 32-bit value. */
+static inline u16 Hi16(u32 val)
+{
+	return (u16) (val >> 16);
+}
+
+/* Combine two bytes into a 16-bit value: @hi in bits 15..8, @lo in bits 7..0. */
+static inline u16 Mk16(u8 hi, u8 lo)
+{
+	return (u16) ((hi << 8) | lo);
+}
+
+/* Load the 16-bit value at @v and convert it with le16_to_cpu(). */
+static inline u16 Mk16_le(u16 * v)
+{
+	u16 raw = *v;
+
+	return le16_to_cpu(raw);
+}
+
+/*
+ * 256-entry table of 16-bit substitution values. _S_() below indexes it
+ * once with the low byte and once with the high byte of its argument.
+ */
+static const u16 Sbox[256] = {
+ 0xC6A5, 0xF884, 0xEE99, 0xF68D, 0xFF0D, 0xD6BD, 0xDEB1, 0x9154,
+ 0x6050, 0x0203, 0xCEA9, 0x567D, 0xE719, 0xB562, 0x4DE6, 0xEC9A,
+ 0x8F45, 0x1F9D, 0x8940, 0xFA87, 0xEF15, 0xB2EB, 0x8EC9, 0xFB0B,
+ 0x41EC, 0xB367, 0x5FFD, 0x45EA, 0x23BF, 0x53F7, 0xE496, 0x9B5B,
+ 0x75C2, 0xE11C, 0x3DAE, 0x4C6A, 0x6C5A, 0x7E41, 0xF502, 0x834F,
+ 0x685C, 0x51F4, 0xD134, 0xF908, 0xE293, 0xAB73, 0x6253, 0x2A3F,
+ 0x080C, 0x9552, 0x4665, 0x9D5E, 0x3028, 0x37A1, 0x0A0F, 0x2FB5,
+ 0x0E09, 0x2436, 0x1B9B, 0xDF3D, 0xCD26, 0x4E69, 0x7FCD, 0xEA9F,
+ 0x121B, 0x1D9E, 0x5874, 0x342E, 0x362D, 0xDCB2, 0xB4EE, 0x5BFB,
+ 0xA4F6, 0x764D, 0xB761, 0x7DCE, 0x527B, 0xDD3E, 0x5E71, 0x1397,
+ 0xA6F5, 0xB968, 0x0000, 0xC12C, 0x4060, 0xE31F, 0x79C8, 0xB6ED,
+ 0xD4BE, 0x8D46, 0x67D9, 0x724B, 0x94DE, 0x98D4, 0xB0E8, 0x854A,
+ 0xBB6B, 0xC52A, 0x4FE5, 0xED16, 0x86C5, 0x9AD7, 0x6655, 0x1194,
+ 0x8ACF, 0xE910, 0x0406, 0xFE81, 0xA0F0, 0x7844, 0x25BA, 0x4BE3,
+ 0xA2F3, 0x5DFE, 0x80C0, 0x058A, 0x3FAD, 0x21BC, 0x7048, 0xF104,
+ 0x63DF, 0x77C1, 0xAF75, 0x4263, 0x2030, 0xE51A, 0xFD0E, 0xBF6D,
+ 0x814C, 0x1814, 0x2635, 0xC32F, 0xBEE1, 0x35A2, 0x88CC, 0x2E39,
+ 0x9357, 0x55F2, 0xFC82, 0x7A47, 0xC8AC, 0xBAE7, 0x322B, 0xE695,
+ 0xC0A0, 0x1998, 0x9ED1, 0xA37F, 0x4466, 0x547E, 0x3BAB, 0x0B83,
+ 0x8CCA, 0xC729, 0x6BD3, 0x283C, 0xA779, 0xBCE2, 0x161D, 0xAD76,
+ 0xDB3B, 0x6456, 0x744E, 0x141E, 0x92DB, 0x0C0A, 0x486C, 0xB8E4,
+ 0x9F5D, 0xBD6E, 0x43EF, 0xC4A6, 0x39A8, 0x31A4, 0xD337, 0xF28B,
+ 0xD532, 0x8B43, 0x6E59, 0xDAB7, 0x018C, 0xB164, 0x9CD2, 0x49E0,
+ 0xD8B4, 0xACFA, 0xF307, 0xCF25, 0xCAAF, 0xF48E, 0x47E9, 0x1018,
+ 0x6FD5, 0xF088, 0x4A6F, 0x5C72, 0x3824, 0x57F1, 0x73C7, 0x9751,
+ 0xCB23, 0xA17C, 0xE89C, 0x3E21, 0x96DD, 0x61DC, 0x0D86, 0x0F85,
+ 0xE090, 0x7C42, 0x71C4, 0xCCAA, 0x90D8, 0x0605, 0xF701, 0x1C12,
+ 0xC2A3, 0x6A5F, 0xAEF9, 0x69D0, 0x1791, 0x9958, 0x3A27, 0x27B9,
+ 0xD938, 0xEB13, 0x2BB3, 0x2233, 0xD2BB, 0xA970, 0x0789, 0x33A7,
+ 0x2DB6, 0x3C22, 0x1592, 0xC920, 0x8749, 0xAAFF, 0x5078, 0xA57A,
+ 0x038F, 0x59F8, 0x0980, 0x1A17, 0x65DA, 0xD731, 0x84C6, 0xD0B8,
+ 0x82C3, 0x29B0, 0x5A77, 0x1E11, 0x7BCB, 0xA8FC, 0x6DD6, 0x2C3A,
+};
+
+/*
+ * 16-bit substitution: the Sbox entry for the low byte of @v XORed with
+ * the byte-swapped Sbox entry for the high byte of @v.
+ */
+static inline u16 _S_(u16 v)
+{
+	u16 hi_entry = Sbox[Hi8(v)];
+	u16 swapped = (hi_entry << 8) | (hi_entry >> 8);
+
+	return Sbox[Lo8(v)] ^ swapped;
+}
+
+/* Number of mixing rounds performed by tkip_mixing_phase1() */
+#define PHASE1_LOOP_COUNT 8
+
+/*
+ * Phase 1 key mixing: fill the five 16-bit words of @TTAK from the upper
+ * 32 bits of the sequence counter (@IV32), the six address bytes at @TA
+ * and bytes 0..15 of the key @TK.
+ */
+static void tkip_mixing_phase1(u16 * TTAK, const u8 * TK, const u8 * TA,
+ u32 IV32)
+{
+ int i, j;
+
+ /* Initialize the 80-bit TTAK from TSC (IV32) and TA[0..5] */
+ TTAK[0] = Lo16(IV32);
+ TTAK[1] = Hi16(IV32);
+ TTAK[2] = Mk16(TA[1], TA[0]);
+ TTAK[3] = Mk16(TA[3], TA[2]);
+ TTAK[4] = Mk16(TA[5], TA[4]);
+
+ for (i = 0; i < PHASE1_LOOP_COUNT; i++) {
+ /* j alternates 0, 2, 0, 2, ... so odd rounds use the key bytes
+  * two positions further on */
+ j = 2 * (i & 1);
+ /* each word is updated from the word updated just before it */
+ TTAK[0] += _S_(TTAK[4] ^ Mk16(TK[1 + j], TK[0 + j]));
+ TTAK[1] += _S_(TTAK[0] ^ Mk16(TK[5 + j], TK[4 + j]));
+ TTAK[2] += _S_(TTAK[1] ^ Mk16(TK[9 + j], TK[8 + j]));
+ TTAK[3] += _S_(TTAK[2] ^ Mk16(TK[13 + j], TK[12 + j]));
+ TTAK[4] += _S_(TTAK[3] ^ Mk16(TK[1 + j], TK[0 + j])) + i;
+ }
+}
+
+/*
+ * Phase 2 key mixing: derive the 16-byte per-packet key in @WEPSeed from
+ * the key @TK, the phase 1 output @TTAK and the low 16 bits of the
+ * sequence counter (@IV16). Bytes 0..3 are written individually; bytes
+ * 4..15 are produced in place through the PPK[] alias.
+ */
+static void tkip_mixing_phase2(u8 * WEPSeed, const u8 * TK, const u16 * TTAK,
+ u16 IV16)
+{
+ /* Make temporary area overlap WEP seed so that the final copy can be
+ * avoided on little endian hosts. */
+ u16 *PPK = (u16 *) & WEPSeed[4];
+
+ /* Step 1 - make copy of TTAK and bring in TSC */
+ PPK[0] = TTAK[0];
+ PPK[1] = TTAK[1];
+ PPK[2] = TTAK[2];
+ PPK[3] = TTAK[3];
+ PPK[4] = TTAK[4];
+ PPK[5] = TTAK[4] + IV16;
+
+ /* Step 2 - 96-bit bijective mixing using S-box */
+ PPK[0] += _S_(PPK[5] ^ Mk16_le((u16 *) & TK[0]));
+ PPK[1] += _S_(PPK[0] ^ Mk16_le((u16 *) & TK[2]));
+ PPK[2] += _S_(PPK[1] ^ Mk16_le((u16 *) & TK[4]));
+ PPK[3] += _S_(PPK[2] ^ Mk16_le((u16 *) & TK[6]));
+ PPK[4] += _S_(PPK[3] ^ Mk16_le((u16 *) & TK[8]));
+ PPK[5] += _S_(PPK[4] ^ Mk16_le((u16 *) & TK[10]));
+
+ /* second pass uses one-bit rotations; only the first two steps mix
+  * in further key bytes (TK[12..15]) */
+ PPK[0] += RotR1(PPK[5] ^ Mk16_le((u16 *) & TK[12]));
+ PPK[1] += RotR1(PPK[0] ^ Mk16_le((u16 *) & TK[14]));
+ PPK[2] += RotR1(PPK[1]);
+ PPK[3] += RotR1(PPK[2]);
+ PPK[4] += RotR1(PPK[3]);
+ PPK[5] += RotR1(PPK[4]);
+
+ /* Step 3 - bring in last of TK bits, assign 24-bit WEP IV value
+ * WEPSeed[0..2] is transmitted as WEP IV */
+ WEPSeed[0] = Hi8(IV16);
+ WEPSeed[1] = (Hi8(IV16) | 0x20) & 0x7F;
+ WEPSeed[2] = Lo8(IV16);
+ WEPSeed[3] = Lo8((PPK[5] ^ Mk16_le((u16 *) & TK[0])) >> 1);
+
+#ifdef __BIG_ENDIAN
+ /* byte-swap each PPK word in place on big endian hosts */
+ {
+ int i;
+ for (i = 0; i < 6; i++)
+ PPK[i] = (PPK[i] << 8) | (PPK[i] >> 8);
+ }
+#endif
+}
+
+/*
+ * Insert the 8-byte TKIP IV/ExtIV after the @hdr_len byte 802.11 header
+ * of @skb and append a 4-byte little-endian CRC32 of the payload as ICV.
+ *
+ * Requires at least 8 bytes of headroom; tailroom for the ICV is not
+ * checked here. Phase 1 mixing is (re)run first when no cached result
+ * is available.
+ *
+ * Returns the 16-byte per-packet RC4 key in a buffer obtained from
+ * kmalloc(); this function does not free it. Returns NULL when the skb
+ * is unsuitable or the allocation fails.
+ */
+static u8 *ieee80211_tkip_hdr(struct sk_buff *skb, int hdr_len, void *priv)
+{
+	struct ieee80211_tkip_data *tkey = priv;
+	struct ieee80211_hdr_4addr *hdr =
+	    (struct ieee80211_hdr_4addr *)skb->data;
+	u8 *seed, *iv, *payload, *icv;
+	int payload_len;
+	u32 crc;
+
+	if (skb_headroom(skb) < 8 || skb->len < hdr_len)
+		return NULL;
+
+	if (!tkey->tx_phase1_done) {
+		tkip_mixing_phase1(tkey->tx_ttak, tkey->key, hdr->addr2,
+				   tkey->tx_iv32);
+		tkey->tx_phase1_done = 1;
+	}
+
+	seed = kmalloc(16, GFP_ATOMIC);
+	if (seed == NULL)
+		return NULL;
+	tkip_mixing_phase2(seed, tkey->key, tkey->tx_ttak, tkey->tx_iv16);
+
+	/* Open an 8-byte gap behind the 802.11 header and reserve the ICV */
+	payload_len = skb->len - hdr_len;
+	iv = skb_push(skb, 8);
+	memmove(iv, iv + 8, hdr_len);
+	iv += hdr_len;
+	icv = skb_put(skb, 4);
+
+	/* IV: first three key bytes, then key index with the ExtIV bit */
+	iv[0] = seed[0];
+	iv[1] = seed[1];
+	iv[2] = seed[2];
+	iv[3] = (tkey->key_idx << 6) | (1 << 5) /* Ext IV included */ ;
+	/* ExtIV: tx_iv32, least significant byte first */
+	iv[4] = tkey->tx_iv32 & 0xff;
+	iv[5] = (tkey->tx_iv32 >> 8) & 0xff;
+	iv[6] = (tkey->tx_iv32 >> 16) & 0xff;
+	iv[7] = (tkey->tx_iv32 >> 24) & 0xff;
+	payload = iv + 8;
+
+	crc = ~crc32_le(~0, payload, payload_len);
+	icv[0] = crc;
+	icv[1] = crc >> 8;
+	icv[2] = crc >> 16;
+	icv[3] = crc >> 24;
+
+	return seed;
+}
+
+/*
+ * TKIP-encrypt one MPDU in place.
+ *
+ * ieee80211_tkip_hdr() inserts the IV/ExtIV and appends the ICV; the
+ * payload and ICV are then encrypted with ARC4 using the per-packet key
+ * it returns. On success the TX sequence counter is advanced, and
+ * phase 1 is invalidated when tx_iv16 wraps.
+ *
+ * Returns 0 on success and -1 if countermeasures are active, the skb
+ * lacks 4 bytes of tailroom or is shorter than @hdr_len, or the header
+ * could not be built.
+ */
+static int ieee80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+	struct ieee80211_tkip_data *tkey = priv;
+	int len;
+	u8 *rc4key;
+	u8 *pos;
+	struct scatterlist sg;
+
+	if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) {
+		if (net_ratelimit()) {
+			struct ieee80211_hdr_4addr *hdr =
+			    (struct ieee80211_hdr_4addr *)skb->data;
+			printk(KERN_DEBUG "TKIP countermeasures: dropped "
+			       "TX packet to " MAC_FMT "\n",
+			       MAC_ARG(hdr->addr1));
+		}
+		return -1;
+	}
+
+	if (skb_tailroom(skb) < 4 || skb->len < hdr_len)
+		return -1;
+
+	/* Taken before the header is built: once ieee80211_tkip_hdr() has
+	 * pushed 8 bytes and moved the 802.11 header down, this address is
+	 * the first payload byte behind the IV/ExtIV. */
+	len = skb->len - hdr_len;
+	pos = skb->data + hdr_len;
+
+	rc4key = ieee80211_tkip_hdr(skb, hdr_len, priv);
+	if (!rc4key)
+		return -1;
+
+	crypto_cipher_setkey(tkey->tfm_arc4, rc4key, 16);
+	/* The per-packet key was kmalloc'ed by ieee80211_tkip_hdr() and is
+	 * not needed once it has been loaded into the cipher; release it
+	 * here so it is not leaked on every transmitted frame. */
+	kfree(rc4key);
+
+	sg.page = virt_to_page(pos);
+	sg.offset = offset_in_page(pos);
+	sg.length = len + 4;
+	crypto_cipher_encrypt(tkey->tfm_arc4, &sg, &sg, len + 4);
+
+	tkey->tx_iv16++;
+	if (tkey->tx_iv16 == 0) {
+		tkey->tx_phase1_done = 0;
+		tkey->tx_iv32++;
+	}
+
+	return 0;
+}
+
+/*
+ * TKIP-decrypt one received MPDU in place and strip its IV/ExtIV and ICV.
+ *
+ * The replay check is made against rx_iv32/rx_iv16, but those counters
+ * are not advanced here: the values of this frame are only stored in
+ * rx_iv32_new/rx_iv16_new.
+ *
+ * Returns the frame's key index (0..3) on success, or
+ *   -1  countermeasures active, or frame shorter than header + IV + ICV
+ *   -2  ExtIV flag not set
+ *   -6  key index in the frame differs from this key's index
+ *   -3  no key configured
+ *   -4  replay detected
+ *   -5  ICV mismatch
+ */
+static int ieee80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+	struct ieee80211_tkip_data *tkey = priv;
+	u8 rc4key[16];
+	u8 keyidx, *pos;
+	u32 iv32;
+	u16 iv16;
+	struct ieee80211_hdr_4addr *hdr;
+	u8 icv[4];
+	u32 crc;
+	struct scatterlist sg;
+	int plen;
+
+	hdr = (struct ieee80211_hdr_4addr *)skb->data;
+
+	if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) {
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "TKIP countermeasures: dropped "
+			       "received packet from " MAC_FMT "\n",
+			       MAC_ARG(hdr->addr2));
+		}
+		return -1;
+	}
+
+	if (skb->len < hdr_len + 8 + 4)
+		return -1;
+
+	pos = skb->data + hdr_len;
+	keyidx = pos[3];
+	if (!(keyidx & (1 << 5))) {
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "TKIP: received packet without ExtIV"
+			       " flag from " MAC_FMT "\n", MAC_ARG(hdr->addr2));
+		}
+		return -2;
+	}
+	keyidx >>= 6;
+	if (tkey->key_idx != keyidx) {
+		/* Triggered by the contents of received frames, so it is
+		 * rate limited like every other diagnostic in this path. */
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "TKIP: RX tkey->key_idx=%d frame "
+			       "keyidx=%d priv=%p\n", tkey->key_idx, keyidx,
+			       priv);
+		}
+		return -6;
+	}
+	if (!tkey->key_set) {
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "TKIP: received packet from " MAC_FMT
+			       " with keyid=%d that does not have a configured"
+			       " key\n", MAC_ARG(hdr->addr2), keyidx);
+		}
+		return -3;
+	}
+	/* iv16 is split across IV bytes 0 and 2; iv32 follows in the ExtIV,
+	 * least significant byte first */
+	iv16 = (pos[0] << 8) | pos[2];
+	iv32 = pos[4] | (pos[5] << 8) | (pos[6] << 16) | (pos[7] << 24);
+	pos += 8;
+
+	/* The 48-bit counter must be strictly greater than the last
+	 * accepted one */
+	if (iv32 < tkey->rx_iv32 ||
+	    (iv32 == tkey->rx_iv32 && iv16 <= tkey->rx_iv16)) {
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "TKIP: replay detected: STA=" MAC_FMT
+			       " previous TSC %08x%04x received TSC "
+			       "%08x%04x\n", MAC_ARG(hdr->addr2),
+			       tkey->rx_iv32, tkey->rx_iv16, iv32, iv16);
+		}
+		tkey->dot11RSNAStatsTKIPReplays++;
+		return -4;
+	}
+
+	if (iv32 != tkey->rx_iv32 || !tkey->rx_phase1_done) {
+		tkip_mixing_phase1(tkey->rx_ttak, tkey->key, hdr->addr2, iv32);
+		tkey->rx_phase1_done = 1;
+	}
+	tkip_mixing_phase2(rc4key, tkey->key, tkey->rx_ttak, iv16);
+
+	/* Payload length without IV/ExtIV (8 bytes) and ICV (4 bytes) */
+	plen = skb->len - hdr_len - 12;
+
+	crypto_cipher_setkey(tkey->tfm_arc4, rc4key, 16);
+	sg.page = virt_to_page(pos);
+	sg.offset = offset_in_page(pos);
+	sg.length = plen + 4;
+	crypto_cipher_decrypt(tkey->tfm_arc4, &sg, &sg, plen + 4);
+
+	crc = ~crc32_le(~0, pos, plen);
+	icv[0] = crc;
+	icv[1] = crc >> 8;
+	icv[2] = crc >> 16;
+	icv[3] = crc >> 24;
+	if (memcmp(icv, pos + plen, 4) != 0) {
+		if (iv32 != tkey->rx_iv32) {
+			/* Previously cached Phase1 result was already lost, so
+			 * it needs to be recalculated for the next packet. */
+			tkey->rx_phase1_done = 0;
+		}
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "TKIP: ICV error detected: STA="
+			       MAC_FMT "\n", MAC_ARG(hdr->addr2));
+		}
+		tkey->dot11RSNAStatsTKIPICVErrors++;
+		return -5;
+	}
+
+	/* Update real counters only after Michael MIC verification has
+	 * completed */
+	tkey->rx_iv32_new = iv32;
+	tkey->rx_iv16_new = iv16;
+
+	/* Remove IV and ICV */
+	memmove(skb->data + 8, skb->data, hdr_len);
+	skb_pull(skb, 8);
+	skb_trim(skb, skb->len - 4);
+
+	return keyidx;
+}
+
+/*
+ * Compute the Michael MIC over the 16-byte pseudo header at @hdr followed
+ * by @data_len bytes at @data, keyed with the 8 bytes at @key, and store
+ * the digest at @mic.
+ *
+ * Returns 0 on success, -1 if no michael_mic transform is available.
+ */
+static int michael_mic(struct ieee80211_tkip_data *tkey, u8 * key, u8 * hdr,
+		       u8 * data, size_t data_len, u8 * mic)
+{
+	struct crypto_tfm *tfm = tkey->tfm_michael;
+	struct scatterlist sg[2];
+
+	if (!tfm) {
+		printk(KERN_WARNING "michael_mic: tfm_michael == NULL\n");
+		return -1;
+	}
+
+	/* first segment: pseudo header */
+	sg[0].page = virt_to_page(hdr);
+	sg[0].offset = offset_in_page(hdr);
+	sg[0].length = 16;
+
+	/* second segment: MSDU data */
+	sg[1].page = virt_to_page(data);
+	sg[1].offset = offset_in_page(data);
+	sg[1].length = data_len;
+
+	crypto_digest_init(tfm);
+	crypto_digest_setkey(tfm, key, 8);
+	crypto_digest_update(tfm, sg, 2);
+	crypto_digest_final(tfm, mic);
+
+	return 0;
+}
+
+/*
+ * Build the 16-byte Michael pseudo header for @skb in @hdr: destination
+ * address, source address, then four zero bytes (priority + reserved).
+ * Which 802.11 address fields hold DA and SA depends on the ToDS/FromDS
+ * bits of the frame control field.
+ */
+static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr)
+{
+	struct ieee80211_hdr_4addr *hdr11 =
+	    (struct ieee80211_hdr_4addr *)skb->data;
+	const void *da = NULL;
+	const void *sa = NULL;
+
+	switch (le16_to_cpu(hdr11->frame_ctl) &
+		(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) {
+	case IEEE80211_FCTL_TODS:
+		da = hdr11->addr3;
+		sa = hdr11->addr2;
+		break;
+	case IEEE80211_FCTL_FROMDS:
+		da = hdr11->addr1;
+		sa = hdr11->addr3;
+		break;
+	case IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS:
+		da = hdr11->addr3;
+		sa = hdr11->addr4;
+		break;
+	case 0:
+		da = hdr11->addr1;
+		sa = hdr11->addr2;
+		break;
+	}
+
+	if (da != NULL) {
+		memcpy(hdr, da, ETH_ALEN);	/* DA */
+		memcpy(hdr + ETH_ALEN, sa, ETH_ALEN);	/* SA */
+	}
+
+	/* hdr[12] is the priority, hdr[13..15] are reserved; all zero */
+	memset(hdr + 12, 0, 4);
+}
+
+/*
+ * Append an 8-byte Michael MIC to @skb. The MIC covers the pseudo header
+ * built by michael_mic_hdr() and the data following the @hdr_len byte
+ * 802.11 header, and is keyed with key bytes 16..23.
+ *
+ * Returns 0 on success, -1 if the skb has less than 8 bytes of tailroom,
+ * is shorter than @hdr_len, or the MIC could not be computed.
+ */
+static int ieee80211_michael_mic_add(struct sk_buff *skb, int hdr_len,
+				     void *priv)
+{
+	struct ieee80211_tkip_data *tkey = priv;
+	size_t data_len;
+	u8 *mic;
+
+	if (skb_tailroom(skb) < 8 || skb->len < hdr_len) {
+		printk(KERN_DEBUG "Invalid packet for Michael MIC add "
+		       "(tailroom=%d hdr_len=%d skb->len=%d)\n",
+		       skb_tailroom(skb), hdr_len, skb->len);
+		return -1;
+	}
+
+	michael_mic_hdr(skb, tkey->tx_hdr);
+
+	/* measure the data before the skb grows by the MIC itself */
+	data_len = skb->len - hdr_len;
+	mic = skb_put(skb, 8);
+
+	return michael_mic(tkey, &tkey->key[16], tkey->tx_hdr,
+			   skb->data + hdr_len, data_len, mic) ? -1 : 0;
+}
+
+/*
+ * Send an IWEVMICHAELMICFAILURE wireless event on @dev describing a MIC
+ * failure on the frame with header @hdr. The event carries the key index,
+ * a group/pairwise flag taken from the low bit of the first addr1 byte,
+ * and addr2 as the source address.
+ */
+static void ieee80211_michael_mic_failure(struct net_device *dev,
+					  struct ieee80211_hdr_4addr *hdr,
+					  int keyidx)
+{
+	struct iw_michaelmicfailure ev;
+	union iwreq_data wrqu;
+
+	/* TODO: needed parameters: count, keyid, key type, TSC */
+	memset(&ev, 0, sizeof(ev));
+	ev.flags = keyidx & IW_MICFAILURE_KEY_ID;
+	ev.flags |= (hdr->addr1[0] & 0x01) ?
+	    IW_MICFAILURE_GROUP : IW_MICFAILURE_PAIRWISE;
+	ev.src_addr.sa_family = ARPHRD_ETHER;
+	memcpy(ev.src_addr.sa_data, hdr->addr2, ETH_ALEN);
+
+	memset(&wrqu, 0, sizeof(wrqu));
+	wrqu.data.length = sizeof(ev);
+
+	wireless_send_event(dev, IWEVMICHAELMICFAILURE, &wrqu, (char *)&ev);
+}
+
+/*
+ * Verify and strip the trailing 8-byte Michael MIC of a received MSDU.
+ * The MIC is keyed with key bytes 24..31.
+ *
+ * On success the RX sequence counters are advanced to the values that
+ * ieee80211_tkip_decrypt() stored for this frame, and the MIC is trimmed
+ * from the skb. On a mismatch a MIC failure event is sent (when the skb
+ * has a device) and the local MIC failure counter is incremented.
+ *
+ * Returns 0 on success, -1 if no key is set, the frame is too short to
+ * contain a MIC, or verification fails.
+ */
+static int ieee80211_michael_mic_verify(struct sk_buff *skb, int keyidx,
+					int hdr_len, void *priv)
+{
+	struct ieee80211_tkip_data *tkey = priv;
+	u8 mic[8];
+
+	if (!tkey->key_set)
+		return -1;
+
+	/* Without this check "skb->len - 8 - hdr_len" below would wrap
+	 * around for frames with fewer than 8 bytes after the header, and
+	 * the MIC comparison would read before the payload. */
+	if (skb->len < hdr_len + 8)
+		return -1;
+
+	michael_mic_hdr(skb, tkey->rx_hdr);
+	if (michael_mic(tkey, &tkey->key[24], tkey->rx_hdr,
+			skb->data + hdr_len, skb->len - 8 - hdr_len, mic))
+		return -1;
+	if (memcmp(mic, skb->data + skb->len - 8, 8) != 0) {
+		struct ieee80211_hdr_4addr *hdr;
+		hdr = (struct ieee80211_hdr_4addr *)skb->data;
+		printk(KERN_DEBUG "%s: Michael MIC verification failed for "
+		       "MSDU from " MAC_FMT " keyidx=%d\n",
+		       skb->dev ? skb->dev->name : "N/A", MAC_ARG(hdr->addr2),
+		       keyidx);
+		if (skb->dev)
+			ieee80211_michael_mic_failure(skb->dev, hdr, keyidx);
+		tkey->dot11RSNAStatsTKIPLocalMICFailures++;
+		return -1;
+	}
+
+	/* Update TSC counters for RX now that the packet verification has
+	 * completed. */
+	tkey->rx_iv32 = tkey->rx_iv32_new;
+	tkey->rx_iv16 = tkey->rx_iv16_new;
+
+	skb_trim(skb, skb->len - 8);
+
+	return 0;
+}
+
+/*
+ * Install or clear the TKIP key.
+ *
+ * All state except the key index and the two crypto transforms is reset
+ * first, whatever @len is. A @len of TKIP_KEY_LEN installs @key, starts
+ * the TX counter at 1 and, when @seq is non-NULL, loads the RX counter
+ * from the six bytes at @seq (least significant byte first). A @len of 0
+ * leaves the key cleared.
+ *
+ * Returns 0 on success, -1 for any other @len.
+ */
+static int ieee80211_tkip_set_key(void *key, int len, u8 * seq, void *priv)
+{
+	struct ieee80211_tkip_data *tkey = priv;
+	struct crypto_tfm *michael = tkey->tfm_michael;
+	struct crypto_tfm *arc4 = tkey->tfm_arc4;
+	int idx = tkey->key_idx;
+
+	/* Wipe everything, then put back what must survive a rekey */
+	memset(tkey, 0, sizeof(*tkey));
+	tkey->key_idx = idx;
+	tkey->tfm_michael = michael;
+	tkey->tfm_arc4 = arc4;
+
+	if (len == 0) {
+		tkey->key_set = 0;
+		return 0;
+	}
+	if (len != TKIP_KEY_LEN)
+		return -1;
+
+	memcpy(tkey->key, key, TKIP_KEY_LEN);
+	tkey->key_set = 1;
+	tkey->tx_iv16 = 1;	/* TSC is initialized to 1 */
+	if (seq != NULL) {
+		tkey->rx_iv32 = (seq[5] << 24) | (seq[4] << 16) |
+		    (seq[3] << 8) | seq[2];
+		tkey->rx_iv16 = (seq[1] << 8) | seq[0];
+	}
+
+	return 0;
+}
+
+/*
+ * Copy the configured TKIP key into @key and, when @seq is non-NULL, the
+ * sequence counter of the last transmitted frame into @seq (six bytes,
+ * least significant byte first).
+ *
+ * Returns -1 if @len is smaller than TKIP_KEY_LEN, 0 if no key is set,
+ * and TKIP_KEY_LEN once the key has been copied.
+ */
+static int ieee80211_tkip_get_key(void *key, int len, u8 * seq, void *priv)
+{
+	struct ieee80211_tkip_data *tkey = priv;
+
+	if (len < TKIP_KEY_LEN)
+		return -1;
+
+	if (!tkey->key_set)
+		return 0;
+	memcpy(key, tkey->key, TKIP_KEY_LEN);
+
+	if (seq) {
+		/* Return the sequence number of the last transmitted frame.
+		 * tx_iv16/tx_iv32 hold the value for the NEXT frame, so step
+		 * back by one, borrowing from iv32 when iv16 has just
+		 * wrapped to 0. */
+		u16 iv16 = tkey->tx_iv16;
+		u32 iv32 = tkey->tx_iv32;
+		if (iv16 == 0)
+			iv32--;
+		iv16--;
+		/* report the adjusted values, not the raw next-frame ones */
+		seq[0] = iv16;
+		seq[1] = iv16 >> 8;
+		seq[2] = iv32;
+		seq[3] = iv32 >> 8;
+		seq[4] = iv32 >> 16;
+		seq[5] = iv32 >> 24;
+	}
+
+	return TKIP_KEY_LEN;
+}
+
+/*
+ * Append a one-line textual summary of the TKIP key state, the TX and RX
+ * sequence counters (most significant byte first) and the error counters
+ * to the buffer at @p.
+ *
+ * Returns the position immediately after the text that was written.
+ */
+static char *ieee80211_tkip_print_stats(char *p, void *priv)
+{
+	struct ieee80211_tkip_data *tkey = priv;
+	u32 tx32 = tkey->tx_iv32;
+	u16 tx16 = tkey->tx_iv16;
+	u32 rx32 = tkey->rx_iv32;
+	u16 rx16 = tkey->rx_iv16;
+	int written;
+
+	written = sprintf(p, "key[%d] alg=TKIP key_set=%d "
+			  "tx_pn=%02x%02x%02x%02x%02x%02x "
+			  "rx_pn=%02x%02x%02x%02x%02x%02x "
+			  "replays=%d icv_errors=%d local_mic_failures=%d\n",
+			  tkey->key_idx, tkey->key_set,
+			  (tx32 >> 24) & 0xff,
+			  (tx32 >> 16) & 0xff,
+			  (tx32 >> 8) & 0xff,
+			  tx32 & 0xff,
+			  (tx16 >> 8) & 0xff,
+			  tx16 & 0xff,
+			  (rx32 >> 24) & 0xff,
+			  (rx32 >> 16) & 0xff,
+			  (rx32 >> 8) & 0xff,
+			  rx32 & 0xff,
+			  (rx16 >> 8) & 0xff,
+			  rx16 & 0xff,
+			  tkey->dot11RSNAStatsTKIPReplays,
+			  tkey->dot11RSNAStatsTKIPICVErrors,
+			  tkey->dot11RSNAStatsTKIPLocalMICFailures);
+
+	return p + written;
+}
+
+/*
+ * Operations table for the "TKIP" algorithm. It is passed to
+ * ieee80211_register_crypto_ops() by the module init function below.
+ */
+static struct ieee80211_crypto_ops ieee80211_crypt_tkip = {
+ .name = "TKIP",
+ .init = ieee80211_tkip_init,
+ .deinit = ieee80211_tkip_deinit,
+ /* per-MPDU hooks: ARC4 encryption with IV/ExtIV and ICV */
+ .encrypt_mpdu = ieee80211_tkip_encrypt,
+ .decrypt_mpdu = ieee80211_tkip_decrypt,
+ /* per-MSDU hooks: Michael MIC */
+ .encrypt_msdu = ieee80211_michael_mic_add,
+ .decrypt_msdu = ieee80211_michael_mic_verify,
+ .set_key = ieee80211_tkip_set_key,
+ .get_key = ieee80211_tkip_get_key,
+ .print_stats = ieee80211_tkip_print_stats,
+ .extra_mpdu_prefix_len = 4 + 4, /* IV + ExtIV */
+ .extra_mpdu_postfix_len = 4, /* ICV */
+ .extra_msdu_postfix_len = 8, /* MIC */
+ .get_flags = ieee80211_tkip_get_flags,
+ .set_flags = ieee80211_tkip_set_flags,
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Module init: hand the TKIP ops table to
+ * ieee80211_register_crypto_ops() and return its result.
+ */
+static int __init ieee80211_crypto_tkip_init(void)
+{
+ return ieee80211_register_crypto_ops(&ieee80211_crypt_tkip);
+}
+
+/* Module exit: undo the registration performed at init time. */
+static void __exit ieee80211_crypto_tkip_exit(void)
+{
+ ieee80211_unregister_crypto_ops(&ieee80211_crypt_tkip);
+}
+
+module_init(ieee80211_crypto_tkip_init);
+module_exit(ieee80211_crypto_tkip_exit);
diff --git a/net/ieee80211/ieee80211_crypt_wep.c b/net/ieee80211/ieee80211_crypt_wep.c
new file mode 100644
index 00000000000..7c08ed2f262
--- /dev/null
+++ b/net/ieee80211/ieee80211_crypt_wep.c
@@ -0,0 +1,258 @@
+/*
+ * Host AP crypt: host-based WEP encryption implementation for Host AP driver
+ *
+ * Copyright (c) 2002-2004, Jouni Malinen <jkmaline@cc.hut.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ */
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <asm/string.h>
+
+#include <net/ieee80211.h>
+
+#include <linux/crypto.h>
+#include <asm/scatterlist.h>
+#include <linux/crc32.h>
+
+/* Module metadata for the WEP crypto module. */
+MODULE_AUTHOR("Jouni Malinen");
+MODULE_DESCRIPTION("Host AP crypt: WEP");
+MODULE_LICENSE("GPL");
+
+/* Per-key WEP state; allocated by prism2_wep_init(). */
+struct prism2_wep_data {
+ /* TX IV counter; seeded randomly at init and incremented for every
+  * encrypted frame. Only the low 24 bits are put into the frame. */
+ u32 iv;
+#define WEP_KEY_LEN 13
+ u8 key[WEP_KEY_LEN + 1]; /* secret part of the key; key_len bytes used */
+ u8 key_len; /* number of valid bytes in key[], at most WEP_KEY_LEN */
+ u8 key_idx; /* key index written to / expected in the 4th IV byte */
+ struct crypto_tfm *tfm; /* "arc4" crypto API transform */
+};
+
+/*
+ * Allocate and zero a WEP state block for key index @keyidx, allocate the
+ * "arc4" crypto API transform and seed the IV counter with random bytes.
+ *
+ * Returns the new state block, or NULL if an allocation fails.
+ */
+static void *prism2_wep_init(int keyidx)
+{
+	struct prism2_wep_data *wep;
+
+	wep = kmalloc(sizeof(*wep), GFP_ATOMIC);
+	if (!wep)
+		return NULL;
+	memset(wep, 0, sizeof(*wep));
+	wep->key_idx = keyidx;
+
+	wep->tfm = crypto_alloc_tfm("arc4", 0);
+	if (!wep->tfm) {
+		printk(KERN_DEBUG "ieee80211_crypt_wep: could not allocate "
+		       "crypto API arc4\n");
+		kfree(wep);
+		return NULL;
+	}
+
+	/* start WEP IV from a random value */
+	get_random_bytes(&wep->iv, 4);
+
+	return wep;
+}
+
+/* Release the crypto transform (when present) and the state block itself. */
+static void prism2_wep_deinit(void *priv)
+{
+	struct prism2_wep_data *wep = priv;
+
+	if (wep != NULL && wep->tfm != NULL)
+		crypto_free_tfm(wep->tfm);
+
+	kfree(priv);
+}
+
+/*
+ * WEP-encrypt the given skb in place. The skb must have at least 4 bytes
+ * of headroom for the IV and 4 bytes of tailroom for the ICV; both are
+ * transmitted, so the frame grows by 8 bytes.
+ *
+ * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data))
+ *
+ * Returns 0 on success, -1 if the skb lacks head/tailroom or is shorter
+ * than hdr_len.
+ */
+static int prism2_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+	struct prism2_wep_data *wep = priv;
+	u8 rc4key[WEP_KEY_LEN + 3];
+	u32 payload_len, rc4key_len, crc;
+	u8 *iv, *payload, *icv;
+	struct scatterlist sg;
+
+	if (skb_headroom(skb) < 4 || skb_tailroom(skb) < 4 ||
+	    skb->len < hdr_len)
+		return -1;
+
+	/* Open a 4-byte gap for the IV right behind the 802.11 header */
+	payload_len = skb->len - hdr_len;
+	iv = skb_push(skb, 4);
+	memmove(iv, iv + 4, hdr_len);
+	iv += hdr_len;
+	payload = iv + 4;
+
+	rc4key_len = 3 + wep->key_len;
+
+	wep->iv++;
+
+	/* Fluhrer, Mantin, and Shamir have reported weaknesses in the key
+	 * scheduling algorithm of RC4. At least IVs (KeyByte + 3, 0xff, N)
+	 * can be used to speedup attacks, so avoid using them. */
+	if ((wep->iv & 0xff00) == 0xff00) {
+		u8 B = (wep->iv >> 16) & 0xff;
+		if (B >= 3 && B < rc4key_len)
+			wep->iv += 0x0100;
+	}
+
+	/* RC4 key = 24-bit IV followed by the secret part of the WEP key */
+	rc4key[0] = (wep->iv >> 16) & 0xff;
+	rc4key[1] = (wep->iv >> 8) & 0xff;
+	rc4key[2] = wep->iv & 0xff;
+	memcpy(rc4key + 3, wep->key, wep->key_len);
+
+	/* The same IV goes into the frame, followed by the key index */
+	iv[0] = rc4key[0];
+	iv[1] = rc4key[1];
+	iv[2] = rc4key[2];
+	iv[3] = wep->key_idx << 6;
+
+	/* Append little-endian CRC32 and encrypt it to produce ICV */
+	crc = ~crc32_le(~0, payload, payload_len);
+	icv = skb_put(skb, 4);
+	icv[0] = crc;
+	icv[1] = crc >> 8;
+	icv[2] = crc >> 16;
+	icv[3] = crc >> 24;
+
+	crypto_cipher_setkey(wep->tfm, rc4key, rc4key_len);
+	sg.page = virt_to_page(payload);
+	sg.offset = offset_in_page(payload);
+	sg.length = payload_len + 4;
+	crypto_cipher_encrypt(wep->tfm, &sg, &sg, payload_len + 4);
+
+	return 0;
+}
+
+/*
+ * WEP-decrypt the given skb in place. Behind the hdr_len byte 802.11
+ * header the frame carries the IV (4 bytes), the encrypted payload
+ * (including SNAP header) and the ICV (4 bytes).
+ *
+ * Returns 0 if the frame was decrypted and its ICV is correct, in which
+ * case IV and ICV are removed from the skb. Returns -1 if the frame is
+ * too short or uses a different key index, and -2 on ICV mismatch.
+ */
+static int prism2_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+	struct prism2_wep_data *wep = priv;
+	u8 rc4key[WEP_KEY_LEN + 3];
+	u32 rc4key_len, payload_len, crc;
+	u8 *iv, *payload;
+	u8 expected_icv[4];
+	struct scatterlist sg;
+
+	if (skb->len < hdr_len + 8)
+		return -1;
+
+	iv = skb->data + hdr_len;
+	payload = iv + 4;
+
+	/* The top two bits of the fourth IV byte carry the key index */
+	if ((iv[3] >> 6) != wep->key_idx)
+		return -1;
+
+	/* RC4 key = received 24-bit IV followed by the secret part */
+	rc4key[0] = iv[0];
+	rc4key[1] = iv[1];
+	rc4key[2] = iv[2];
+	memcpy(rc4key + 3, wep->key, wep->key_len);
+	rc4key_len = 3 + wep->key_len;
+
+	/* Apply RC4 to data and compute CRC32 over decrypted data */
+	payload_len = skb->len - hdr_len - 8;
+
+	crypto_cipher_setkey(wep->tfm, rc4key, rc4key_len);
+	sg.page = virt_to_page(payload);
+	sg.offset = offset_in_page(payload);
+	sg.length = payload_len + 4;
+	crypto_cipher_decrypt(wep->tfm, &sg, &sg, payload_len + 4);
+
+	crc = ~crc32_le(~0, payload, payload_len);
+	expected_icv[0] = crc;
+	expected_icv[1] = crc >> 8;
+	expected_icv[2] = crc >> 16;
+	expected_icv[3] = crc >> 24;
+	if (memcmp(expected_icv, payload + payload_len, 4) != 0) {
+		/* ICV mismatch - drop frame */
+		return -2;
+	}
+
+	/* Remove IV and ICV */
+	memmove(skb->data + 4, skb->data, hdr_len);
+	skb_pull(skb, 4);
+	skb_trim(skb, skb->len - 4);
+
+	return 0;
+}
+
+/*
+ * Store @len bytes from @key as the WEP key. @seq is not used.
+ *
+ * Returns 0 on success, -1 if @len is negative or exceeds WEP_KEY_LEN.
+ */
+static int prism2_wep_set_key(void *key, int len, u8 * seq, void *priv)
+{
+	struct prism2_wep_data *wep = priv;
+
+	if (!(len >= 0 && len <= WEP_KEY_LEN))
+		return -1;
+
+	wep->key_len = len;
+	memcpy(wep->key, key, len);
+
+	return 0;
+}
+
+/*
+ * Copy the stored WEP key into @key. @seq is not used.
+ *
+ * Returns the key length, or -1 if @len is smaller than that length.
+ */
+static int prism2_wep_get_key(void *key, int len, u8 * seq, void *priv)
+{
+	struct prism2_wep_data *wep = priv;
+	int stored_len = wep->key_len;
+
+	if (len < stored_len)
+		return -1;
+
+	memcpy(key, wep->key, stored_len);
+
+	return stored_len;
+}
+
+/*
+ * Append a one-line summary (key index and key length) to the buffer at
+ * @p and return the position immediately after it.
+ */
+static char *prism2_wep_print_stats(char *p, void *priv)
+{
+	struct prism2_wep_data *data = priv;
+
+	return p + sprintf(p, "key[%d] alg=WEP len=%d\n",
+			   data->key_idx, data->key_len);
+}
+
+/*
+ * Operations table for the "WEP" algorithm. It is passed to
+ * ieee80211_register_crypto_ops() by the module init function below.
+ */
+static struct ieee80211_crypto_ops ieee80211_crypt_wep = {
+ .name = "WEP",
+ .init = prism2_wep_init,
+ .deinit = prism2_wep_deinit,
+ .encrypt_mpdu = prism2_wep_encrypt,
+ .decrypt_mpdu = prism2_wep_decrypt,
+ /* no MSDU-level hooks are provided for WEP */
+ .encrypt_msdu = NULL,
+ .decrypt_msdu = NULL,
+ .set_key = prism2_wep_set_key,
+ .get_key = prism2_wep_get_key,
+ .print_stats = prism2_wep_print_stats,
+ .extra_mpdu_prefix_len = 4, /* IV */
+ .extra_mpdu_postfix_len = 4, /* ICV */
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Module init: hand the WEP ops table to
+ * ieee80211_register_crypto_ops() and return its result.
+ */
+static int __init ieee80211_crypto_wep_init(void)
+{
+ return ieee80211_register_crypto_ops(&ieee80211_crypt_wep);
+}
+
+/* Module exit: undo the registration performed at init time. */
+static void __exit ieee80211_crypto_wep_exit(void)
+{
+ ieee80211_unregister_crypto_ops(&ieee80211_crypt_wep);
+}
+
+module_init(ieee80211_crypto_wep_init);
+module_exit(ieee80211_crypto_wep_exit);
diff --git a/net/ieee80211/ieee80211_geo.c b/net/ieee80211/ieee80211_geo.c
new file mode 100644
index 00000000000..c4b54ef8f6d
--- /dev/null
+++ b/net/ieee80211/ieee80211_geo.c
@@ -0,0 +1,141 @@
+/******************************************************************************
+
+ Copyright(c) 2005 Intel Corporation. All rights reserved.
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program; if not, write to the Free Software Foundation, Inc., 59
+ Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+ The full GNU General Public License is included in this distribution in the
+ file called LICENSE.
+
+ Contact Information:
+ James P. Ketrenos <ipw2100-admin@linux.intel.com>
+ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
+******************************************************************************/
+#include <linux/compiler.h>
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/if_arp.h>
+#include <linux/in6.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/wireless.h>
+#include <linux/etherdevice.h>
+#include <asm/uaccess.h>
+
+#include <net/ieee80211.h>
+
+/*
+ * Check whether @channel appears in the geography map of @ieee for one of
+ * the bands enabled in ieee->freq_band.
+ *
+ * Returns IEEE80211_24GHZ_BAND or IEEE80211_52GHZ_BAND for the band in
+ * which the channel was found, or 0 if it was not found. The geography
+ * map must have been set up beforehand (enforced with BUG_ON).
+ */
+int ieee80211_is_valid_channel(struct ieee80211_device *ieee, u8 channel)
+{
+	int idx;
+
+	/* Driver needs to initialize the geography map before using
+	 * these helper functions */
+	BUG_ON(ieee->geo.bg_channels == 0 && ieee->geo.a_channels == 0);
+
+	if (ieee->freq_band & IEEE80211_24GHZ_BAND) {
+		for (idx = 0; idx < ieee->geo.bg_channels; idx++) {
+			if (ieee->geo.bg[idx].channel != channel)
+				continue;
+			/* NOTE: If G mode is currently supported but
+			 * this is a B only channel, we don't see it
+			 * as valid. */
+			if ((ieee->mode & IEEE_G) &&
+			    (ieee->geo.bg[idx].flags & IEEE80211_CH_B_ONLY))
+				continue;
+			return IEEE80211_24GHZ_BAND;
+		}
+	}
+
+	if (ieee->freq_band & IEEE80211_52GHZ_BAND) {
+		for (idx = 0; idx < ieee->geo.a_channels; idx++) {
+			if (ieee->geo.a[idx].channel == channel)
+				return IEEE80211_52GHZ_BAND;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Look up @channel in the geography map of @ieee, searching the 2.4 GHz
+ * table first and then the 5.2 GHz table (each only if its band is
+ * enabled in ieee->freq_band).
+ *
+ * Returns the index within the table where the channel was found, or -1
+ * if it is in neither. The geography map must have been set up
+ * beforehand (enforced with BUG_ON).
+ */
+int ieee80211_channel_to_index(struct ieee80211_device *ieee, u8 channel)
+{
+	int idx;
+
+	/* Driver needs to initialize the geography map before using
+	 * these helper functions */
+	BUG_ON(ieee->geo.bg_channels == 0 && ieee->geo.a_channels == 0);
+
+	if (ieee->freq_band & IEEE80211_24GHZ_BAND) {
+		for (idx = 0; idx < ieee->geo.bg_channels; idx++) {
+			if (ieee->geo.bg[idx].channel == channel)
+				return idx;
+		}
+	}
+
+	if (ieee->freq_band & IEEE80211_52GHZ_BAND) {
+		for (idx = 0; idx < ieee->geo.a_channels; idx++) {
+			if (ieee->geo.a[idx].channel == channel)
+				return idx;
+		}
+	}
+
+	return -1;
+}
+
+/*
+ * Map a frequency to a channel number using the geography map of @ieee.
+ * @freq is divided by 100000 before it is compared with the freq field
+ * of the table entries.
+ *
+ * Returns the channel number of the first matching entry in an enabled
+ * band, or 0 if there is none. The geography map must have been set up
+ * beforehand (enforced with BUG_ON).
+ */
+u8 ieee80211_freq_to_channel(struct ieee80211_device * ieee, u32 freq)
+{
+	u32 scaled;
+	int idx;
+
+	/* Driver needs to initialize the geography map before using
+	 * these helper functions */
+	BUG_ON(ieee->geo.bg_channels == 0 && ieee->geo.a_channels == 0);
+
+	scaled = freq / 100000;
+
+	if (ieee->freq_band & IEEE80211_24GHZ_BAND) {
+		for (idx = 0; idx < ieee->geo.bg_channels; idx++) {
+			if (ieee->geo.bg[idx].freq == scaled)
+				return ieee->geo.bg[idx].channel;
+		}
+	}
+
+	if (ieee->freq_band & IEEE80211_52GHZ_BAND) {
+		for (idx = 0; idx < ieee->geo.a_channels; idx++) {
+			if (ieee->geo.a[idx].freq == scaled)
+				return ieee->geo.a[idx].channel;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Copy the geography description @geo into @ieee: the first three name
+ * bytes (NUL-terminated in the copy), both channel counts, and that many
+ * entries of each channel table.
+ *
+ * Always returns 0.
+ */
+int ieee80211_set_geo(struct ieee80211_device *ieee,
+		      const struct ieee80211_geo *geo)
+{
+	struct ieee80211_geo *dst = &ieee->geo;
+
+	memcpy(dst->name, geo->name, 3);
+	dst->name[3] = '\0';
+
+	dst->bg_channels = geo->bg_channels;
+	dst->a_channels = geo->a_channels;
+
+	memcpy(dst->bg, geo->bg,
+	       geo->bg_channels * sizeof(struct ieee80211_channel));
+	memcpy(dst->a, geo->a,
+	       dst->a_channels * sizeof(struct ieee80211_channel));
+
+	return 0;
+}
+
+/* Return a read-only pointer to the geography map stored in @ieee. */
+const struct ieee80211_geo *ieee80211_get_geo(struct ieee80211_device *ieee)
+{
+	const struct ieee80211_geo *geo = &ieee->geo;
+
+	return geo;
+}
+
+/* Geography helpers exported for use by other modules. */
+EXPORT_SYMBOL(ieee80211_is_valid_channel);
+EXPORT_SYMBOL(ieee80211_freq_to_channel);
+EXPORT_SYMBOL(ieee80211_channel_to_index);
+EXPORT_SYMBOL(ieee80211_set_geo);
+EXPORT_SYMBOL(ieee80211_get_geo);
diff --git a/net/ieee80211/ieee80211_module.c b/net/ieee80211/ieee80211_module.c
new file mode 100644
index 00000000000..f66d792cd20
--- /dev/null
+++ b/net/ieee80211/ieee80211_module.c
@@ -0,0 +1,304 @@
+/*******************************************************************************
+
+ Copyright(c) 2004-2005 Intel Corporation. All rights reserved.
+
+ Portions of this file are based on the WEP enablement code provided by the
+ Host AP project hostap-drivers v0.1.3
+ Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
+ <jkmaline@cc.hut.fi>
+ Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program; if not, write to the Free Software Foundation, Inc., 59
+ Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+ The full GNU General Public License is included in this distribution in the
+ file called LICENSE.
+
+ Contact Information:
+ James P. Ketrenos <ipw2100-admin@linux.intel.com>
+ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
+*******************************************************************************/
+
+#include <linux/compiler.h>
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/if_arp.h>
+#include <linux/in6.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/wireless.h>
+#include <linux/etherdevice.h>
+#include <asm/uaccess.h>
+#include <net/arp.h>
+
+#include <net/ieee80211.h>
+
+#define DRV_DESCRIPTION "802.11 data/management/control stack"
+#define DRV_NAME "ieee80211"
+#define DRV_VERSION IEEE80211_VERSION
+#define DRV_COPYRIGHT "Copyright (C) 2004-2005 Intel Corporation <jketreno@linux.intel.com>"
+
+MODULE_VERSION(DRV_VERSION);
+MODULE_DESCRIPTION(DRV_DESCRIPTION);
+MODULE_AUTHOR(DRV_COPYRIGHT);
+MODULE_LICENSE("GPL");
+
+/*
+ * Allocate the zero-initialised array of MAX_NETWORK_COUNT network entries
+ * for @ieee, unless one is already attached.
+ *
+ * Returns 0 on success (or if the array already exists) and -ENOMEM if the
+ * allocation fails.
+ */
+static inline int ieee80211_networks_allocate(struct ieee80211_device *ieee)
+{
+	if (ieee->networks)
+		return 0;
+
+	/* kcalloc() zeroes the memory and guards the count * size
+	 * multiplication against overflow */
+	ieee->networks = kcalloc(MAX_NETWORK_COUNT,
+				 sizeof(struct ieee80211_network),
+				 GFP_KERNEL);
+	if (!ieee->networks) {
+		printk(KERN_WARNING "%s: Out of memory allocating beacons\n",
+		       ieee->dev->name);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Release the network array of @ieee and clear the pointer. */
+static inline void ieee80211_networks_free(struct ieee80211_device *ieee)
+{
+	/* kfree() ignores a NULL pointer, so no guard is required */
+	kfree(ieee->networks);
+	ieee->networks = NULL;
+}
+
+/*
+ * Reset both network lists of @ieee and queue every entry of the network
+ * array, in array order, on the free list.
+ */
+static inline void ieee80211_networks_initialize(struct ieee80211_device *ieee)
+{
+	struct ieee80211_network *net = ieee->networks;
+	struct ieee80211_network *end = net + MAX_NETWORK_COUNT;
+
+	INIT_LIST_HEAD(&ieee->network_free_list);
+	INIT_LIST_HEAD(&ieee->network_list);
+
+	for (; net < end; net++)
+		list_add_tail(&net->list, &ieee->network_free_list);
+}
+
+/*
+ * Allocate an Ethernet net_device whose private area holds a
+ * struct ieee80211_device followed by @sizeof_priv bytes for the caller,
+ * allocate the network array and set the stack defaults.
+ *
+ * Returns the new net_device, or NULL if either allocation fails.
+ */
+struct net_device *alloc_ieee80211(int sizeof_priv)
+{
+	struct ieee80211_device *ieee;
+	struct net_device *dev;
+	int err;
+
+	IEEE80211_DEBUG_INFO("Initializing...\n");
+
+	dev = alloc_etherdev(sizeof(struct ieee80211_device) + sizeof_priv);
+	if (!dev) {
+		IEEE80211_ERROR("Unable to allocate network device.\n");
+		return NULL;
+	}
+	ieee = netdev_priv(dev);
+	dev->hard_start_xmit = ieee80211_xmit;
+
+	ieee->dev = dev;
+
+	err = ieee80211_networks_allocate(ieee);
+	if (err) {
+		IEEE80211_ERROR("Unable to allocate beacon storage: %d\n", err);
+		goto failed;
+	}
+	ieee80211_networks_initialize(ieee);
+
+	/* Default fragmentation threshold is maximum payload size */
+	ieee->fts = DEFAULT_FTS;
+	ieee->rts = DEFAULT_FTS;
+	ieee->scan_age = DEFAULT_MAX_SCAN_AGE;
+	ieee->open_wep = 1;
+
+	/* Default to enabling full open WEP with host based encrypt/decrypt */
+	ieee->host_encrypt = 1;
+	ieee->host_decrypt = 1;
+	ieee->host_mc_decrypt = 1;
+
+	/* Host fragmentation in Open mode. Default is enabled.
+	 * Note: host fragmentation is always enabled if host encryption
+	 * is enabled. Cards that can do hardware encryption must do
+	 * hardware fragmentation as well, so we don't need a variable
+	 * like host_enc_frag. */
+	ieee->host_open_frag = 1;
+	ieee->ieee802_1x = 1;	/* Default to supporting 802.1x */
+
+	/* Deferred crypt teardown: list plus the timer that drains it */
+	INIT_LIST_HEAD(&ieee->crypt_deinit_list);
+	init_timer(&ieee->crypt_deinit_timer);
+	ieee->crypt_deinit_timer.data = (unsigned long)ieee;
+	ieee->crypt_deinit_timer.function = ieee80211_crypt_deinit_handler;
+	ieee->crypt_quiesced = 0;
+
+	spin_lock_init(&ieee->lock);
+
+	ieee->wpa_enabled = 0;
+	ieee->drop_unencrypted = 0;
+	ieee->privacy_invoked = 0;
+
+	return dev;
+
+      failed:
+	/* dev is always valid here; the only earlier failure returns directly */
+	free_netdev(dev);
+	return NULL;
+}
+
+/*
+ * Tear down a device obtained from alloc_ieee80211(): quiesce and flush the
+ * deferred crypt teardown, release every installed key slot, free the
+ * network array and finally the net_device itself.
+ */
+void free_ieee80211(struct net_device *dev)
+{
+	struct ieee80211_device *ieee = netdev_priv(dev);
+	int slot;
+
+	ieee80211_crypt_quiescing(ieee);
+	del_timer_sync(&ieee->crypt_deinit_timer);
+	ieee80211_crypt_deinit_entries(ieee, 1);
+
+	for (slot = 0; slot < WEP_KEYS; slot++) {
+		struct ieee80211_crypt_data *entry = ieee->crypt[slot];
+
+		if (!entry)
+			continue;
+
+		if (entry->ops) {
+			/* let the algorithm free its state, then drop the
+			 * reference on its module */
+			entry->ops->deinit(entry->priv);
+			module_put(entry->ops->owner);
+		}
+		kfree(entry);
+		ieee->crypt[slot] = NULL;
+	}
+
+	ieee80211_networks_free(ieee);
+	free_netdev(dev);
+}
+
+#ifdef CONFIG_IEEE80211_DEBUG
+
+/* Initial debug mask (module parameter); copied into ieee80211_debug_level
+ * by ieee80211_init() */
+static int debug = 0;
+/* Current debug mask; read and written through the "debug_level" proc entry */
+u32 ieee80211_debug_level = 0;
+/* proc directory created in ieee80211_init() and removed in ieee80211_exit() */
+struct proc_dir_entry *ieee80211_proc = NULL;
+
+/*
+ * proc read handler: format the current debug mask as "0x%08X\n" into @page,
+ * limited to @count bytes, and return what snprintf() reports.
+ */
+static int show_debug_level(char *page, char **start, off_t offset,
+			    int count, int *eof, void *data)
+{
+	int written;
+
+	written = snprintf(page, count, "0x%08X\n", ieee80211_debug_level);
+
+	return written;
+}
+
+/*
+ * proc write handler: parse a hex or decimal number from user space and
+ * store it as the new debug mask.
+ *
+ * At most sizeof("0x00000000\n") - 1 bytes of @buffer are looked at.
+ * Returns -EFAULT if the user buffer cannot be read; otherwise the length
+ * of the text that was examined (input that does not parse is logged and
+ * the mask is left unchanged).
+ */
+static int store_debug_level(struct file *file, const char __user * buffer,
+			     unsigned long count, void *data)
+{
+	char buf[] = "0x00000000\n";
+	unsigned long len = min((unsigned long)sizeof(buf) - 1, count);
+	long val;		/* "%li" stores through a long pointer */
+
+	/* report the fault rather than pretending the write succeeded */
+	if (copy_from_user(buf, buffer, len))
+		return -EFAULT;
+	buf[len] = 0;
+	if (sscanf(buf, "%li", &val) != 1)
+		printk(KERN_INFO DRV_NAME
+		       ": %s is not in hex or decimal form.\n", buf);
+	else
+		ieee80211_debug_level = val;
+
+	return strnlen(buf, len);
+}
+#endif /* CONFIG_IEEE80211_DEBUG */
+
+/*
+ * Module entry point.  With CONFIG_IEEE80211_DEBUG it seeds the debug mask
+ * from the "debug" parameter and creates the proc directory with its
+ * "debug_level" entry, returning -EIO if either cannot be created.  It then
+ * prints the module banner and returns 0.
+ */
+static int __init ieee80211_init(void)
+{
+#ifdef CONFIG_IEEE80211_DEBUG
+	struct proc_dir_entry *entry;
+
+	ieee80211_debug_level = debug;
+
+	ieee80211_proc = proc_mkdir(DRV_NAME, proc_net);
+	if (!ieee80211_proc) {
+		IEEE80211_ERROR("Unable to create " DRV_NAME
+				" proc directory\n");
+		return -EIO;
+	}
+
+	entry = create_proc_entry("debug_level", S_IFREG | S_IRUGO | S_IWUSR,
+				  ieee80211_proc);
+	if (entry == NULL) {
+		/* undo the directory creation before bailing out */
+		remove_proc_entry(DRV_NAME, proc_net);
+		ieee80211_proc = NULL;
+		return -EIO;
+	}
+	entry->data = NULL;
+	entry->read_proc = show_debug_level;
+	entry->write_proc = store_debug_level;
+#endif				/* CONFIG_IEEE80211_DEBUG */
+
+	printk(KERN_INFO DRV_NAME ": " DRV_DESCRIPTION ", " DRV_VERSION "\n");
+	printk(KERN_INFO DRV_NAME ": " DRV_COPYRIGHT "\n");
+
+	return 0;
+}
+
+/*
+ * Module exit point.  With CONFIG_IEEE80211_DEBUG it removes the
+ * "debug_level" entry and the proc directory if the directory exists.
+ */
+static void __exit ieee80211_exit(void)
+{
+#ifdef CONFIG_IEEE80211_DEBUG
+	if (ieee80211_proc == NULL)
+		return;
+
+	remove_proc_entry("debug_level", ieee80211_proc);
+	remove_proc_entry(DRV_NAME, proc_net);
+	ieee80211_proc = NULL;
+#endif				/* CONFIG_IEEE80211_DEBUG */
+}
+
+#ifdef CONFIG_IEEE80211_DEBUG
+#include <linux/moduleparam.h>
+module_param(debug, int, 0444);
+MODULE_PARM_DESC(debug, "debug output mask");
+#endif /* CONFIG_IEEE80211_DEBUG */
+
+module_exit(ieee80211_exit);
+module_init(ieee80211_init);
+
+/*
+ * Produce a printable copy of an ESSID in a static buffer.
+ *
+ * An ESSID that ieee80211_is_empty_essid() reports as empty is rendered as
+ * "<hidden>".  Otherwise at most IW_ESSID_MAX_SIZE bytes are copied, with
+ * every NUL byte replaced by the two characters '\' and '0'.  The result is
+ * NUL-terminated.  The same static buffer is returned on every call, so the
+ * text is overwritten by the next call.
+ */
+const char *escape_essid(const char *essid, u8 essid_len)
+{
+	static char escaped[IW_ESSID_MAX_SIZE * 2 + 1];
+	char *out = escaped;
+	u8 limit, pos;
+
+	if (ieee80211_is_empty_essid(essid, essid_len)) {
+		memcpy(escaped, "<hidden>", sizeof("<hidden>"));
+		return escaped;
+	}
+
+	limit = min(essid_len, (u8) IW_ESSID_MAX_SIZE);
+	for (pos = 0; pos < limit; pos++) {
+		if (essid[pos] != '\0') {
+			*out++ = essid[pos];
+			continue;
+		}
+		/* embedded NUL: emit a visible escape instead */
+		*out++ = '\\';
+		*out++ = '0';
+	}
+	*out = '\0';
+
+	return escaped;
+}
+
+EXPORT_SYMBOL(alloc_ieee80211);
+EXPORT_SYMBOL(free_ieee80211);
+EXPORT_SYMBOL(escape_essid);
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c
new file mode 100644
index 00000000000..ce694cf5c16
--- /dev/null
+++ b/net/ieee80211/ieee80211_rx.c
@@ -0,0 +1,1511 @@
+/*
+ * Original code based Host AP (software wireless LAN access point) driver
+ * for Intersil Prism2/2.5/3 - hostap.o module, common routines
+ *
+ * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
+ * <jkmaline@cc.hut.fi>
+ * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright (c) 2004-2005, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ */
+
+#include <linux/compiler.h>
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/if_arp.h>
+#include <linux/in6.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/wireless.h>
+#include <linux/etherdevice.h>
+#include <asm/uaccess.h>
+#include <linux/ctype.h>
+
+#include <net/ieee80211.h>
+
+/*
+ * Deliver a raw 802.11 frame to the stack for monitor mode: record the
+ * start of the 802.11 header as the MAC header, pull the header off, tag
+ * the skb as ETH_P_80211_RAW / PACKET_OTHERHOST and pass it to netif_rx().
+ */
+static inline void ieee80211_monitor_rx(struct ieee80211_device *ieee,
+					struct sk_buff *skb,
+					struct ieee80211_rx_stats *rx_stats)
+{
+	struct ieee80211_hdr *hdr80211 = (struct ieee80211_hdr *)skb->data;
+	u16 frame_ctl = le16_to_cpu(hdr80211->frame_ctl);
+
+	/* remember where the 802.11 header starts, then strip it */
+	skb->mac.raw = skb->data;
+	skb_pull(skb, ieee80211_get_hdrlen(frame_ctl));
+
+	skb->dev = ieee->dev;
+	skb->protocol = __constant_htons(ETH_P_80211_RAW);
+	skb->pkt_type = PACKET_OTHERHOST;
+	memset(skb->cb, 0, sizeof(skb->cb));
+
+	netif_rx(skb);
+}
+
+/*
+ * Look up the fragment cache entry for (@seq, @src, @dst).
+ *
+ * While scanning, entries older than two seconds are expired and their skb
+ * freed.  An entry matches when its sequence number and both addresses are
+ * equal and either @frag is the next expected fragment number or @frag is
+ * -1 (match any fragment number).  Returns the entry, or NULL if none.
+ *
+ * Called only as a tasklet (software IRQ)
+ */
+static struct ieee80211_frag_entry *ieee80211_frag_cache_find(struct
+							       ieee80211_device
+							       *ieee,
+							       unsigned int seq,
+							       unsigned int frag,
+							       u8 * src,
+							       u8 * dst)
+{
+	int slot;
+
+	for (slot = 0; slot < IEEE80211_FRAG_CACHE_LEN; slot++) {
+		struct ieee80211_frag_entry *cur = &ieee->frag_cache[slot];
+
+		if (cur->skb == NULL)
+			continue;
+
+		if (time_after(jiffies, cur->first_frag_time + 2 * HZ)) {
+			IEEE80211_DEBUG_FRAG("expiring fragment cache entry "
+					     "seq=%u last_frag=%u\n",
+					     cur->seq, cur->last_frag);
+			dev_kfree_skb_any(cur->skb);
+			cur->skb = NULL;
+			continue;
+		}
+
+		if (cur->seq != seq)
+			continue;
+		if (cur->last_frag + 1 != frag && frag != -1)
+			continue;
+		if (memcmp(cur->src_addr, src, ETH_ALEN) != 0)
+			continue;
+		if (memcmp(cur->dst_addr, dst, ETH_ALEN) != 0)
+			continue;
+
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the reassembly skb for the fragment described by @hdr.
+ *
+ * For fragment 0 a new skb is allocated and installed in the next cache
+ * slot (round-robin), replacing and freeing whatever was stored there.
+ * For later fragments the matching cache entry is looked up and its
+ * last_frag updated.  Returns NULL if allocation fails or no entry matches.
+ *
+ * Called only as a tasklet (software IRQ)
+ */
+static struct sk_buff *ieee80211_frag_cache_get(struct ieee80211_device *ieee,
+						struct ieee80211_hdr_4addr *hdr)
+{
+	struct ieee80211_frag_entry *slot;
+	struct sk_buff *reasm;
+	unsigned int frag_no, seq_no;
+	u16 seq_ctl;
+
+	seq_ctl = le16_to_cpu(hdr->seq_ctl);
+	frag_no = WLAN_GET_SEQ_FRAG(seq_ctl);
+	seq_no = WLAN_GET_SEQ_SEQ(seq_ctl);
+
+	if (frag_no != 0) {
+		/* a later fragment: the head fragment should already be
+		 * in the cache */
+		slot = ieee80211_frag_cache_find(ieee, seq_no, frag_no,
+						 hdr->addr2, hdr->addr1);
+		if (slot == NULL)
+			return NULL;
+
+		slot->last_frag = frag_no;
+		return slot->skb;
+	}
+
+	/* Reserve enough space to fit maximum frame length */
+	reasm = dev_alloc_skb(ieee->dev->mtu +
+			      sizeof(struct ieee80211_hdr_4addr) +
+			      8 /* LLC */  +
+			      2 /* alignment */  +
+			      8 /* WEP */  + ETH_ALEN /* WDS */ );
+	if (reasm == NULL)
+		return NULL;
+
+	slot = &ieee->frag_cache[ieee->frag_next_idx];
+	ieee->frag_next_idx++;
+	if (ieee->frag_next_idx >= IEEE80211_FRAG_CACHE_LEN)
+		ieee->frag_next_idx = 0;
+
+	/* evict whatever was still waiting in this slot */
+	if (slot->skb != NULL)
+		dev_kfree_skb_any(slot->skb);
+
+	slot->first_frag_time = jiffies;
+	slot->seq = seq_no;
+	slot->last_frag = frag_no;
+	slot->skb = reasm;
+	memcpy(slot->src_addr, hdr->addr2, ETH_ALEN);
+	memcpy(slot->dst_addr, hdr->addr1, ETH_ALEN);
+
+	return reasm;
+}
+
+/*
+ * Detach the skb from the fragment cache entry that matches the sequence
+ * number and addresses in @hdr (any fragment number).  The skb itself is
+ * not freed.  Returns 0 on success, -1 if no entry matches.
+ *
+ * Called only as a tasklet (software IRQ)
+ */
+static int ieee80211_frag_cache_invalidate(struct ieee80211_device *ieee,
+					   struct ieee80211_hdr_4addr *hdr)
+{
+	unsigned int seq_no = WLAN_GET_SEQ_SEQ(le16_to_cpu(hdr->seq_ctl));
+	struct ieee80211_frag_entry *slot;
+
+	slot = ieee80211_frag_cache_find(ieee, seq_no, -1, hdr->addr2,
+					 hdr->addr1);
+	if (slot != NULL) {
+		slot->skb = NULL;
+		return 0;
+	}
+
+	IEEE80211_DEBUG_FRAG("could not invalidate fragment cache "
+			     "entry (seq=%u)\n", seq_no);
+	return -1;
+}
+
+#ifdef NOT_YET
+/* ieee80211_rx_frame_mgmt
+ *
+ * Responsible for handling management control frames
+ *
+ * Returns 0 when the frame was handled and -1 when it should be dropped.
+ * This code is only compiled when NOT_YET is defined.
+ *
+ * Called by ieee80211_rx */
+static inline int
+ieee80211_rx_frame_mgmt(struct ieee80211_device *ieee, struct sk_buff *skb,
+ struct ieee80211_rx_stats *rx_stats, u16 type,
+ u16 stype)
+{
+ /* Master mode bails out here, so the IW_MODE_MASTER checks further
+ * down cannot currently be reached */
+ if (ieee->iw_mode == IW_MODE_MASTER) {
+ printk(KERN_DEBUG "%s: Master mode not yet suppported.\n",
+ ieee->dev->name);
+ return 0;
+/*
+ hostap_update_sta_ps(ieee, (struct hostap_ieee80211_hdr_4addr *)
+ skb->data);*/
+ }
+
+ if (ieee->hostapd && type == WLAN_FC_TYPE_MGMT) {
+ if (stype == WLAN_FC_STYPE_BEACON &&
+ ieee->iw_mode == IW_MODE_MASTER) {
+ struct sk_buff *skb2;
+ /* Process beacon frames also in kernel driver to
+ * update STA(AP) table statistics */
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ hostap_rx(skb2->dev, skb2, rx_stats);
+ }
+
+ /* send management frames to the user space daemon for
+ * processing */
+ ieee->apdevstats.rx_packets++;
+ ieee->apdevstats.rx_bytes += skb->len;
+ prism2_rx_80211(ieee->apdev, skb, rx_stats, PRISM2_RX_MGMT);
+ return 0;
+ }
+
+ if (ieee->iw_mode == IW_MODE_MASTER) {
+ if (type != WLAN_FC_TYPE_MGMT && type != WLAN_FC_TYPE_CTRL) {
+ printk(KERN_DEBUG "%s: unknown management frame "
+ "(type=0x%02x, stype=0x%02x) dropped\n",
+ skb->dev->name, type, stype);
+ return -1;
+ }
+
+ hostap_rx(skb->dev, skb, rx_stats);
+ return 0;
+ }
+
+ /* Neither the hostapd path nor Master mode claimed the frame */
+ printk(KERN_DEBUG "%s: hostap_rx_frame_mgmt: management frame "
+ "received in non-Host AP mode\n", skb->dev->name);
+ return -1;
+}
+#endif
+
+/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
+/* Ethernet-II snap header (RFC1042 for most EtherTypes) */
+static unsigned char rfc1042_header[] = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 };
+
+/* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */
+static unsigned char bridge_tunnel_header[] =
+ { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 };
+/* No encapsulation header if EtherType < 0x600 (=length) */
+
+/*
+ * Decide whether @skb is an EAPOL (ETH_P_PAE) data frame addressed to us.
+ *
+ * Accepts only ToDS frames whose addr1 and addr3 are both our address, or
+ * FromDS frames whose addr1 is our address, and then checks the EtherType
+ * at bytes 6-7 after the 24-byte header.  Returns 1 for EAPOL, 0 otherwise.
+ *
+ * Called by ieee80211_rx
+ */
+static int ieee80211_is_eapol_frame(struct ieee80211_device *ieee,
+				    struct sk_buff *skb)
+{
+	struct net_device *dev = ieee->dev;
+	struct ieee80211_hdr_3addr *hdr;
+	u16 ds_bits, ethertype;
+	u8 *llc;
+
+	if (skb->len < 24)
+		return 0;
+
+	hdr = (struct ieee80211_hdr_3addr *)skb->data;
+	ds_bits = le16_to_cpu(hdr->frame_ctl) &
+	    (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS);
+
+	/* check that the frame is unicast frame to us */
+	switch (ds_bits) {
+	case IEEE80211_FCTL_TODS:
+		/* ToDS frame: need own addr as both BSSID and DA */
+		if (memcmp(hdr->addr1, dev->dev_addr, ETH_ALEN) != 0 ||
+		    memcmp(hdr->addr3, dev->dev_addr, ETH_ALEN) != 0)
+			return 0;
+		break;
+	case IEEE80211_FCTL_FROMDS:
+		/* FromDS frame: need own addr as DA */
+		if (memcmp(hdr->addr1, dev->dev_addr, ETH_ALEN) != 0)
+			return 0;
+		break;
+	default:
+		return 0;
+	}
+
+	if (skb->len < 24 + 8)
+		return 0;
+
+	/* check for port access entity Ethernet type */
+	llc = skb->data + 24;
+	ethertype = (llc[6] << 8) | llc[7];
+
+	return ethertype == ETH_P_PAE;
+}
+
+/*
+ * Run the per-MPDU decrypt hook of @crypt on @skb.
+ *
+ * Returns 0 if there is no crypt context or no decrypt_mpdu hook, the
+ * hook's non-negative result on success, and -1 on failure (after counting
+ * the frame in rx_discards_undecryptable).
+ *
+ * Called only as a tasklet (software IRQ), by ieee80211_rx
+ */
+static inline int
+ieee80211_rx_frame_decrypt(struct ieee80211_device *ieee, struct sk_buff *skb,
+			   struct ieee80211_crypt_data *crypt)
+{
+	struct ieee80211_hdr_3addr *hdr;
+	int hdrlen, status;
+
+	if (crypt == NULL || crypt->ops->decrypt_mpdu == NULL)
+		return 0;
+
+	hdr = (struct ieee80211_hdr_3addr *)skb->data;
+	hdrlen = ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_ctl));
+
+	/* hold a reference on the crypt context across the hook call */
+	atomic_inc(&crypt->refcnt);
+	status = crypt->ops->decrypt_mpdu(skb, hdrlen, crypt->priv);
+	atomic_dec(&crypt->refcnt);
+
+	if (status >= 0)
+		return status;
+
+	IEEE80211_DEBUG_DROP("decryption failed (SA=" MAC_FMT
+			     ") res=%d\n", MAC_ARG(hdr->addr2), status);
+	if (status == -2)
+		IEEE80211_DEBUG_DROP("Decryption failed ICV "
+				     "mismatch (key %d)\n",
+				     skb->data[hdrlen + 3] >> 6);
+	ieee->ieee_stats.rx_discards_undecryptable++;
+
+	return -1;
+}
+
+/*
+ * Run the per-MSDU decrypt/verify hook of @crypt on @skb.
+ *
+ * Returns 0 if there is no crypt context, no decrypt_msdu hook, or the hook
+ * succeeded; returns -1 (after logging) if the hook reports a failure.
+ *
+ * Called only as a tasklet (software IRQ), by ieee80211_rx
+ */
+static inline int
+ieee80211_rx_frame_decrypt_msdu(struct ieee80211_device *ieee,
+				struct sk_buff *skb, int keyidx,
+				struct ieee80211_crypt_data *crypt)
+{
+	struct ieee80211_hdr_3addr *hdr;
+	int hdrlen, status;
+
+	if (crypt == NULL || crypt->ops->decrypt_msdu == NULL)
+		return 0;
+
+	hdr = (struct ieee80211_hdr_3addr *)skb->data;
+	hdrlen = ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_ctl));
+
+	/* hold a reference on the crypt context across the hook call */
+	atomic_inc(&crypt->refcnt);
+	status = crypt->ops->decrypt_msdu(skb, keyidx, hdrlen, crypt->priv);
+	atomic_dec(&crypt->refcnt);
+
+	if (status >= 0)
+		return 0;
+
+	printk(KERN_DEBUG "%s: MSDU decryption/MIC verification failed"
+	       " (SA=" MAC_FMT " keyidx=%d)\n",
+	       ieee->dev->name, MAC_ARG(hdr->addr2), keyidx);
+
+	return -1;
+}
+
+/* All received frames are sent to this function. @skb contains the frame in
+ * IEEE 802.11 format, i.e., in the format it was sent over air.
+ * This function is called only as a tasklet (software IRQ).
+ *
+ * @ieee: device the frame arrived on
+ * @skb: received frame, starting with the 802.11 header
+ * @rx_stats: receive statistics (RSSI/noise/signal) for this frame
+ *
+ * Returns 1 when the frame has been taken over (passed to the stack or
+ * absorbed into the fragment cache) and 0 when it was dropped; see the
+ * note at rx_dropped about skb ownership in the 0 case. */
+int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
+ struct ieee80211_rx_stats *rx_stats)
+{
+ struct net_device *dev = ieee->dev;
+ struct ieee80211_hdr_4addr *hdr;
+ size_t hdrlen;
+ u16 fc, type, stype, sc;
+ struct net_device_stats *stats;
+ unsigned int frag;
+ u8 *payload;
+ u16 ethertype;
+#ifdef NOT_YET
+ struct net_device *wds = NULL;
+ struct sk_buff *skb2 = NULL;
+ int frame_authorized = 0;
+ int from_assoc_ap = 0;
+ void *sta = NULL;
+#endif
+ u8 dst[ETH_ALEN];
+ u8 src[ETH_ALEN];
+ struct ieee80211_crypt_data *crypt = NULL;
+ int keyidx = 0;
+
+ hdr = (struct ieee80211_hdr_4addr *)skb->data;
+ stats = &ieee->stats;
+
+ if (skb->len < 10) {
+ printk(KERN_INFO "%s: SKB length < 10\n", dev->name);
+ goto rx_dropped;
+ }
+
+ fc = le16_to_cpu(hdr->frame_ctl);
+ type = WLAN_FC_GET_TYPE(fc);
+ stype = WLAN_FC_GET_STYPE(fc);
+ sc = le16_to_cpu(hdr->seq_ctl);
+ frag = WLAN_GET_SEQ_FRAG(sc);
+ hdrlen = ieee80211_get_hdrlen(fc);
+
+ /* Put this code here so that we avoid duplicating it in all
+ * Rx paths. - Jean II */
+#ifdef IW_WIRELESS_SPY /* defined in iw_handler.h */
+ /* If spy monitoring on */
+ if (ieee->spy_data.spy_number > 0) {
+ struct iw_quality wstats;
+
+ wstats.updated = 0;
+ if (rx_stats->mask & IEEE80211_STATMASK_RSSI) {
+ wstats.level = rx_stats->rssi;
+ wstats.updated |= IW_QUAL_LEVEL_UPDATED;
+ } else
+ wstats.updated |= IW_QUAL_LEVEL_INVALID;
+
+ if (rx_stats->mask & IEEE80211_STATMASK_NOISE) {
+ wstats.noise = rx_stats->noise;
+ wstats.updated |= IW_QUAL_NOISE_UPDATED;
+ } else
+ wstats.updated |= IW_QUAL_NOISE_INVALID;
+
+ if (rx_stats->mask & IEEE80211_STATMASK_SIGNAL) {
+ wstats.qual = rx_stats->signal;
+ wstats.updated |= IW_QUAL_QUAL_UPDATED;
+ } else
+ wstats.updated |= IW_QUAL_QUAL_INVALID;
+
+ /* Update spy records */
+ wireless_spy_update(ieee->dev, hdr->addr2, &wstats);
+ }
+#endif /* IW_WIRELESS_SPY */
+
+#ifdef NOT_YET
+ hostap_update_rx_stats(local->ap, hdr, rx_stats);
+#endif
+
+ if (ieee->iw_mode == IW_MODE_MONITOR) {
+ /* Account for the frame before handing it off:
+ * ieee80211_monitor_rx() passes the skb to netif_rx(), after
+ * which it must not be touched any more. */
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len;
+ ieee80211_monitor_rx(ieee, skb, rx_stats);
+ return 1;
+ }
+
+ if ((is_multicast_ether_addr(hdr->addr1) ||
+ is_broadcast_ether_addr(hdr->addr2)) ? ieee->host_mc_decrypt :
+ ieee->host_decrypt) {
+ int idx = 0;
+ if (skb->len >= hdrlen + 3)
+ idx = skb->data[hdrlen + 3] >> 6;
+ crypt = ieee->crypt[idx];
+#ifdef NOT_YET
+ sta = NULL;
+
+ /* Use station specific key to override default keys if the
+ * receiver address is a unicast address ("individual RA"). If
+ * bcrx_sta_key parameter is set, station specific key is used
+ * even with broad/multicast targets (this is against IEEE
+ * 802.11, but makes it easier to use different keys with
+ * stations that do not support WEP key mapping). */
+
+ if (!(hdr->addr1[0] & 0x01) || local->bcrx_sta_key)
+ (void)hostap_handle_sta_crypto(local, hdr, &crypt,
+ &sta);
+#endif
+
+ /* allow NULL decrypt to indicate an station specific override
+ * for default encryption */
+ if (crypt && (crypt->ops == NULL ||
+ crypt->ops->decrypt_mpdu == NULL))
+ crypt = NULL;
+
+ if (!crypt && (fc & IEEE80211_FCTL_PROTECTED)) {
+ /* This seems to be triggered by some (multicast?)
+ * frames from other than current BSS, so just drop the
+ * frames silently instead of filling system log with
+ * these reports. */
+ IEEE80211_DEBUG_DROP("Decryption failed (not set)"
+ " (SA=" MAC_FMT ")\n",
+ MAC_ARG(hdr->addr2));
+ ieee->ieee_stats.rx_discards_undecryptable++;
+ goto rx_dropped;
+ }
+ }
+#ifdef NOT_YET
+ if (type != WLAN_FC_TYPE_DATA) {
+ if (type == WLAN_FC_TYPE_MGMT && stype == WLAN_FC_STYPE_AUTH &&
+ fc & IEEE80211_FCTL_PROTECTED && ieee->host_decrypt &&
+ (keyidx = hostap_rx_frame_decrypt(ieee, skb, crypt)) < 0) {
+ printk(KERN_DEBUG "%s: failed to decrypt mgmt::auth "
+ "from " MAC_FMT "\n", dev->name,
+ MAC_ARG(hdr->addr2));
+ /* TODO: could inform hostapd about this so that it
+ * could send auth failure report */
+ goto rx_dropped;
+ }
+
+ if (ieee80211_rx_frame_mgmt(ieee, skb, rx_stats, type, stype))
+ goto rx_dropped;
+ else
+ goto rx_exit;
+ }
+#endif
+
+ /* Data frame - extract src/dst addresses */
+ if (skb->len < IEEE80211_3ADDR_LEN)
+ goto rx_dropped;
+
+ switch (fc & (IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) {
+ case IEEE80211_FCTL_FROMDS:
+ memcpy(dst, hdr->addr1, ETH_ALEN);
+ memcpy(src, hdr->addr3, ETH_ALEN);
+ break;
+ case IEEE80211_FCTL_TODS:
+ memcpy(dst, hdr->addr3, ETH_ALEN);
+ memcpy(src, hdr->addr2, ETH_ALEN);
+ break;
+ case IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS:
+ if (skb->len < IEEE80211_4ADDR_LEN)
+ goto rx_dropped;
+ memcpy(dst, hdr->addr3, ETH_ALEN);
+ memcpy(src, hdr->addr4, ETH_ALEN);
+ break;
+ case 0:
+ memcpy(dst, hdr->addr1, ETH_ALEN);
+ memcpy(src, hdr->addr2, ETH_ALEN);
+ break;
+ }
+
+#ifdef NOT_YET
+ if (hostap_rx_frame_wds(ieee, hdr, fc, &wds))
+ goto rx_dropped;
+ if (wds) {
+ skb->dev = dev = wds;
+ stats = hostap_get_stats(dev);
+ }
+
+ if (ieee->iw_mode == IW_MODE_MASTER && !wds &&
+ (fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
+ IEEE80211_FCTL_FROMDS && ieee->stadev
+ && memcmp(hdr->addr2, ieee->assoc_ap_addr, ETH_ALEN) == 0) {
+ /* Frame from BSSID of the AP for which we are a client */
+ skb->dev = dev = ieee->stadev;
+ stats = hostap_get_stats(dev);
+ from_assoc_ap = 1;
+ }
+#endif
+
+ dev->last_rx = jiffies;
+
+#ifdef NOT_YET
+ if ((ieee->iw_mode == IW_MODE_MASTER ||
+ ieee->iw_mode == IW_MODE_REPEAT) && !from_assoc_ap) {
+ switch (hostap_handle_sta_rx(ieee, dev, skb, rx_stats,
+ wds != NULL)) {
+ case AP_RX_CONTINUE_NOT_AUTHORIZED:
+ frame_authorized = 0;
+ break;
+ case AP_RX_CONTINUE:
+ frame_authorized = 1;
+ break;
+ case AP_RX_DROP:
+ goto rx_dropped;
+ case AP_RX_EXIT:
+ goto rx_exit;
+ }
+ }
+#endif
+
+ /* Nullfunc frames may have PS-bit set, so they must be passed to
+ * hostap_handle_sta_rx() before being dropped here. */
+
+ stype &= ~IEEE80211_STYPE_QOS_DATA;
+
+ if (stype != IEEE80211_STYPE_DATA &&
+ stype != IEEE80211_STYPE_DATA_CFACK &&
+ stype != IEEE80211_STYPE_DATA_CFPOLL &&
+ stype != IEEE80211_STYPE_DATA_CFACKPOLL) {
+ if (stype != IEEE80211_STYPE_NULLFUNC)
+ IEEE80211_DEBUG_DROP("RX: dropped data frame "
+ "with no data (type=0x%02x, "
+ "subtype=0x%02x, len=%d)\n",
+ type, stype, skb->len);
+ goto rx_dropped;
+ }
+
+ /* skb: hdr + (possibly fragmented, possibly encrypted) payload */
+
+ if (ieee->host_decrypt && (fc & IEEE80211_FCTL_PROTECTED) &&
+ (keyidx = ieee80211_rx_frame_decrypt(ieee, skb, crypt)) < 0)
+ goto rx_dropped;
+
+ hdr = (struct ieee80211_hdr_4addr *)skb->data;
+
+ /* skb: hdr + (possibly fragmented) plaintext payload */
+ // PR: FIXME: hostap has additional conditions in the "if" below:
+ // ieee->host_decrypt && (fc & IEEE80211_FCTL_PROTECTED) &&
+ if ((frag != 0 || (fc & IEEE80211_FCTL_MOREFRAGS))) {
+ int flen;
+ struct sk_buff *frag_skb = ieee80211_frag_cache_get(ieee, hdr);
+ IEEE80211_DEBUG_FRAG("Rx Fragment received (%u)\n", frag);
+
+ if (!frag_skb) {
+ IEEE80211_DEBUG(IEEE80211_DL_RX | IEEE80211_DL_FRAG,
+ "Rx cannot get skb from fragment "
+ "cache (morefrag=%d seq=%u frag=%u)\n",
+ (fc & IEEE80211_FCTL_MOREFRAGS) != 0,
+ WLAN_GET_SEQ_SEQ(sc), frag);
+ goto rx_dropped;
+ }
+
+ flen = skb->len;
+ if (frag != 0)
+ flen -= hdrlen;
+
+ if (frag_skb->tail + flen > frag_skb->end) {
+ printk(KERN_WARNING "%s: host decrypted and "
+ "reassembled frame did not fit skb\n",
+ dev->name);
+ ieee80211_frag_cache_invalidate(ieee, hdr);
+ goto rx_dropped;
+ }
+
+ if (frag == 0) {
+ /* copy first fragment (including full headers) into
+ * beginning of the fragment cache skb */
+ memcpy(skb_put(frag_skb, flen), skb->data, flen);
+ } else {
+ /* append frame payload to the end of the fragment
+ * cache skb */
+ memcpy(skb_put(frag_skb, flen), skb->data + hdrlen,
+ flen);
+ }
+ dev_kfree_skb_any(skb);
+ skb = NULL;
+
+ if (fc & IEEE80211_FCTL_MOREFRAGS) {
+ /* more fragments expected - leave the skb in fragment
+ * cache for now; it will be delivered to upper layers
+ * after all fragments have been received */
+ goto rx_exit;
+ }
+
+ /* this was the last fragment and the frame will be
+ * delivered, so remove skb from fragment cache */
+ skb = frag_skb;
+ hdr = (struct ieee80211_hdr_4addr *)skb->data;
+ ieee80211_frag_cache_invalidate(ieee, hdr);
+ }
+
+ /* skb: hdr + (possible reassembled) full MSDU payload; possibly still
+ * encrypted/authenticated */
+ if (ieee->host_decrypt && (fc & IEEE80211_FCTL_PROTECTED) &&
+ ieee80211_rx_frame_decrypt_msdu(ieee, skb, keyidx, crypt))
+ goto rx_dropped;
+
+ hdr = (struct ieee80211_hdr_4addr *)skb->data;
+ if (crypt && !(fc & IEEE80211_FCTL_PROTECTED) && !ieee->open_wep) {
+ if ( /*ieee->ieee802_1x && */
+ ieee80211_is_eapol_frame(ieee, skb)) {
+ /* pass unencrypted EAPOL frames even if encryption is
+ * configured */
+ } else {
+ IEEE80211_DEBUG_DROP("encryption configured, but RX "
+ "frame not encrypted (SA=" MAC_FMT
+ ")\n", MAC_ARG(hdr->addr2));
+ goto rx_dropped;
+ }
+ }
+
+ if (crypt && !(fc & IEEE80211_FCTL_PROTECTED) && !ieee->open_wep &&
+ !ieee80211_is_eapol_frame(ieee, skb)) {
+ IEEE80211_DEBUG_DROP("dropped unencrypted RX data "
+ "frame from " MAC_FMT
+ " (drop_unencrypted=1)\n",
+ MAC_ARG(hdr->addr2));
+ goto rx_dropped;
+ }
+
+ /* skb: hdr + (possible reassembled) full plaintext payload */
+
+ payload = skb->data + hdrlen;
+ ethertype = (payload[6] << 8) | payload[7];
+
+#ifdef NOT_YET
+ /* If IEEE 802.1X is used, check whether the port is authorized to send
+ * the received frame. */
+ if (ieee->ieee802_1x && ieee->iw_mode == IW_MODE_MASTER) {
+ if (ethertype == ETH_P_PAE) {
+ printk(KERN_DEBUG "%s: RX: IEEE 802.1X frame\n",
+ dev->name);
+ if (ieee->hostapd && ieee->apdev) {
+ /* Send IEEE 802.1X frames to the user
+ * space daemon for processing */
+ prism2_rx_80211(ieee->apdev, skb, rx_stats,
+ PRISM2_RX_MGMT);
+ ieee->apdevstats.rx_packets++;
+ ieee->apdevstats.rx_bytes += skb->len;
+ goto rx_exit;
+ }
+ } else if (!frame_authorized) {
+ printk(KERN_DEBUG "%s: dropped frame from "
+ "unauthorized port (IEEE 802.1X): "
+ "ethertype=0x%04x\n", dev->name, ethertype);
+ goto rx_dropped;
+ }
+ }
+#endif
+
+ /* convert hdr + possible LLC headers into Ethernet header */
+ if (skb->len - hdrlen >= 8 &&
+ ((memcmp(payload, rfc1042_header, SNAP_SIZE) == 0 &&
+ ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) ||
+ memcmp(payload, bridge_tunnel_header, SNAP_SIZE) == 0)) {
+ /* remove RFC1042 or Bridge-Tunnel encapsulation and
+ * replace EtherType */
+ skb_pull(skb, hdrlen + SNAP_SIZE);
+ memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN);
+ memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN);
+ } else {
+ u16 len;
+ /* Leave Ethernet header part of hdr and full payload */
+ skb_pull(skb, hdrlen);
+ len = htons(skb->len);
+ memcpy(skb_push(skb, 2), &len, 2);
+ memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN);
+ memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN);
+ }
+
+#ifdef NOT_YET
+ if (wds && ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
+ IEEE80211_FCTL_TODS) && skb->len >= ETH_HLEN + ETH_ALEN) {
+ /* Non-standard frame: get addr4 from its bogus location after
+ * the payload */
+ memcpy(skb->data + ETH_ALEN,
+ skb->data + skb->len - ETH_ALEN, ETH_ALEN);
+ skb_trim(skb, skb->len - ETH_ALEN);
+ }
+#endif
+
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len;
+
+#ifdef NOT_YET
+ if (ieee->iw_mode == IW_MODE_MASTER && !wds && ieee->ap->bridge_packets) {
+ if (dst[0] & 0x01) {
+ /* copy multicast frame both to the higher layers and
+ * to the wireless media */
+ ieee->ap->bridged_multicast++;
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2 == NULL)
+ printk(KERN_DEBUG "%s: skb_clone failed for "
+ "multicast frame\n", dev->name);
+ } else if (hostap_is_sta_assoc(ieee->ap, dst)) {
+ /* send frame directly to the associated STA using
+ * wireless media and not passing to higher layers */
+ ieee->ap->bridged_unicast++;
+ skb2 = skb;
+ skb = NULL;
+ }
+ }
+
+ if (skb2 != NULL) {
+ /* send to wireless media */
+ skb2->protocol = __constant_htons(ETH_P_802_3);
+ skb2->mac.raw = skb2->nh.raw = skb2->data;
+ /* skb2->nh.raw = skb2->data + ETH_HLEN; */
+ skb2->dev = dev;
+ dev_queue_xmit(skb2);
+ }
+#endif
+
+ if (skb) {
+ skb->protocol = eth_type_trans(skb, dev);
+ memset(skb->cb, 0, sizeof(skb->cb));
+ skb->dev = dev;
+ skb->ip_summed = CHECKSUM_NONE; /* 802.11 crc not sufficient */
+ netif_rx(skb);
+ }
+
+ rx_exit:
+#ifdef NOT_YET
+ if (sta)
+ hostap_handle_sta_release(sta);
+#endif
+ return 1;
+
+ rx_dropped:
+ stats->rx_dropped++;
+
+ /* Returning 0 indicates to caller that we have not handled the SKB--
+ * so it is still allocated and can be used again by underlying
+ * hardware as a DMA target */
+ return 0;
+}
+
+#define MGMT_FRAME_FIXED_PART_LENGTH 0x24
+
+static u8 qos_oui[QOS_OUI_LEN] = { 0x00, 0x50, 0xF2 };
+
+/*
+ * Make sure the structure we read from the beacon packet has
+ * the right values: expected sub-type, QoS OUI, OUI type and version.
+ * Returns 0 if all four match, -1 otherwise.
+ */
+static int ieee80211_verify_qos_info(struct ieee80211_qos_information_element
+				     *info_element, int sub_type)
+{
+	int valid = info_element->qui_subtype == sub_type &&
+	    memcmp(info_element->qui, qos_oui, QOS_OUI_LEN) == 0 &&
+	    info_element->qui_type == QOS_OUI_TYPE &&
+	    info_element->version == QOS_VERSION_1;
+
+	return valid ? 0 : -1;
+}
+
+/*
+ * Parse a QoS parameter element.
+ *
+ * Copies @info_element into @element_param when its id is QOS_ELEMENT_ID
+ * and its length equals the parameter structure size minus the two header
+ * bytes, then validates the copy.  Returns 0 on success, -1 on a NULL
+ * argument, an id/length mismatch or a failed validation.
+ */
+static int ieee80211_read_qos_param_element(struct ieee80211_qos_parameter_info
+					    *element_param, struct ieee80211_info_element
+					    *info_element)
+{
+	const u16 expected_len = sizeof(struct ieee80211_qos_parameter_info) - 2;
+
+	if (info_element == NULL || element_param == NULL)
+		return -1;
+
+	if (info_element->id != QOS_ELEMENT_ID ||
+	    info_element->len != expected_len)
+		return -1;
+
+	memcpy(element_param->info_element.qui, info_element->data,
+	       info_element->len);
+	element_param->info_element.elementID = info_element->id;
+	element_param->info_element.length = info_element->len;
+
+	return ieee80211_verify_qos_info(&element_param->info_element,
+					 QOS_OUI_PARAM_SUB_TYPE);
+}
+
+/*
+ * Parse a QoS information element.
+ *
+ * Copies @info_element into @element_info when its id is QOS_ELEMENT_ID and
+ * its length equals the information structure size minus the two header
+ * bytes, then validates the copy.  Returns 0 on success, -1 on a NULL
+ * argument, an id/length mismatch or a failed validation.
+ */
+static int ieee80211_read_qos_info_element(struct
+					   ieee80211_qos_information_element
+					   *element_info, struct ieee80211_info_element
+					   *info_element)
+{
+	const u16 expected_len =
+	    sizeof(struct ieee80211_qos_information_element) - 2;
+
+	if (element_info == NULL || info_element == NULL)
+		return -1;
+
+	if (info_element->id != QOS_ELEMENT_ID ||
+	    info_element->len != expected_len)
+		return -1;
+
+	memcpy(element_info->qui, info_element->data, info_element->len);
+	element_info->elementID = info_element->id;
+	element_info->length = info_element->len;
+
+	return ieee80211_verify_qos_info(element_info, QOS_OUI_INFO_SUB_TYPE);
+}
+
+/*
+ * Fill *qos_param from the per-AC records of a QoS parameter element.
+ *
+ * For each of the QOS_QUEUE_NUM records:
+ *   aifs        = low nibble of aci_aifsn, reduced by 2 when it is >= 2
+ *   cw_min      = 2^n - 1, n = low nibble of ecw_min_max
+ *   cw_max      = 2^n - 1, n = high nibble of ecw_min_max
+ *   flag        = 1 when bit 0x10 of aci_aifsn is set, otherwise 0
+ *   tx_op_limit = little-endian tx_op_limit * 32, truncated to 16 bits
+ *
+ * Always returns 0.
+ */
+static int ieee80211_qos_convert_ac_to_parameters(struct
+						  ieee80211_qos_parameter_info
+						  *param_elm, struct
+						  ieee80211_qos_parameters
+						  *qos_param)
+{
+	int ac;
+
+	for (ac = 0; ac < QOS_QUEUE_NUM; ac++) {
+		struct ieee80211_qos_ac_parameter *rec =
+		    &param_elm->ac_params_record[ac];
+		u8 aifsn = rec->aci_aifsn & 0x0F;
+		u8 ecw_lo = rec->ecw_min_max & 0x0F;
+		u8 ecw_hi = (rec->ecw_min_max & 0xF0) >> 4;
+		u32 txop = le16_to_cpu(rec->tx_op_limit) * 32;
+
+		if (aifsn >= 2)
+			aifsn -= 2;
+		qos_param->aifs[ac] = aifsn;
+
+		qos_param->cw_min[ac] = (u16) ((1 << ecw_lo) - 1);
+		qos_param->cw_max[ac] = (u16) ((1 << ecw_hi) - 1);
+
+		if (rec->aci_aifsn & 0x10)
+			qos_param->flag[ac] = 0x01;
+		else
+			qos_param->flag[ac] = 0x00;
+
+		qos_param->tx_op_limit[ac] = (u16) txop;
+	}
+
+	return 0;
+}
+
+/*
+ * A generic element may hold either a QoS information element or a QoS
+ * parameter element.  Try the information layout first; if that is
+ * rejected (the readers check the element length), try the parameter
+ * layout.
+ *
+ * On success the matching NETWORK_HAS_QOS_* flag is set, param_count is
+ * taken from the low nibble of ac_info, qos_data.supported is set to 1 and
+ * 0 is returned.  Otherwise the error from the parameter reader is
+ * returned and *network is left untouched.
+ */
+static int ieee80211_parse_qos_info_param_IE(struct ieee80211_info_element
+					     *info_element,
+					     struct ieee80211_network *network)
+{
+	struct ieee80211_qos_information_element info;
+
+	if (ieee80211_read_qos_info_element(&info, info_element) == 0) {
+		network->qos_data.param_count = info.ac_info & 0x0F;
+		network->flags |= NETWORK_HAS_QOS_INFORMATION;
+	} else {
+		struct ieee80211_qos_parameter_info params;
+		int err;
+
+		err = ieee80211_read_qos_param_element(&params, info_element);
+		if (err != 0)
+			return err;
+
+		ieee80211_qos_convert_ac_to_parameters(&params,
+						       &network->qos_data.
+						       parameters);
+		network->flags |= NETWORK_HAS_QOS_PARAMETERS;
+		network->qos_data.param_count =
+		    params.info_element.ac_info & 0x0F;
+	}
+
+	IEEE80211_DEBUG_QOS("QoS is supported\n");
+	network->qos_data.supported = 1;
+
+	return 0;
+}
+
+/*
+ * Walk the information elements of a management frame and record the ones
+ * we understand in *network.
+ *
+ * info_element: first element to parse
+ * length:       number of bytes available starting at info_element
+ * network:      receives SSID, rates, channel, ERP value, ATIM window,
+ *               WPA/RSN IEs and QoS data
+ *
+ * Returns 0 once fewer than sizeof(*info_element) bytes remain, or 1 as
+ * soon as an element claims more payload than is left in the buffer.
+ */
+static int ieee80211_parse_info_param(struct ieee80211_info_element
+				      *info_element, u16 length,
+				      struct ieee80211_network *network)
+{
+	u8 i;
+#ifdef CONFIG_IEEE80211_DEBUG
+	char rates_str[64];
+	char *p;
+#endif
+
+	while (length >= sizeof(*info_element)) {
+		if (sizeof(*info_element) + info_element->len > length) {
+			IEEE80211_DEBUG_MGMT("Info elem: parse failed: "
+					     "info_element->len + 2 > left : "
+					     "info_element->len+2=%zd left=%d, id=%d.\n",
+					     info_element->len +
+					     sizeof(*info_element),
+					     length, info_element->id);
+			return 1;
+		}
+
+		switch (info_element->id) {
+		case MFIE_TYPE_SSID:
+			if (ieee80211_is_empty_essid(info_element->data,
+						     info_element->len)) {
+				network->flags |= NETWORK_EMPTY_ESSID;
+				break;
+			}
+
+			network->ssid_len = min(info_element->len,
+						(u8) IW_ESSID_MAX_SIZE);
+			memcpy(network->ssid, info_element->data,
+			       network->ssid_len);
+			/* zero the unused tail of the SSID buffer */
+			if (network->ssid_len < IW_ESSID_MAX_SIZE)
+				memset(network->ssid + network->ssid_len, 0,
+				       IW_ESSID_MAX_SIZE - network->ssid_len);
+
+			IEEE80211_DEBUG_MGMT("MFIE_TYPE_SSID: '%s' len=%d.\n",
+					     network->ssid, network->ssid_len);
+			break;
+
+		case MFIE_TYPE_RATES:
+#ifdef CONFIG_IEEE80211_DEBUG
+			p = rates_str;
+			/* keep the dump printable when the list is empty */
+			rates_str[0] = '\0';
+#endif
+			network->rates_len = min(info_element->len,
+						 MAX_RATES_LENGTH);
+			for (i = 0; i < network->rates_len; i++) {
+				network->rates[i] = info_element->data[i];
+#ifdef CONFIG_IEEE80211_DEBUG
+				p += snprintf(p, sizeof(rates_str) -
+					      (p - rates_str), "%02X ",
+					      network->rates[i]);
+#endif
+				if (ieee80211_is_ofdm_rate
+				    (info_element->data[i])) {
+					network->flags |= NETWORK_HAS_OFDM;
+					/* a basic OFDM rate clears the CCK flag */
+					if (info_element->data[i] &
+					    IEEE80211_BASIC_RATE_MASK)
+						network->flags &=
+						    ~NETWORK_HAS_CCK;
+				}
+			}
+
+			IEEE80211_DEBUG_MGMT("MFIE_TYPE_RATES: '%s' (%d)\n",
+					     rates_str, network->rates_len);
+			break;
+
+		case MFIE_TYPE_RATES_EX:
+#ifdef CONFIG_IEEE80211_DEBUG
+			p = rates_str;
+			/* keep the dump printable when the list is empty */
+			rates_str[0] = '\0';
+#endif
+			network->rates_ex_len = min(info_element->len,
+						    MAX_RATES_EX_LENGTH);
+			for (i = 0; i < network->rates_ex_len; i++) {
+				network->rates_ex[i] = info_element->data[i];
+#ifdef CONFIG_IEEE80211_DEBUG
+				/* dump the extended rate just stored */
+				p += snprintf(p, sizeof(rates_str) -
+					      (p - rates_str), "%02X ",
+					      network->rates_ex[i]);
+#endif
+				if (ieee80211_is_ofdm_rate
+				    (info_element->data[i])) {
+					network->flags |= NETWORK_HAS_OFDM;
+					if (info_element->data[i] &
+					    IEEE80211_BASIC_RATE_MASK)
+						network->flags &=
+						    ~NETWORK_HAS_CCK;
+				}
+			}
+
+			IEEE80211_DEBUG_MGMT("MFIE_TYPE_RATES_EX: '%s' (%d)\n",
+					     rates_str, network->rates_ex_len);
+			break;
+
+		case MFIE_TYPE_DS_SET:
+			/* data[0] belongs to this element only if len >= 1 */
+			if (info_element->len < 1)
+				break;
+			IEEE80211_DEBUG_MGMT("MFIE_TYPE_DS_SET: %d\n",
+					     info_element->data[0]);
+			network->channel = info_element->data[0];
+			break;
+
+		case MFIE_TYPE_FH_SET:
+			IEEE80211_DEBUG_MGMT("MFIE_TYPE_FH_SET: ignored\n");
+			break;
+
+		case MFIE_TYPE_CF_SET:
+			IEEE80211_DEBUG_MGMT("MFIE_TYPE_CF_SET: ignored\n");
+			break;
+
+		case MFIE_TYPE_TIM:
+			IEEE80211_DEBUG_MGMT("MFIE_TYPE_TIM: ignored\n");
+			break;
+
+		case MFIE_TYPE_ERP_INFO:
+			/* data[0] belongs to this element only if len >= 1 */
+			if (info_element->len < 1)
+				break;
+			network->erp_value = info_element->data[0];
+			IEEE80211_DEBUG_MGMT("MFIE_TYPE_ERP_SET: %d\n",
+					     network->erp_value);
+			break;
+
+		case MFIE_TYPE_IBSS_SET:
+			/* data[0] belongs to this element only if len >= 1 */
+			if (info_element->len < 1)
+				break;
+			network->atim_window = info_element->data[0];
+			IEEE80211_DEBUG_MGMT("MFIE_TYPE_IBSS_SET: %d\n",
+					     network->atim_window);
+			break;
+
+		case MFIE_TYPE_CHALLENGE:
+			IEEE80211_DEBUG_MGMT("MFIE_TYPE_CHALLENGE: ignored\n");
+			break;
+
+		case MFIE_TYPE_GENERIC:
+			IEEE80211_DEBUG_MGMT("MFIE_TYPE_GENERIC: %d bytes\n",
+					     info_element->len);
+			/* a QoS element is fully handled by the helper */
+			if (!ieee80211_parse_qos_info_param_IE(info_element,
+							       network))
+				break;
+
+			/* otherwise keep the element if it starts with
+			 * 00:50:f2 type 1, stored whole (header included) */
+			if (info_element->len >= 4 &&
+			    info_element->data[0] == 0x00 &&
+			    info_element->data[1] == 0x50 &&
+			    info_element->data[2] == 0xf2 &&
+			    info_element->data[3] == 0x01) {
+				network->wpa_ie_len = min(info_element->len + 2,
+							  MAX_WPA_IE_LEN);
+				memcpy(network->wpa_ie, info_element,
+				       network->wpa_ie_len);
+			}
+			break;
+
+		case MFIE_TYPE_RSN:
+			IEEE80211_DEBUG_MGMT("MFIE_TYPE_RSN: %d bytes\n",
+					     info_element->len);
+			network->rsn_ie_len = min(info_element->len + 2,
+						  MAX_WPA_IE_LEN);
+			memcpy(network->rsn_ie, info_element,
+			       network->rsn_ie_len);
+			break;
+
+		case MFIE_TYPE_QOS_PARAMETER:
+			printk(KERN_ERR
+			       "QoS Error need to parse QOS_PARAMETER IE\n");
+			break;
+
+		default:
+			IEEE80211_DEBUG_MGMT("unsupported IE %d\n",
+					     info_element->id);
+			break;
+		}
+
+		/* step over this element's header and payload */
+		length -= sizeof(*info_element) + info_element->len;
+		info_element =
+		    (struct ieee80211_info_element *)&info_element->
+		    data[info_element->len];
+	}
+
+	return 0;
+}
+
+/*
+ * Build a temporary network description from an association response and
+ * pass it to the driver's handle_assoc_response callback, if one is set.
+ *
+ * Returns 1 when the information elements fail to parse, 0 otherwise.
+ *
+ * NOTE(review): the association ID is stored in atim_window and the status
+ * code in listen_interval; this looks like deliberate field reuse -- confirm
+ * against the callback implementations.
+ * NOTE(review): nothing here checks stats->len >= sizeof(*frame) before the
+ * subtraction; presumably the caller guarantees it -- confirm.
+ */
+static int ieee80211_handle_assoc_resp(struct ieee80211_device *ieee, struct ieee80211_assoc_response
+				       *frame, struct ieee80211_rx_stats *stats)
+{
+	struct ieee80211_network scratch;
+	struct ieee80211_network *net = &scratch;
+	struct net_device *ndev = ieee->dev;
+
+	/* start from a clean QoS and flag state */
+	net->flags = 0;
+	net->qos_data.active = 0;
+	net->qos_data.supported = 0;
+	net->qos_data.param_count = 0;
+	net->qos_data.old_param_count = 0;
+
+	/* fixed fields of the response */
+	net->atim_window = le16_to_cpu(frame->aid);
+	net->listen_interval = le16_to_cpu(frame->status);
+	memcpy(net->bssid, frame->header.addr3, ETH_ALEN);
+	net->capability = le16_to_cpu(frame->capability);
+	net->last_scanned = jiffies;
+	net->rates_len = net->rates_ex_len = 0;
+	net->last_associate = 0;
+	net->ssid_len = 0;
+	if (net->capability & WLAN_CAPABILITY_IBSS)
+		net->erp_value = 0x3;
+	else
+		net->erp_value = 0x0;
+
+	if (stats->freq == IEEE80211_52GHZ_BAND) {
+		/* for A band (No DS info) */
+		net->channel = stats->received_channel;
+	} else {
+		net->flags |= NETWORK_HAS_CCK;
+	}
+
+	net->wpa_ie_len = 0;
+	net->rsn_ie_len = 0;
+
+	if (ieee80211_parse_info_param
+	    (frame->info_element, stats->len - sizeof(*frame), net))
+		return 1;
+
+	/* derive the PHY mode from the band and the rate flags */
+	if (stats->freq == IEEE80211_52GHZ_BAND) {
+		net->mode = IEEE_A;
+	} else {
+		net->mode = 0;
+		if (net->flags & NETWORK_HAS_OFDM)
+			net->mode |= IEEE_G;
+		if (net->flags & NETWORK_HAS_CCK)
+			net->mode |= IEEE_B;
+	}
+
+	if (ieee80211_is_empty_essid(net->ssid, net->ssid_len))
+		net->flags |= NETWORK_EMPTY_ESSID;
+
+	memcpy(&net->stats, stats, sizeof(net->stats));
+
+	if (ieee->handle_assoc_response != NULL)
+		ieee->handle_assoc_response(ndev, frame, net);
+
+	return 0;
+}
+
+/***************************************************/
+
+/*
+ * Initialise *network from a received beacon or probe response: reset the
+ * QoS state, copy the fixed fields, parse the information elements and
+ * derive the PHY mode.
+ *
+ * Returns 0 on success.  Returns 1 when the information elements fail to
+ * parse or when no mode (A, B or G) could be derived.
+ *
+ * NOTE(review): nothing here checks stats->len >= sizeof(*beacon) before the
+ * subtraction; presumably the caller guarantees it -- confirm.
+ */
+static inline int ieee80211_network_init(struct ieee80211_device *ieee, struct ieee80211_probe_response
+					 *beacon,
+					 struct ieee80211_network *network,
+					 struct ieee80211_rx_stats *stats)
+{
+	/* QoS state starts out cleared */
+	network->qos_data.active = 0;
+	network->qos_data.supported = 0;
+	network->qos_data.param_count = 0;
+	network->qos_data.old_param_count = 0;
+
+	/* fixed fields of the frame */
+	memcpy(network->bssid, beacon->header.addr3, ETH_ALEN);
+	network->capability = le16_to_cpu(beacon->capability);
+	network->last_scanned = jiffies;
+	network->time_stamp[0] = le32_to_cpu(beacon->time_stamp[0]);
+	network->time_stamp[1] = le32_to_cpu(beacon->time_stamp[1]);
+	network->beacon_interval = le16_to_cpu(beacon->beacon_interval);
+	/* no listen interval is taken from the frame; a fixed value is used */
+	network->listen_interval = 0x0A;
+	network->rates_len = network->rates_ex_len = 0;
+	network->last_associate = 0;
+	network->ssid_len = 0;
+	network->flags = 0;
+	network->atim_window = 0;
+	if (network->capability & WLAN_CAPABILITY_IBSS)
+		network->erp_value = 0x3;
+	else
+		network->erp_value = 0x0;
+
+	if (stats->freq == IEEE80211_52GHZ_BAND) {
+		/* for A band (No DS info) */
+		network->channel = stats->received_channel;
+	} else {
+		network->flags |= NETWORK_HAS_CCK;
+	}
+
+	network->wpa_ie_len = 0;
+	network->rsn_ie_len = 0;
+
+	if (ieee80211_parse_info_param
+	    (beacon->info_element, stats->len - sizeof(*beacon), network))
+		return 1;
+
+	/* derive the PHY mode from the band and the rate flags */
+	if (stats->freq == IEEE80211_52GHZ_BAND) {
+		network->mode = IEEE_A;
+	} else {
+		network->mode = 0;
+		if (network->flags & NETWORK_HAS_OFDM)
+			network->mode |= IEEE_G;
+		if (network->flags & NETWORK_HAS_CCK)
+			network->mode |= IEEE_B;
+	}
+
+	if (network->mode == 0) {
+		IEEE80211_DEBUG_SCAN("Filtered out '%s (" MAC_FMT ")' "
+				     "network.\n",
+				     escape_essid(network->ssid,
+						  network->ssid_len),
+				     MAC_ARG(network->bssid));
+		return 1;
+	}
+
+	if (ieee80211_is_empty_essid(network->ssid, network->ssid_len))
+		network->flags |= NETWORK_EMPTY_ESSID;
+
+	memcpy(&network->stats, stats, sizeof(network->stats));
+
+	return 0;
+}
+
+/*
+ * Two entries describe the same network only when channel, BSSID and ESSID
+ * (length and bytes) all match.  Hidden networks, whose ESSID is empty,
+ * therefore collapse into one entry per BSSID and channel.
+ *
+ * Returns 1 for a match, 0 otherwise.
+ */
+static inline int is_same_network(struct ieee80211_network *src,
+				  struct ieee80211_network *dst)
+{
+	if (src->ssid_len != dst->ssid_len)
+		return 0;
+	if (src->channel != dst->channel)
+		return 0;
+	if (memcmp(src->bssid, dst->bssid, ETH_ALEN) != 0)
+		return 0;
+
+	return memcmp(src->ssid, dst->ssid, src->ssid_len) == 0;
+}
+
+/*
+ * Refresh the stored entry *dst with what was just parsed into *src.
+ *
+ * Copied: rx stats, capability, rates, mode, flags, timestamps, intervals,
+ * ATIM window, ERP value, WPA/RSN IEs.  last_scanned is set to now.
+ * QoS data is copied wholesale when the new flags carry QoS bits, otherwise
+ * only 'supported' and 'param_count' are taken over.  In both cases 'active'
+ * ends up as src's value and 'old_param_count' keeps dst's previous value.
+ * bssid, ssid, channel and last_associate are left alone.
+ */
+static inline void update_network(struct ieee80211_network *dst,
+				  struct ieee80211_network *src)
+{
+	int src_active;
+	u8 kept_old_count;
+
+	memcpy(&dst->stats, &src->stats, sizeof(struct ieee80211_rx_stats));
+	dst->capability = src->capability;
+
+	dst->rates_len = src->rates_len;
+	memcpy(dst->rates, src->rates, src->rates_len);
+	dst->rates_ex_len = src->rates_ex_len;
+	memcpy(dst->rates_ex, src->rates_ex, src->rates_ex_len);
+
+	dst->mode = src->mode;
+	dst->flags = src->flags;
+	dst->time_stamp[0] = src->time_stamp[0];
+	dst->time_stamp[1] = src->time_stamp[1];
+
+	dst->beacon_interval = src->beacon_interval;
+	dst->listen_interval = src->listen_interval;
+	dst->atim_window = src->atim_window;
+	dst->erp_value = src->erp_value;
+
+	dst->wpa_ie_len = src->wpa_ie_len;
+	memcpy(dst->wpa_ie, src->wpa_ie, src->wpa_ie_len);
+	dst->rsn_ie_len = src->rsn_ie_len;
+	memcpy(dst->rsn_ie, src->rsn_ie, src->rsn_ie_len);
+
+	dst->last_scanned = jiffies;
+
+	/* remember the two QoS fields that are re-applied after the copy */
+	src_active = src->qos_data.active;
+	kept_old_count = dst->qos_data.old_param_count;
+
+	/* dst->flags already holds the new flags at this point */
+	if (dst->flags & NETWORK_HAS_QOS_MASK) {
+		memcpy(&dst->qos_data, &src->qos_data,
+		       sizeof(struct ieee80211_qos_data));
+	} else {
+		dst->qos_data.supported = src->qos_data.supported;
+		dst->qos_data.param_count = src->qos_data.param_count;
+	}
+
+	if (dst->qos_data.supported == 1) {
+		if (dst->ssid_len)
+			IEEE80211_DEBUG_QOS
+			    ("QoS the network %s is QoS supported\n",
+			     dst->ssid);
+		else
+			IEEE80211_DEBUG_QOS
+			    ("QoS the network is QoS supported\n");
+	}
+
+	dst->qos_data.active = src_active;
+	dst->qos_data.old_param_count = kept_old_count;
+
+	/* dst->last_associate is not overwritten */
+}
+
+/*
+ * Return non-zero when the frame-control value fc describes a beacon.
+ *
+ * fc must already be in CPU byte order: every caller visible in this file
+ * passes le16_to_cpu(header.frame_ctl).  The conversion is therefore not
+ * repeated here; converting twice swaps the bytes back on big-endian hosts,
+ * so beacons would be reported and dispatched as probe responses.
+ */
+static inline int is_beacon(int fc)
+{
+	return (WLAN_FC_GET_STYPE(fc) == IEEE80211_STYPE_BEACON);
+}
+
+/*
+ * Handle a received beacon or probe response.
+ *
+ * The frame is parsed into a local ieee80211_network.  Under ieee->lock the
+ * known-network list is searched for an entry with the same channel, BSSID
+ * and ESSID: a match is refreshed with update_network(); otherwise a slot is
+ * taken from the free list (or, if that is empty, the least recently scanned
+ * entry is recycled) and filled with the parsed data.  After the lock is
+ * dropped the driver's handle_beacon or handle_probe_response callback is
+ * invoked, if set.
+ *
+ * Frames that ieee80211_network_init() rejects are dropped silently apart
+ * from a debug message.
+ */
+static inline void ieee80211_process_probe_response(struct ieee80211_device
+ *ieee, struct
+ ieee80211_probe_response
+ *beacon, struct ieee80211_rx_stats
+ *stats)
+{
+ struct net_device *dev = ieee->dev;
+ struct ieee80211_network network;
+ struct ieee80211_network *target;
+ struct ieee80211_network *oldest = NULL;
+ /* NOTE(review): info_element exists only in debug builds but is named in
+ * the IEEE80211_DEBUG_SCAN calls below; presumably that macro discards its
+ * arguments when debugging is off -- confirm. */
+#ifdef CONFIG_IEEE80211_DEBUG
+ struct ieee80211_info_element *info_element = beacon->info_element;
+#endif
+ unsigned long flags;
+
+ /* Debug dump: ESSID, BSSID and the 16 capability bits, MSB first.
+ * NOTE(review): beacon->capability is tested here without le16_to_cpu,
+ * unlike in ieee80211_network_init() -- confirm this is intended. */
+ IEEE80211_DEBUG_SCAN("'%s' (" MAC_FMT
+ "): %c%c%c%c %c%c%c%c-%c%c%c%c %c%c%c%c\n",
+ escape_essid(info_element->data,
+ info_element->len),
+ MAC_ARG(beacon->header.addr3),
+ (beacon->capability & (1 << 0xf)) ? '1' : '0',
+ (beacon->capability & (1 << 0xe)) ? '1' : '0',
+ (beacon->capability & (1 << 0xd)) ? '1' : '0',
+ (beacon->capability & (1 << 0xc)) ? '1' : '0',
+ (beacon->capability & (1 << 0xb)) ? '1' : '0',
+ (beacon->capability & (1 << 0xa)) ? '1' : '0',
+ (beacon->capability & (1 << 0x9)) ? '1' : '0',
+ (beacon->capability & (1 << 0x8)) ? '1' : '0',
+ (beacon->capability & (1 << 0x7)) ? '1' : '0',
+ (beacon->capability & (1 << 0x6)) ? '1' : '0',
+ (beacon->capability & (1 << 0x5)) ? '1' : '0',
+ (beacon->capability & (1 << 0x4)) ? '1' : '0',
+ (beacon->capability & (1 << 0x3)) ? '1' : '0',
+ (beacon->capability & (1 << 0x2)) ? '1' : '0',
+ (beacon->capability & (1 << 0x1)) ? '1' : '0',
+ (beacon->capability & (1 << 0x0)) ? '1' : '0');
+
+ /* A non-zero return means the frame did not parse or was filtered out */
+ if (ieee80211_network_init(ieee, beacon, &network, stats)) {
+ IEEE80211_DEBUG_SCAN("Dropped '%s' (" MAC_FMT ") via %s.\n",
+ escape_essid(info_element->data,
+ info_element->len),
+ MAC_ARG(beacon->header.addr3),
+ is_beacon(le16_to_cpu
+ (beacon->header.
+ frame_ctl)) ?
+ "BEACON" : "PROBE RESPONSE");
+ return;
+ }
+
+ /* The network parsed correctly -- so now we scan our known networks
+ * to see if we can find it in our list.
+ *
+ * NOTE: This search is definitely not optimized. Once its doing
+ * the "right thing" we'll optimize it for efficiency if
+ * necessary */
+
+ /* Search for this entry in the list and update it if it is
+ * already there. */
+
+ spin_lock_irqsave(&ieee->lock, flags);
+
+ /* While searching, also track the entry with the smallest last_scanned
+ * among those visited, as the candidate to recycle. */
+ list_for_each_entry(target, &ieee->network_list, list) {
+ if (is_same_network(target, &network))
+ break;
+
+ if ((oldest == NULL) ||
+ (target->last_scanned < oldest->last_scanned))
+ oldest = target;
+ }
+
+ /* If we didn't find a match, then get a new network slot to initialize
+ * with this beacon's information */
+ if (&target->list == &ieee->network_list) {
+ if (list_empty(&ieee->network_free_list)) {
+ /* If there are no more slots, expire the oldest */
+ /* NOTE(review): oldest is still NULL if network_list was
+ * empty too; presumably both lists are never empty at
+ * once -- confirm. */
+ list_del(&oldest->list);
+ target = oldest;
+ IEEE80211_DEBUG_SCAN("Expired '%s' (" MAC_FMT ") from "
+ "network list.\n",
+ escape_essid(target->ssid,
+ target->ssid_len),
+ MAC_ARG(target->bssid));
+ } else {
+ /* Otherwise just pull from the free list */
+ target = list_entry(ieee->network_free_list.next,
+ struct ieee80211_network, list);
+ list_del(ieee->network_free_list.next);
+ }
+
+#ifdef CONFIG_IEEE80211_DEBUG
+ IEEE80211_DEBUG_SCAN("Adding '%s' (" MAC_FMT ") via %s.\n",
+ escape_essid(network.ssid,
+ network.ssid_len),
+ MAC_ARG(network.bssid),
+ is_beacon(le16_to_cpu
+ (beacon->header.
+ frame_ctl)) ?
+ "BEACON" : "PROBE RESPONSE");
+#endif
+ /* The whole entry, list member included, is overwritten and then
+ * linked at the tail of the known-network list. */
+ memcpy(target, &network, sizeof(*target));
+ list_add_tail(&target->list, &ieee->network_list);
+ } else {
+ IEEE80211_DEBUG_SCAN("Updating '%s' (" MAC_FMT ") via %s.\n",
+ escape_essid(target->ssid,
+ target->ssid_len),
+ MAC_ARG(target->bssid),
+ is_beacon(le16_to_cpu
+ (beacon->header.
+ frame_ctl)) ?
+ "BEACON" : "PROBE RESPONSE");
+ update_network(target, &network);
+ }
+
+ spin_unlock_irqrestore(&ieee->lock, flags);
+
+ /* The callbacks run outside the lock and receive the local parsed copy
+ * (&network), not the list entry that was just written. */
+ if (is_beacon(le16_to_cpu(beacon->header.frame_ctl))) {
+ if (ieee->handle_beacon != NULL)
+ ieee->handle_beacon(dev, beacon, &network);
+ } else {
+ if (ieee->handle_probe_response != NULL)
+ ieee->handle_probe_response(dev, beacon, &network);
+ }
+}
+
+/*
+ * Dispatch a received management frame according to its subtype.
+ *
+ * ieee:   device the frame arrived on
+ * header: start of the frame; cast to the subtype-specific layout
+ * stats:  receive statistics, forwarded to the handlers that take them
+ *
+ * Association responses, beacons and probe responses are processed in this
+ * file.  Probe requests, authentication, disassociation and
+ * deauthentication frames go to the matching driver callback when it is
+ * set.  Reassociation responses are only logged; any other subtype
+ * produces a warning.
+ */
+void ieee80211_rx_mgt(struct ieee80211_device *ieee,
+		      struct ieee80211_hdr_4addr *header,
+		      struct ieee80211_rx_stats *stats)
+{
+	/* subtype in CPU byte order, computed once for dispatch and logging */
+	const int stype = WLAN_FC_GET_STYPE(le16_to_cpu(header->frame_ctl));
+
+	switch (stype) {
+	case IEEE80211_STYPE_ASSOC_RESP:
+		IEEE80211_DEBUG_MGMT("received ASSOCIATION RESPONSE (%d)\n",
+				     stype);
+		ieee80211_handle_assoc_resp(ieee,
+					    (struct ieee80211_assoc_response *)
+					    header, stats);
+		break;
+
+	case IEEE80211_STYPE_REASSOC_RESP:
+		IEEE80211_DEBUG_MGMT("received REASSOCIATION RESPONSE (%d)\n",
+				     stype);
+		break;
+
+	case IEEE80211_STYPE_PROBE_REQ:
+		IEEE80211_DEBUG_MGMT("received PROBE REQUEST (%d)\n", stype);
+
+		if (ieee->handle_probe_request != NULL)
+			ieee->handle_probe_request(ieee->dev,
+						   (struct
+						    ieee80211_probe_request *)
+						   header, stats);
+		break;
+
+	case IEEE80211_STYPE_PROBE_RESP:
+		IEEE80211_DEBUG_MGMT("received PROBE RESPONSE (%d)\n", stype);
+		IEEE80211_DEBUG_SCAN("Probe response\n");
+		ieee80211_process_probe_response(ieee,
+						 (struct
+						  ieee80211_probe_response *)
+						 header, stats);
+		break;
+
+	case IEEE80211_STYPE_BEACON:
+		IEEE80211_DEBUG_MGMT("received BEACON (%d)\n", stype);
+		IEEE80211_DEBUG_SCAN("Beacon\n");
+		ieee80211_process_probe_response(ieee,
+						 (struct
+						  ieee80211_probe_response *)
+						 header, stats);
+		break;
+
+	case IEEE80211_STYPE_AUTH:
+		IEEE80211_DEBUG_MGMT("received auth (%d)\n", stype);
+
+		if (ieee->handle_auth != NULL)
+			ieee->handle_auth(ieee->dev,
+					  (struct ieee80211_auth *)header);
+		break;
+
+	case IEEE80211_STYPE_DISASSOC:
+		if (ieee->handle_disassoc != NULL)
+			ieee->handle_disassoc(ieee->dev,
+					      (struct ieee80211_disassoc *)
+					      header);
+		break;
+
+	case IEEE80211_STYPE_DEAUTH:
+		printk("DEAUTH from AP\n");
+		/* the deauth callback takes the frame as ieee80211_auth */
+		if (ieee->handle_deauth != NULL)
+			ieee->handle_deauth(ieee->dev, (struct ieee80211_auth *)
+					    header);
+		break;
+
+	default:
+		IEEE80211_DEBUG_MGMT("received UNKNOWN (%d)\n", stype);
+		IEEE80211_WARNING("%s: Unknown management packet: %d\n",
+				  ieee->dev->name, stype);
+		break;
+	}
+}
+
+/* Receive entry points made available to other modules. */
+EXPORT_SYMBOL(ieee80211_rx_mgt);
+EXPORT_SYMBOL(ieee80211_rx);
diff --git a/net/ieee80211/ieee80211_tx.c b/net/ieee80211/ieee80211_tx.c
new file mode 100644
index 00000000000..95ccbadbf55
--- /dev/null
+++ b/net/ieee80211/ieee80211_tx.c
@@ -0,0 +1,581 @@
+/******************************************************************************
+
+ Copyright(c) 2003 - 2005 Intel Corporation. All rights reserved.
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program; if not, write to the Free Software Foundation, Inc., 59
+ Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+ The full GNU General Public License is included in this distribution in the
+ file called LICENSE.
+
+ Contact Information:
+ James P. Ketrenos <ipw2100-admin@linux.intel.com>
+ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
+******************************************************************************/
+#include <linux/compiler.h>
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/if_arp.h>
+#include <linux/in6.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/wireless.h>
+#include <linux/etherdevice.h>
+#include <asm/uaccess.h>
+
+#include <net/ieee80211.h>
+
+/*
+
+802.11 Data Frame
+
+ ,-------------------------------------------------------------------.
+Bytes | 2 | 2 | 6 | 6 | 6 | 2 | 0..2312 | 4 |
+ |------|------|---------|---------|---------|------|---------|------|
+Desc. | ctrl | dura | DA/RA | TA | SA | Sequ | Frame | fcs |
+ | | tion | (BSSID) | | | ence | data | |
+ `--------------------------------------------------| |------'
+Total: 28 non-data bytes `----.----'
+ |
+ .- 'Frame data' expands to <---------------------------'
+ |
+ V
+ ,---------------------------------------------------.
+Bytes | 1 | 1 | 1 | 3 | 2 | 0-2304 |
+ |------|------|---------|----------|------|---------|
+Desc. | SNAP | SNAP | Control |Eth Tunnel| Type | IP |
+ | DSAP | SSAP | | | | Packet |
+ | 0xAA | 0xAA |0x03 (UI)|0x00-00-F8| | |
+ `-----------------------------------------| |
+Total: 8 non-data bytes `----.----'
+ |
+ .- 'IP Packet' expands, if WEP enabled, to <--'
+ |
+ V
+ ,-----------------------.
+Bytes | 4 | 0-2296 | 4 |
+ |-----|-----------|-----|
+Desc. | IV | Encrypted | ICV |
+ | | IP Packet | |
+ `-----------------------'
+Total: 8 non-data bytes
+
+802.3 Ethernet Data Frame
+
+ ,-----------------------------------------.
+Bytes | 6 | 6 | 2 | Variable | 4 |
+ |-------|-------|------|-----------|------|
+Desc. | Dest. | Source| Type | IP Packet | fcs |
+ | MAC | MAC | | | |
+ `-----------------------------------------'
+Total: 18 non-data bytes
+
+In the event that fragmentation is required, the incoming payload is split into
+N parts of size ieee->fts. The first fragment contains the SNAP header and the
+remaining packets are just data.
+
+If encryption is enabled, each fragment payload size is reduced by enough space
+to add the prefix and postfix (IV and ICV totalling 8 bytes in the case of WEP)
+So if you have 1500 bytes of payload with ieee->fts set to 500 without
+encryption it will take 3 frames. With WEP it will take 4 frames as the
+payload of each frame is reduced to 492 bytes.
+
+* SKB visualization
+*
+* ,- skb->data
+* |
+* | ETHERNET HEADER ,-<-- PAYLOAD
+* | | 14 bytes from skb->data
+* | 2 bytes for Type --> ,T. | (sizeof ethhdr)
+* | | | |
+* |,-Dest.--. ,--Src.---. | | |
+* | 6 bytes| | 6 bytes | | | |
+* v | | | | | |
+* 0 | v 1 | v | v 2
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
+* ^ | ^ | ^ |
+* | | | | | |
+* | | | | `T' <---- 2 bytes for Type
+* | | | |
+* | | '---SNAP--' <-------- 6 bytes for SNAP
+* | |
+* `-IV--' <-------------------- 4 bytes for IV (WEP)
+*
+* SNAP HEADER
+*
+*/
+
+/* OUI values written into the SNAP header by ieee80211_copy_snap():
+ * P802_1H_OUI for protocol 0x8137 or 0x80f3, RFC1042_OUI for all others. */
+static u8 P802_1H_OUI[P80211_OUI_LEN] = { 0x00, 0x00, 0xf8 };
+static u8 RFC1042_OUI[P80211_OUI_LEN] = { 0x00, 0x00, 0x00 };
+
+/*
+ * Write a SNAP header followed by the 16-bit protocol type at 'data'.
+ *
+ * data:    destination; must have room for SNAP_SIZE + sizeof(u16) bytes
+ * h_proto: protocol type in host byte order; stored in network byte order
+ *
+ * Protocols 0x8137 and 0x80f3 get the 802.1H OUI, all others the RFC 1042
+ * OUI.  Returns the number of bytes written, SNAP_SIZE + sizeof(u16).
+ */
+static inline int ieee80211_copy_snap(u8 * data, u16 h_proto)
+{
+	struct ieee80211_snap_hdr *hdr = (struct ieee80211_snap_hdr *)data;
+	u8 *vendor;
+	int n;
+
+	switch (h_proto) {
+	case 0x8137:
+	case 0x80f3:
+		vendor = P802_1H_OUI;
+		break;
+	default:
+		vendor = RFC1042_OUI;
+		break;
+	}
+
+	hdr->dsap = 0xaa;
+	hdr->ssap = 0xaa;
+	hdr->ctrl = 0x03;
+	for (n = 0; n < 3; n++)
+		hdr->oui[n] = vendor[n];
+
+	/* the type field sits directly behind the SNAP header */
+	*(u16 *) (data + SNAP_SIZE) = htons(h_proto);
+
+	return SNAP_SIZE + sizeof(u16);
+}
+
+/*
+ * Encrypt one fragment with the current transmit key.
+ *
+ * ieee:    device whose crypt[tx_keyidx] entry is used
+ * frag:    fragment to encrypt
+ * hdr_len: header length, passed through to the encrypt_mpdu operation
+ *
+ * The resulting frame layout is IV (4 bytes), clear payload (including
+ * SNAP), ICV (4 bytes).  The crypt entry's refcnt is held across the call.
+ * When the entry has no ops or no encrypt_mpdu operation the fragment is
+ * left as is and 0 is returned.
+ *
+ * Returns 0 on success, -1 when no crypt entry is set or when encryption
+ * fails (tx_discards is incremented in the latter case).
+ */
+static inline int ieee80211_encrypt_fragment(struct ieee80211_device *ieee,
+					     struct sk_buff *frag, int hdr_len)
+{
+	struct ieee80211_crypt_data *key = ieee->crypt[ieee->tx_keyidx];
+	int rc = 0;
+
+	if (!key)
+		return -1;
+
+	atomic_inc(&key->refcnt);
+	if (key->ops != NULL && key->ops->encrypt_mpdu != NULL)
+		rc = key->ops->encrypt_mpdu(frag, hdr_len, key->priv);
+	atomic_dec(&key->refcnt);
+
+	if (rc >= 0)
+		return 0;
+
+	printk(KERN_INFO "%s: Encryption failed: len=%d.\n",
+	       ieee->dev->name, frag->len);
+	ieee->ieee_stats.tx_discards++;
+	return -1;
+}
+
+/*
+ * Release a TXB: free every non-NULL fragment skb, then the TXB itself.
+ * A NULL txb is ignored.
+ */
+void ieee80211_txb_free(struct ieee80211_txb *txb)
+{
+	int idx;
+
+	if (unlikely(txb == NULL))
+		return;
+
+	idx = 0;
+	while (idx < txb->nr_frags) {
+		struct sk_buff *frag = txb->fragments[idx++];
+
+		if (frag != NULL)
+			dev_kfree_skb_any(frag);
+	}
+
+	kfree(txb);
+}
+
+/*
+ * Allocate a TXB together with nr_frags fragment skbs.
+ *
+ * nr_frags: number of fragments to allocate
+ * txb_size: usable size of each fragment; also stored as frag_size
+ * headroom: extra bytes allocated and reserved in front of each fragment
+ * gfp_mask: allocation flags for the TXB and for every skb
+ *
+ * Returns the new TXB, or NULL when any allocation fails; fragments that
+ * were already allocated are freed again in that case.
+ */
+static struct ieee80211_txb *ieee80211_alloc_txb(int nr_frags, int txb_size,
+						 int headroom, gfp_t gfp_mask)
+{
+	struct ieee80211_txb *txb;
+	int n;
+
+	txb = kmalloc(sizeof(*txb) + nr_frags * sizeof(u8 *), gfp_mask);
+	if (txb == NULL)
+		return NULL;
+
+	/* only the fixed part is cleared; fragments[] is filled below */
+	memset(txb, 0, sizeof(*txb));
+	txb->nr_frags = nr_frags;
+	txb->frag_size = txb_size;
+
+	for (n = 0; n < nr_frags; n++) {
+		txb->fragments[n] = __dev_alloc_skb(txb_size + headroom,
+						    gfp_mask);
+		if (unlikely(txb->fragments[n] == NULL))
+			goto undo;
+		skb_reserve(txb->fragments[n], headroom);
+	}
+
+	return txb;
+
+undo:
+	/* fragment n failed: release n-1 down to 0, then the TXB */
+	while (--n >= 0)
+		dev_kfree_skb_any(txb->fragments[n]);
+	kfree(txb);
+	return NULL;
+}
+
+/* Incoming skb is converted to a txb which consists of
+ * a block of 802.11 fragment packets (stored as skbs) */
+int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ieee80211_device *ieee = netdev_priv(dev);
+ struct ieee80211_txb *txb = NULL;
+ struct ieee80211_hdr_3addr *frag_hdr;
+ int i, bytes_per_frag, nr_frags, bytes_last_frag, frag_size,
+ rts_required;
+ unsigned long flags;
+ struct net_device_stats *stats = &ieee->stats;
+ int ether_type, encrypt, host_encrypt, host_encrypt_msdu, host_build_iv;
+ int bytes, fc, hdr_len;
+ struct sk_buff *skb_frag;
+ struct ieee80211_hdr_3addr header = { /* Ensure zero initialized */
+ .duration_id = 0,
+ .seq_ctl = 0
+ };
+ u8 dest[ETH_ALEN], src[ETH_ALEN];
+ struct ieee80211_crypt_data *crypt;
+ int priority = skb->priority;
+ int snapped = 0;
+
+ if (ieee->is_queue_full && (*ieee->is_queue_full) (dev, priority))
+ return NETDEV_TX_BUSY;
+
+ spin_lock_irqsave(&ieee->lock, flags);
+
+ /* If there is no driver handler to take the TXB, dont' bother
+ * creating it... */
+ if (!ieee->hard_start_xmit) {
+ printk(KERN_WARNING "%s: No xmit handler.\n", ieee->dev->name);
+ goto success;
+ }
+
+ if (unlikely(skb->len < SNAP_SIZE + sizeof(u16))) {
+ printk(KERN_WARNING "%s: skb too small (%d).\n",
+ ieee->dev->name, skb->len);
+ goto success;
+ }
+
+ ether_type = ntohs(((struct ethhdr *)skb->data)->h_proto);
+
+ crypt = ieee->crypt[ieee->tx_keyidx];
+
+ encrypt = !(ether_type == ETH_P_PAE && ieee->ieee802_1x) &&
+ ieee->sec.encrypt;
+
+ host_encrypt = ieee->host_encrypt && encrypt && crypt;
+ host_encrypt_msdu = ieee->host_encrypt_msdu && encrypt && crypt;
+ host_build_iv = ieee->host_build_iv && encrypt && crypt;
+
+ if (!encrypt && ieee->ieee802_1x &&
+ ieee->drop_unencrypted && ether_type != ETH_P_PAE) {
+ stats->tx_dropped++;
+ goto success;
+ }
+
+ /* Save source and destination addresses */
+ memcpy(dest, skb->data, ETH_ALEN);
+ memcpy(src, skb->data + ETH_ALEN, ETH_ALEN);
+
+ /* Advance the SKB to the start of the payload */
+ skb_pull(skb, sizeof(struct ethhdr));
+
+ /* Determine total amount of storage required for TXB packets */
+ bytes = skb->len + SNAP_SIZE + sizeof(u16);
+
+ if (host_encrypt)
+ fc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA |
+ IEEE80211_FCTL_PROTECTED;
+ else
+ fc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA;
+
+ if (ieee->iw_mode == IW_MODE_INFRA) {
+ fc |= IEEE80211_FCTL_TODS;
+ /* To DS: Addr1 = BSSID, Addr2 = SA, Addr3 = DA */
+ memcpy(header.addr1, ieee->bssid, ETH_ALEN);
+ memcpy(header.addr2, src, ETH_ALEN);
+ memcpy(header.addr3, dest, ETH_ALEN);
+ } else if (ieee->iw_mode == IW_MODE_ADHOC) {
+ /* not From/To DS: Addr1 = DA, Addr2 = SA, Addr3 = BSSID */
+ memcpy(header.addr1, dest, ETH_ALEN);
+ memcpy(header.addr2, src, ETH_ALEN);
+ memcpy(header.addr3, ieee->bssid, ETH_ALEN);
+ }
+ header.frame_ctl = cpu_to_le16(fc);
+ hdr_len = IEEE80211_3ADDR_LEN;
+
+ /* Encrypt msdu first on the whole data packet. */
+ if ((host_encrypt || host_encrypt_msdu) &&
+ crypt && crypt->ops && crypt->ops->encrypt_msdu) {
+ int res = 0;
+ int len = bytes + hdr_len + crypt->ops->extra_msdu_prefix_len +
+ crypt->ops->extra_msdu_postfix_len;
+ struct sk_buff *skb_new = dev_alloc_skb(len);
+
+ if (unlikely(!skb_new))
+ goto failed;
+
+ skb_reserve(skb_new, crypt->ops->extra_msdu_prefix_len);
+ memcpy(skb_put(skb_new, hdr_len), &header, hdr_len);
+ snapped = 1;
+ ieee80211_copy_snap(skb_put(skb_new, SNAP_SIZE + sizeof(u16)),
+ ether_type);
+ memcpy(skb_put(skb_new, skb->len), skb->data, skb->len);
+ res = crypt->ops->encrypt_msdu(skb_new, hdr_len, crypt->priv);
+ if (res < 0) {
+ IEEE80211_ERROR("msdu encryption failed\n");
+ dev_kfree_skb_any(skb_new);
+ goto failed;
+ }
+ dev_kfree_skb_any(skb);
+ skb = skb_new;
+ bytes += crypt->ops->extra_msdu_prefix_len +
+ crypt->ops->extra_msdu_postfix_len;
+ skb_pull(skb, hdr_len);
+ }
+
+ if (host_encrypt || ieee->host_open_frag) {
+ /* Determine fragmentation size based on destination (multicast
+ * and broadcast are not fragmented) */
+ if (is_multicast_ether_addr(dest) ||
+ is_broadcast_ether_addr(dest))
+ frag_size = MAX_FRAG_THRESHOLD;
+ else
+ frag_size = ieee->fts;
+
+ /* Determine amount of payload per fragment. Regardless of if
+ * this stack is providing the full 802.11 header, one will
+ * eventually be affixed to this fragment -- so we must account
+ * for it when determining the amount of payload space. */
+ bytes_per_frag = frag_size - IEEE80211_3ADDR_LEN;
+ if (ieee->config &
+ (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS))
+ bytes_per_frag -= IEEE80211_FCS_LEN;
+
+ /* Each fragment may need to have room for encryptiong
+ * pre/postfix */
+ if (host_encrypt)
+ bytes_per_frag -= crypt->ops->extra_mpdu_prefix_len +
+ crypt->ops->extra_mpdu_postfix_len;
+
+ /* Number of fragments is the total
+ * bytes_per_frag / payload_per_fragment */
+ nr_frags = bytes / bytes_per_frag;
+ bytes_last_frag = bytes % bytes_per_frag;
+ if (bytes_last_frag)
+ nr_frags++;
+ else
+ bytes_last_frag = bytes_per_frag;
+ } else {
+ nr_frags = 1;
+ bytes_per_frag = bytes_last_frag = bytes;
+ frag_size = bytes + IEEE80211_3ADDR_LEN;
+ }
+
+ rts_required = (frag_size > ieee->rts
+ && ieee->config & CFG_IEEE80211_RTS);
+ if (rts_required)
+ nr_frags++;
+
+ /* When we allocate the TXB we allocate enough space for the reserve
+ * and full fragment bytes (bytes_per_frag doesn't include prefix,
+ * postfix, header, FCS, etc.) */
+ txb = ieee80211_alloc_txb(nr_frags, frag_size,
+ ieee->tx_headroom, GFP_ATOMIC);
+ if (unlikely(!txb)) {
+ printk(KERN_WARNING "%s: Could not allocate TXB\n",
+ ieee->dev->name);
+ goto failed;
+ }
+ txb->encrypted = encrypt;
+ if (host_encrypt)
+ txb->payload_size = frag_size * (nr_frags - 1) +
+ bytes_last_frag;
+ else
+ txb->payload_size = bytes;
+
+ if (rts_required) {
+ skb_frag = txb->fragments[0];
+ frag_hdr =
+ (struct ieee80211_hdr_3addr *)skb_put(skb_frag, hdr_len);
+
+ /*
+ * Set header frame_ctl to the RTS.
+ */
+ header.frame_ctl =
+ cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_RTS);
+ memcpy(frag_hdr, &header, hdr_len);
+
+ /*
+ * Restore header frame_ctl to the original data setting.
+ */
+ header.frame_ctl = cpu_to_le16(fc);
+
+ if (ieee->config &
+ (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS))
+ skb_put(skb_frag, 4);
+
+ txb->rts_included = 1;
+ i = 1;
+ } else
+ i = 0;
+
+ for (; i < nr_frags; i++) {
+ skb_frag = txb->fragments[i];
+
+ if (host_encrypt || host_build_iv)
+ skb_reserve(skb_frag,
+ crypt->ops->extra_mpdu_prefix_len);
+
+ frag_hdr =
+ (struct ieee80211_hdr_3addr *)skb_put(skb_frag, hdr_len);
+ memcpy(frag_hdr, &header, hdr_len);
+
+ /* If this is not the last fragment, then add the MOREFRAGS
+ * bit to the frame control */
+ if (i != nr_frags - 1) {
+ frag_hdr->frame_ctl =
+ cpu_to_le16(fc | IEEE80211_FCTL_MOREFRAGS);
+ bytes = bytes_per_frag;
+ } else {
+ /* The last fragment takes the remaining length */
+ bytes = bytes_last_frag;
+ }
+
+ if (i == 0 && !snapped) {
+ ieee80211_copy_snap(skb_put
+ (skb_frag, SNAP_SIZE + sizeof(u16)),
+ ether_type);
+ bytes -= SNAP_SIZE + sizeof(u16);
+ }
+
+ memcpy(skb_put(skb_frag, bytes), skb->data, bytes);
+
+ /* Advance the SKB... */
+ skb_pull(skb, bytes);
+
+ /* Encryption routine will move the header forward in order
+ * to insert the IV between the header and the payload */
+ if (host_encrypt)
+ ieee80211_encrypt_fragment(ieee, skb_frag, hdr_len);
+ else if (host_build_iv) {
+ struct ieee80211_crypt_data *crypt;
+
+ crypt = ieee->crypt[ieee->tx_keyidx];
+ atomic_inc(&crypt->refcnt);
+ if (crypt->ops->build_iv)
+ crypt->ops->build_iv(skb_frag, hdr_len,
+ crypt->priv);
+ atomic_dec(&crypt->refcnt);
+ }
+
+ if (ieee->config &
+ (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS))
+ skb_put(skb_frag, 4);
+ }
+
+ success:
+ spin_unlock_irqrestore(&ieee->lock, flags);
+
+ dev_kfree_skb_any(skb);
+
+ if (txb) {
+ int ret = (*ieee->hard_start_xmit) (txb, dev, priority);
+ if (ret == 0) {
+ stats->tx_packets++;
+ stats->tx_bytes += txb->payload_size;
+ return 0;
+ }
+
+ if (ret == NETDEV_TX_BUSY) {
+ printk(KERN_ERR "%s: NETDEV_TX_BUSY returned; "
+ "driver should report queue full via "
+ "ieee_device->is_queue_full.\n",
+ ieee->dev->name);
+ }
+
+ ieee80211_txb_free(txb);
+ }
+
+ return 0;
+
+ failed:
+ spin_unlock_irqrestore(&ieee->lock, flags);
+ netif_stop_queue(dev);
+ stats->tx_errors++;
+ return 1;
+}
+
+/*
+ * ieee80211_tx_frame - hand an already built 802.11 frame to the driver.
+ *
+ * The len bytes at frame are copied into a single-fragment TXB, which is
+ * then passed to ieee->hard_start_xmit() with a priority of -1.
+ *
+ * Returns 1 (after counting a tx error) only when the TXB cannot be
+ * allocated.  Every other outcome returns 0: no xmit handler installed,
+ * a frame shorter than 24 bytes, a TXB accepted by the driver, and a TXB
+ * refused by the driver (which is freed here).
+ */
+int ieee80211_tx_frame(struct ieee80211_device *ieee,
+		       struct ieee80211_hdr *frame, int len)
+{
+	struct ieee80211_txb *txb;
+	struct sk_buff *frag;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&ieee->lock, irq_flags);
+
+	/* Without a driver handler nobody could take the TXB, so do not
+	 * bother building one. */
+	if (!ieee->hard_start_xmit) {
+		printk(KERN_WARNING "%s: No xmit handler.\n", ieee->dev->name);
+		spin_unlock_irqrestore(&ieee->lock, irq_flags);
+		return 0;
+	}
+
+	if (unlikely(len < 24)) {
+		printk(KERN_WARNING "%s: skb too small (%d).\n",
+		       ieee->dev->name, len);
+		spin_unlock_irqrestore(&ieee->lock, irq_flags);
+		return 0;
+	}
+
+	/* One fragment of len bytes, plus the driver's requested headroom */
+	txb = ieee80211_alloc_txb(1, len, ieee->tx_headroom, GFP_ATOMIC);
+	if (unlikely(!txb)) {
+		printk(KERN_WARNING "%s: Could not allocate TXB\n",
+		       ieee->dev->name);
+		spin_unlock_irqrestore(&ieee->lock, irq_flags);
+		ieee->stats.tx_errors++;
+		return 1;
+	}
+
+	txb->encrypted = 0;
+	txb->payload_size = len;
+
+	frag = txb->fragments[0];
+	memcpy(skb_put(frag, len), frame, len);
+
+	/* Extend the fragment by 4 bytes when an FCS is computed/reserved */
+	if (ieee->config &
+	    (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS))
+		skb_put(frag, 4);
+
+	spin_unlock_irqrestore(&ieee->lock, irq_flags);
+
+	/* The driver is called with the lock dropped */
+	if ((*ieee->hard_start_xmit) (txb, ieee->dev, -1) != 0) {
+		ieee80211_txb_free(txb);
+		return 0;
+	}
+
+	ieee->stats.tx_packets++;
+	ieee->stats.tx_bytes += txb->payload_size;
+	return 0;
+}
+
+EXPORT_SYMBOL(ieee80211_tx_frame);
+EXPORT_SYMBOL(ieee80211_txb_free);
diff --git a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c
new file mode 100644
index 00000000000..1ce7af9bec3
--- /dev/null
+++ b/net/ieee80211/ieee80211_wx.c
@@ -0,0 +1,730 @@
+/******************************************************************************
+
+ Copyright(c) 2004-2005 Intel Corporation. All rights reserved.
+
+ Portions of this file are based on the WEP enablement code provided by the
+ Host AP project hostap-drivers v0.1.3
+ Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
+ <jkmaline@cc.hut.fi>
+ Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program; if not, write to the Free Software Foundation, Inc., 59
+ Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+ The full GNU General Public License is included in this distribution in the
+ file called LICENSE.
+
+ Contact Information:
+ James P. Ketrenos <ipw2100-admin@linux.intel.com>
+ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
+******************************************************************************/
+
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/jiffies.h>
+
+#include <net/ieee80211.h>
+#include <linux/wireless.h>
+
+/* Suffix appended to "IEEE 802.11" for the SIOCGIWNAME scan entry; indexed
+ * by network->mode.  NOTE(review): the layout suggests mode is a bitmask
+ * (1 = a, 2 = b, 4 = g) -- confirm against the mode flag definitions. */
+static const char *ieee80211_modes[] = {
+ "?", "a", "b", "ab", "g", "ag", "bg", "abg"
+};
+
+#define MAX_CUSTOM_LEN 64
+/*
+ * ipw2100_translate_scan - emit the wireless-extension events for one network.
+ *
+ * Appends, in order: the BSSID, the ESSID, the protocol name, the mode (only
+ * when the ESS or IBSS capability bit is set), the channel, the encryption
+ * flag, the highest bit rate, a custom " Rates (Mb/s): ..." string, the link
+ * quality, optional "wpa_ie=" / "rsn_ie=" hex dumps, and a custom
+ * " Last beacon: ..." string.
+ *
+ * ieee:    device; supplies perfect_rssi/worst_rssi for the quality scale
+ * start:   current write position in the event stream
+ * stop:    end of the event stream, handed to the iwe_stream_add_*() helpers
+ * network: scan entry to describe
+ *
+ * Returns the write position after the last event that was added.
+ */
+static inline char *ipw2100_translate_scan(struct ieee80211_device *ieee,
+					   char *start, char *stop,
+					   struct ieee80211_network *network)
+{
+	char custom[MAX_CUSTOM_LEN];
+	char *p;
+	struct iw_event iwe;
+	int i, j;
+	int qual;
+	u8 max_rate, rate;
+
+	/* First entry *MUST* be the AP MAC address */
+	iwe.cmd = SIOCGIWAP;
+	iwe.u.ap_addr.sa_family = ARPHRD_ETHER;
+	memcpy(iwe.u.ap_addr.sa_data, network->bssid, ETH_ALEN);
+	start = iwe_stream_add_event(start, stop, &iwe, IW_EV_ADDR_LEN);
+
+	/* Remaining entries will be displayed in the order we provide them */
+
+	/* Add the ESSID */
+	iwe.cmd = SIOCGIWESSID;
+	iwe.u.data.flags = 1;
+	if (network->flags & NETWORK_EMPTY_ESSID) {
+		iwe.u.data.length = sizeof("<hidden>");
+		start = iwe_stream_add_point(start, stop, &iwe, "<hidden>");
+	} else {
+		iwe.u.data.length = min(network->ssid_len, (u8) 32);
+		start = iwe_stream_add_point(start, stop, &iwe, network->ssid);
+	}
+
+	/* Add the protocol name */
+	iwe.cmd = SIOCGIWNAME;
+	snprintf(iwe.u.name, IFNAMSIZ, "IEEE 802.11%s",
+		 ieee80211_modes[network->mode]);
+	start = iwe_stream_add_event(start, stop, &iwe, IW_EV_CHAR_LEN);
+
+	/* Add mode */
+	iwe.cmd = SIOCGIWMODE;
+	if (network->capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS)) {
+		if (network->capability & WLAN_CAPABILITY_ESS)
+			iwe.u.mode = IW_MODE_MASTER;
+		else
+			iwe.u.mode = IW_MODE_ADHOC;
+
+		start = iwe_stream_add_event(start, stop, &iwe, IW_EV_UINT_LEN);
+	}
+
+	/* Add frequency/channel (the channel number is reported as-is) */
+	iwe.cmd = SIOCGIWFREQ;
+	iwe.u.freq.m = network->channel;
+	iwe.u.freq.e = 0;
+	iwe.u.freq.i = 0;
+	start = iwe_stream_add_event(start, stop, &iwe, IW_EV_FREQ_LEN);
+
+	/* Add encryption capability */
+	iwe.cmd = SIOCGIWENCODE;
+	if (network->capability & WLAN_CAPABILITY_PRIVACY)
+		iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY;
+	else
+		iwe.u.data.flags = IW_ENCODE_DISABLED;
+	iwe.u.data.length = 0;
+	start = iwe_stream_add_point(start, stop, &iwe, network->ssid);
+
+	/*
+	 * Add basic and extended rates, merged in ascending order.
+	 *
+	 * scnprintf() returns the number of characters actually stored, so
+	 * p never advances past the end of custom[] even when the rate list
+	 * does not fit; the string is simply truncated.  With snprintf() the
+	 * would-be length was added instead, which let p (and therefore the
+	 * event length below) run beyond the buffer.
+	 */
+	max_rate = 0;
+	p = custom;
+	p += scnprintf(p, MAX_CUSTOM_LEN - (p - custom), " Rates (Mb/s): ");
+	for (i = 0, j = 0; i < network->rates_len;) {
+		if (j < network->rates_ex_len &&
+		    ((network->rates_ex[j] & 0x7F) <
+		     (network->rates[i] & 0x7F)))
+			rate = network->rates_ex[j++] & 0x7F;
+		else
+			rate = network->rates[i++] & 0x7F;
+		if (rate > max_rate)
+			max_rate = rate;
+		/* rate is in units of 500 kb/s */
+		p += scnprintf(p, MAX_CUSTOM_LEN - (p - custom),
+			       "%d%s ", rate >> 1, (rate & 1) ? ".5" : "");
+	}
+	for (; j < network->rates_ex_len; j++) {
+		rate = network->rates_ex[j] & 0x7F;
+		p += scnprintf(p, MAX_CUSTOM_LEN - (p - custom),
+			       "%d%s ", rate >> 1, (rate & 1) ? ".5" : "");
+		if (rate > max_rate)
+			max_rate = rate;
+	}
+
+	iwe.cmd = SIOCGIWRATE;
+	iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0;
+	iwe.u.bitrate.value = max_rate * 500000;
+	start = iwe_stream_add_event(start, stop, &iwe, IW_EV_PARAM_LEN);
+
+	iwe.cmd = IWEVCUSTOM;
+	iwe.u.data.length = p - custom;
+	if (iwe.u.data.length)
+		start = iwe_stream_add_point(start, stop, &iwe, custom);
+
+	/* Add quality statistics */
+	iwe.cmd = IWEVQUAL;
+	iwe.u.qual.updated = IW_QUAL_QUAL_UPDATED | IW_QUAL_LEVEL_UPDATED |
+	    IW_QUAL_NOISE_UPDATED;
+
+	if (!(network->stats.mask & IEEE80211_STATMASK_RSSI)) {
+		iwe.u.qual.updated |= IW_QUAL_QUAL_INVALID |
+		    IW_QUAL_LEVEL_INVALID;
+		iwe.u.qual.qual = 0;
+		iwe.u.qual.level = 0;
+	} else {
+		iwe.u.qual.level = network->stats.rssi;
+		if (ieee->perfect_rssi == ieee->worst_rssi)
+			qual = 100;
+		else
+			qual =
+			    (100 *
+			     (ieee->perfect_rssi - ieee->worst_rssi) *
+			     (ieee->perfect_rssi - ieee->worst_rssi) -
+			     (ieee->perfect_rssi - network->stats.rssi) *
+			     (15 * (ieee->perfect_rssi - ieee->worst_rssi) +
+			      62 * (ieee->perfect_rssi - network->stats.rssi))) /
+			    ((ieee->perfect_rssi - ieee->worst_rssi) *
+			     (ieee->perfect_rssi - ieee->worst_rssi));
+		/* Clamp in a signed int before storing: clamping after the
+		 * store into the narrow qual field would see a wrapped
+		 * value and turn an out-of-range low result into 100. */
+		if (qual > 100)
+			qual = 100;
+		else if (qual < 1)
+			qual = 0;
+		iwe.u.qual.qual = qual;
+	}
+
+	if (!(network->stats.mask & IEEE80211_STATMASK_NOISE)) {
+		iwe.u.qual.updated |= IW_QUAL_NOISE_INVALID;
+		iwe.u.qual.noise = 0;
+	} else {
+		iwe.u.qual.noise = network->stats.noise;
+	}
+
+	start = iwe_stream_add_event(start, stop, &iwe, IW_EV_QUAL_LEN);
+
+	/* Hex dump of the WPA information element, if one was seen */
+	if (network->wpa_ie_len) {
+		char buf[MAX_WPA_IE_LEN * 2 + 30];
+		char *pos = buf;
+
+		pos += sprintf(pos, "wpa_ie=");
+		for (i = 0; i < network->wpa_ie_len; i++)
+			pos += sprintf(pos, "%02x", network->wpa_ie[i]);
+
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = IWEVCUSTOM;
+		iwe.u.data.length = strlen(buf);
+		start = iwe_stream_add_point(start, stop, &iwe, buf);
+	}
+
+	/* Hex dump of the RSN information element, if one was seen */
+	if (network->rsn_ie_len) {
+		char buf[MAX_WPA_IE_LEN * 2 + 30];
+		char *pos = buf;
+
+		pos += sprintf(pos, "rsn_ie=");
+		for (i = 0; i < network->rsn_ie_len; i++)
+			pos += sprintf(pos, "%02x", network->rsn_ie[i]);
+
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = IWEVCUSTOM;
+		iwe.u.data.length = strlen(buf);
+		start = iwe_stream_add_point(start, stop, &iwe, buf);
+	}
+
+	/* Add EXTRA: Age to display the time since the last beacon/probe
+	 * response for the given network. */
+	iwe.cmd = IWEVCUSTOM;
+	p = custom;
+	p += scnprintf(p, MAX_CUSTOM_LEN - (p - custom),
+		       " Last beacon: %dms ago",
+		       jiffies_to_msecs(jiffies - network->last_scanned));
+	iwe.u.data.length = p - custom;
+	if (iwe.u.data.length)
+		start = iwe_stream_add_point(start, stop, &iwe, custom);
+
+	return start;
+}
+
+/*
+ * ieee80211_wx_get_scan - serialize the known networks into a scan reply.
+ *
+ * Walks ieee->network_list under ieee->lock and translates every network
+ * that is still fresh (or every network when ieee->scan_age is 0) into
+ * wireless-extension events in extra, using at most IW_SCAN_MAX_DATA bytes.
+ * wrqu->data.length receives the number of bytes produced.
+ *
+ * Always returns 0.  The count printed in the closing debug message covers
+ * every list entry visited, including those skipped because of their age.
+ */
+int ieee80211_wx_get_scan(struct ieee80211_device *ieee,
+			  struct iw_request_info *info,
+			  union iwreq_data *wrqu, char *extra)
+{
+	struct ieee80211_network *net;
+	unsigned long irq_flags;
+	char *cursor = extra;
+	char *limit = extra + IW_SCAN_MAX_DATA;
+	int visited = 0;
+
+	IEEE80211_DEBUG_WX("Getting scan\n");
+
+	spin_lock_irqsave(&ieee->lock, irq_flags);
+
+	list_for_each_entry(net, &ieee->network_list, list) {
+		visited++;
+
+		/* Entries older than scan_age are only logged */
+		if (ieee->scan_age != 0 &&
+		    !time_after(net->last_scanned + ieee->scan_age, jiffies)) {
+			IEEE80211_DEBUG_SCAN("Not showing network '%s ("
+					     MAC_FMT ")' due to age (%dms).\n",
+					     escape_essid(net->ssid,
+							  net->ssid_len),
+					     MAC_ARG(net->bssid),
+					     jiffies_to_msecs(jiffies -
+							      net->
+							      last_scanned));
+			continue;
+		}
+
+		cursor = ipw2100_translate_scan(ieee, cursor, limit, net);
+	}
+
+	spin_unlock_irqrestore(&ieee->lock, irq_flags);
+
+	wrqu->data.length = cursor - extra;
+	wrqu->data.flags = 0;
+
+	IEEE80211_DEBUG_WX("exit: %d networks returned.\n", visited);
+
+	return 0;
+}
+
+/*
+ * Handle a "set encode" wireless-extension request (legacy WEP interface).
+ *
+ * erq->flags selects the key slot (1-based IW_ENCODE_INDEX, 0 meaning the
+ * current TX key), IW_ENCODE_DISABLED turns encryption off, and
+ * IW_ENCODE_OPEN / IW_ENCODE_RESTRICTED choose the authentication mode.
+ * keybuf holds erq->length bytes of key material, if any.
+ *
+ * The resulting settings are collected in a local ieee80211_security and
+ * passed to ieee->set_security(); when host crypto is enabled the key is
+ * also loaded into the per-slot "WEP" crypto ops.
+ *
+ * Returns 0 on success, -EINVAL for a bad key index or a failed port reset,
+ * -ENOMEM or -EOPNOTSUPP if the WEP crypto context cannot be set up.
+ */
+int ieee80211_wx_set_encode(struct ieee80211_device *ieee,
+ struct iw_request_info *info,
+ union iwreq_data *wrqu, char *keybuf)
+{
+ struct iw_point *erq = &(wrqu->encoding);
+ struct net_device *dev = ieee->dev;
+ /* Only .flags is named; every other member starts out as zero */
+ struct ieee80211_security sec = {
+ .flags = 0
+ };
+ int i, key, key_provided, len;
+ struct ieee80211_crypt_data **crypt;
+ int host_crypto = ieee->host_encrypt || ieee->host_decrypt;
+
+ IEEE80211_DEBUG_WX("SET_ENCODE\n");
+
+ /* Convert the 1-based user index to a 0-based slot; 0 means "use the
+ * current TX key" */
+ key = erq->flags & IW_ENCODE_INDEX;
+ if (key) {
+ if (key > WEP_KEYS)
+ return -EINVAL;
+ key--;
+ key_provided = 1;
+ } else {
+ key_provided = 0;
+ key = ieee->tx_keyidx;
+ }
+
+ IEEE80211_DEBUG_WX("Key: %d [%s]\n", key, key_provided ?
+ "provided" : "default");
+
+ crypt = &ieee->crypt[key];
+
+ if (erq->flags & IW_ENCODE_DISABLED) {
+ if (key_provided && *crypt) {
+ IEEE80211_DEBUG_WX("Disabling encryption on key %d.\n",
+ key);
+ ieee80211_crypt_delayed_deinit(ieee, crypt);
+ } else
+ IEEE80211_DEBUG_WX("Disabling encryption.\n");
+
+ /* Check all the keys to see if any are still configured,
+ * and if no key index was provided, de-init them all */
+ for (i = 0; i < WEP_KEYS; i++) {
+ if (ieee->crypt[i] != NULL) {
+ if (key_provided)
+ break;
+ ieee80211_crypt_delayed_deinit(ieee,
+ &ieee->crypt[i]);
+ }
+ }
+
+ /* The loop ran to completion: no slot is left configured, so
+ * security as a whole is reported as disabled */
+ if (i == WEP_KEYS) {
+ sec.enabled = 0;
+ sec.encrypt = 0;
+ sec.level = SEC_LEVEL_0;
+ sec.flags |= SEC_ENABLED | SEC_LEVEL | SEC_ENCRYPT;
+ }
+
+ goto done;
+ }
+
+ sec.enabled = 1;
+ sec.encrypt = 1;
+ sec.flags |= SEC_ENABLED | SEC_ENCRYPT;
+
+ if (*crypt != NULL && (*crypt)->ops != NULL &&
+ strcmp((*crypt)->ops->name, "WEP") != 0) {
+ /* changing to use WEP; deinit previously used algorithm
+ * on this key */
+ ieee80211_crypt_delayed_deinit(ieee, crypt);
+ }
+
+ if (*crypt == NULL && host_crypto) {
+ struct ieee80211_crypt_data *new_crypt;
+
+ /* take WEP into use */
+ new_crypt = kmalloc(sizeof(struct ieee80211_crypt_data),
+ GFP_KERNEL);
+ if (new_crypt == NULL)
+ return -ENOMEM;
+ memset(new_crypt, 0, sizeof(struct ieee80211_crypt_data));
+ new_crypt->ops = ieee80211_get_crypto_ops("WEP");
+ /* Not registered yet: try loading the module, then look again */
+ if (!new_crypt->ops) {
+ request_module("ieee80211_crypt_wep");
+ new_crypt->ops = ieee80211_get_crypto_ops("WEP");
+ }
+
+ /* NOTE(review): if init() returns NULL after try_module_get()
+ * succeeded, no module_put() is visible on the failure path
+ * below -- looks like a leaked module reference; confirm. */
+ if (new_crypt->ops && try_module_get(new_crypt->ops->owner))
+ new_crypt->priv = new_crypt->ops->init(key);
+
+ if (!new_crypt->ops || !new_crypt->priv) {
+ kfree(new_crypt);
+ new_crypt = NULL;
+
+ printk(KERN_WARNING "%s: could not initialize WEP: "
+ "load module ieee80211_crypt_wep\n", dev->name);
+ return -EOPNOTSUPP;
+ }
+ *crypt = new_crypt;
+ }
+
+ /* If a new key was provided, set it up */
+ if (erq->length > 0) {
+ /* Stored key size is 5 or 13 bytes; shorter input is padded
+ * with zeros.
+ * NOTE(review): the memcpy copies erq->length bytes even when
+ * that exceeds 13 -- confirm the caller bounds erq->length to
+ * the size of sec.keys[key]. */
+ len = erq->length <= 5 ? 5 : 13;
+ memcpy(sec.keys[key], keybuf, erq->length);
+ if (len > erq->length)
+ memset(sec.keys[key] + erq->length, 0,
+ len - erq->length);
+ IEEE80211_DEBUG_WX("Setting key %d to '%s' (%d:%d bytes)\n",
+ key, escape_essid(sec.keys[key], len),
+ erq->length, len);
+ sec.key_sizes[key] = len;
+ if (*crypt)
+ (*crypt)->ops->set_key(sec.keys[key], len, NULL,
+ (*crypt)->priv);
+ sec.flags |= (1 << key);
+ /* This ensures a key will be activated if no key is
+ * explicitly set.  sec.active_key has not been assigned on
+ * this path, so it still holds the 0 from the initializer. */
+ if (key == sec.active_key)
+ sec.flags |= SEC_ACTIVE_KEY;
+
+ } else {
+ if (host_crypto) {
+ len = (*crypt)->ops->get_key(sec.keys[key], WEP_KEY_LEN,
+ NULL, (*crypt)->priv);
+ if (len == 0) {
+ /* Set a default key of all 0 */
+ IEEE80211_DEBUG_WX("Setting key %d to all "
+ "zero.\n", key);
+ memset(sec.keys[key], 0, 13);
+ (*crypt)->ops->set_key(sec.keys[key], 13, NULL,
+ (*crypt)->priv);
+ sec.key_sizes[key] = 13;
+ sec.flags |= (1 << key);
+ }
+ }
+ /* No key data - just set the default TX key index */
+ if (key_provided) {
+ IEEE80211_DEBUG_WX("Setting key %d to default Tx "
+ "key.\n", key);
+ ieee->tx_keyidx = key;
+ sec.active_key = key;
+ sec.flags |= SEC_ACTIVE_KEY;
+ }
+ }
+ /* The authentication mode only changes when the request names one */
+ if (erq->flags & (IW_ENCODE_OPEN | IW_ENCODE_RESTRICTED)) {
+ ieee->open_wep = !(erq->flags & IW_ENCODE_RESTRICTED);
+ sec.auth_mode = ieee->open_wep ? WLAN_AUTH_OPEN :
+ WLAN_AUTH_SHARED_KEY;
+ sec.flags |= SEC_AUTH_MODE;
+ IEEE80211_DEBUG_WX("Auth: %s\n",
+ sec.auth_mode == WLAN_AUTH_OPEN ?
+ "OPEN" : "SHARED KEY");
+ }
+
+ /* For now we just support WEP, so only set that security level...
+ * TODO: When WPA is added this is one place that needs to change */
+ sec.flags |= SEC_LEVEL;
+ sec.level = SEC_LEVEL_1; /* 40 and 104 bit WEP */
+ sec.encode_alg[key] = SEC_ALG_WEP;
+
+ done:
+ /* Hand the accumulated settings to the driver, if it wants them */
+ if (ieee->set_security)
+ ieee->set_security(dev, &sec);
+
+ /* Do not reset port if card is in Managed mode since resetting will
+ * generate new IEEE 802.11 authentication which may end up in looping
+ * with IEEE 802.1X. If your hardware requires a reset after WEP
+ * configuration (for example... Prism2), implement the reset_port in
+ * the callbacks structures used to initialize the 802.11 stack. */
+ if (ieee->reset_on_keychange &&
+ ieee->iw_mode != IW_MODE_INFRA &&
+ ieee->reset_port && ieee->reset_port(dev)) {
+ printk(KERN_DEBUG "%s: reset_port failed\n", dev->name);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * ieee80211_wx_get_encode - report the legacy (WEP style) encoding state.
+ *
+ * The key slot is taken from the IW_ENCODE_INDEX bits of erq->flags
+ * (1-based; 0 selects the current TX key).  On return erq->flags holds the
+ * 1-based slot plus IW_ENCODE_DISABLED, or IW_ENCODE_ENABLED together with
+ * IW_ENCODE_OPEN / IW_ENCODE_RESTRICTED; erq->length holds the key size and
+ * keybuf the key bytes recorded in ieee->sec.
+ *
+ * Returns 0, or -EINVAL when the requested index exceeds WEP_KEYS.
+ */
+int ieee80211_wx_get_encode(struct ieee80211_device *ieee,
+			    struct iw_request_info *info,
+			    union iwreq_data *wrqu, char *keybuf)
+{
+	struct iw_point *erq = &(wrqu->encoding);
+	struct ieee80211_security *sec = &ieee->sec;
+	int len, key;
+
+	IEEE80211_DEBUG_WX("GET_ENCODE\n");
+
+	key = erq->flags & IW_ENCODE_INDEX;
+	if (key) {
+		if (key > WEP_KEYS)
+			return -EINVAL;
+		key--;
+	} else
+		key = ieee->tx_keyidx;
+
+	erq->flags = key + 1;
+
+	if (!sec->enabled) {
+		erq->length = 0;
+		erq->flags |= IW_ENCODE_DISABLED;
+		return 0;
+	}
+
+	/* Clamp before copying so that a negative size can never reach
+	 * memcpy() as a huge length */
+	len = sec->key_sizes[key];
+	if (len < 0)
+		len = 0;
+	memcpy(keybuf, sec->keys[key], len);
+
+	erq->length = len;
+	erq->flags |= IW_ENCODE_ENABLED;
+
+	if (ieee->open_wep)
+		erq->flags |= IW_ENCODE_OPEN;
+	else
+		erq->flags |= IW_ENCODE_RESTRICTED;
+
+	return 0;
+}
+
+/*
+ * Handle a "set encodeext" wireless-extension request.
+ *
+ * extra carries a struct iw_encode_ext naming the algorithm (none, WEP,
+ * TKIP or CCMP), the key and its flags; encoding->flags carries the 1-based
+ * key index (0 selects the current TX key) and IW_ENCODE_DISABLED.
+ *
+ * Non-group (pairwise) keys are only accepted for index 0 while in
+ * IW_MODE_INFRA.  When the host performs the relevant crypto, the matching
+ * crypto ops are looked up (loading the module on demand) and the key is
+ * installed in ieee->crypt[idx].  The settings gathered in a local
+ * ieee80211_security are passed to ieee->set_security() on every path that
+ * reaches the "done" label, including the error paths.
+ *
+ * Returns 0 on success, -EINVAL or -ENOMEM on failure; a failed port reset
+ * overrides any earlier result with -EINVAL.
+ */
+int ieee80211_wx_set_encodeext(struct ieee80211_device *ieee,
+ struct iw_request_info *info,
+ union iwreq_data *wrqu, char *extra)
+{
+ struct net_device *dev = ieee->dev;
+ struct iw_point *encoding = &wrqu->encoding;
+ struct iw_encode_ext *ext = (struct iw_encode_ext *)extra;
+ int i, idx, ret = 0;
+ int group_key = 0;
+ const char *alg, *module;
+ struct ieee80211_crypto_ops *ops;
+ struct ieee80211_crypt_data **crypt;
+
+ /* Only .flags is named; every other member starts out as zero */
+ struct ieee80211_security sec = {
+ .flags = 0,
+ };
+
+ /* Convert the 1-based user index to a 0-based slot */
+ idx = encoding->flags & IW_ENCODE_INDEX;
+ if (idx) {
+ if (idx < 1 || idx > WEP_KEYS)
+ return -EINVAL;
+ idx--;
+ } else
+ idx = ieee->tx_keyidx;
+
+ if (ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY) {
+ crypt = &ieee->crypt[idx];
+ group_key = 1;
+ } else {
+ /* Pairwise key: only slot 0, and only in infrastructure mode */
+ if (idx != 0)
+ return -EINVAL;
+ if (ieee->iw_mode == IW_MODE_INFRA)
+ crypt = &ieee->crypt[idx];
+ else
+ return -EINVAL;
+ }
+
+ sec.flags |= SEC_ENABLED | SEC_ENCRYPT;
+ if ((encoding->flags & IW_ENCODE_DISABLED) ||
+ ext->alg == IW_ENCODE_ALG_NONE) {
+ if (*crypt)
+ ieee80211_crypt_delayed_deinit(ieee, crypt);
+
+ /* Security is only reported as disabled once no slot is left
+ * configured */
+ for (i = 0; i < WEP_KEYS; i++)
+ if (ieee->crypt[i] != NULL)
+ break;
+
+ if (i == WEP_KEYS) {
+ sec.enabled = 0;
+ sec.encrypt = 0;
+ sec.level = SEC_LEVEL_0;
+ sec.flags |= SEC_LEVEL;
+ }
+ goto done;
+ }
+
+ sec.enabled = 1;
+ sec.encrypt = 1;
+
+ /* Skip the host crypto setup when the host does not handle this kind
+ * of key: host_mc_decrypt decides for group keys, the host_encrypt /
+ * host_decrypt / host_encrypt_msdu flags for pairwise keys */
+ if (group_key ? !ieee->host_mc_decrypt :
+ !(ieee->host_encrypt || ieee->host_decrypt ||
+ ieee->host_encrypt_msdu))
+ goto skip_host_crypt;
+
+ switch (ext->alg) {
+ case IW_ENCODE_ALG_WEP:
+ alg = "WEP";
+ module = "ieee80211_crypt_wep";
+ break;
+ case IW_ENCODE_ALG_TKIP:
+ alg = "TKIP";
+ module = "ieee80211_crypt_tkip";
+ break;
+ case IW_ENCODE_ALG_CCMP:
+ alg = "CCMP";
+ module = "ieee80211_crypt_ccmp";
+ break;
+ default:
+ IEEE80211_DEBUG_WX("%s: unknown crypto alg %d\n",
+ dev->name, ext->alg);
+ ret = -EINVAL;
+ goto done;
+ }
+
+ /* Not registered yet: try loading the module, then look again */
+ ops = ieee80211_get_crypto_ops(alg);
+ if (ops == NULL) {
+ request_module(module);
+ ops = ieee80211_get_crypto_ops(alg);
+ }
+ if (ops == NULL) {
+ IEEE80211_DEBUG_WX("%s: unknown crypto alg %d\n",
+ dev->name, ext->alg);
+ ret = -EINVAL;
+ goto done;
+ }
+
+ /* Replace the slot's context when it is empty or uses other ops */
+ if (*crypt == NULL || (*crypt)->ops != ops) {
+ struct ieee80211_crypt_data *new_crypt;
+
+ ieee80211_crypt_delayed_deinit(ieee, crypt);
+
+ new_crypt = (struct ieee80211_crypt_data *)
+ kmalloc(sizeof(*new_crypt), GFP_KERNEL);
+ if (new_crypt == NULL) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ memset(new_crypt, 0, sizeof(struct ieee80211_crypt_data));
+ new_crypt->ops = ops;
+ /* NOTE(review): if init() returns NULL after try_module_get()
+ * succeeded, no module_put() is visible on the failure path
+ * below -- looks like a leaked module reference; confirm. */
+ if (new_crypt->ops && try_module_get(new_crypt->ops->owner))
+ new_crypt->priv = new_crypt->ops->init(idx);
+ if (new_crypt->priv == NULL) {
+ kfree(new_crypt);
+ ret = -EINVAL;
+ goto done;
+ }
+ *crypt = new_crypt;
+ }
+
+ if (ext->key_len > 0 && (*crypt)->ops->set_key &&
+ (*crypt)->ops->set_key(ext->key, ext->key_len, ext->rx_seq,
+ (*crypt)->priv) < 0) {
+ IEEE80211_DEBUG_WX("%s: key setting failed\n", dev->name);
+ ret = -EINVAL;
+ goto done;
+ }
+
+ skip_host_crypt:
+ if (ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY) {
+ ieee->tx_keyidx = idx;
+ sec.active_key = idx;
+ sec.flags |= SEC_ACTIVE_KEY;
+ }
+
+ if (ext->alg != IW_ENCODE_ALG_NONE) {
+ /* NOTE(review): ext->key_len is not checked here against the
+ * size of sec.keys[idx] -- confirm the caller bounds it. */
+ memcpy(sec.keys[idx], ext->key, ext->key_len);
+ sec.key_sizes[idx] = ext->key_len;
+ sec.flags |= (1 << idx);
+ if (ext->alg == IW_ENCODE_ALG_WEP) {
+ sec.encode_alg[idx] = SEC_ALG_WEP;
+ sec.flags |= SEC_LEVEL;
+ sec.level = SEC_LEVEL_1;
+ } else if (ext->alg == IW_ENCODE_ALG_TKIP) {
+ sec.encode_alg[idx] = SEC_ALG_TKIP;
+ sec.flags |= SEC_LEVEL;
+ sec.level = SEC_LEVEL_2;
+ } else if (ext->alg == IW_ENCODE_ALG_CCMP) {
+ sec.encode_alg[idx] = SEC_ALG_CCMP;
+ sec.flags |= SEC_LEVEL;
+ sec.level = SEC_LEVEL_3;
+ }
+ /* Don't set sec level for group keys. */
+ if (group_key)
+ sec.flags &= ~SEC_LEVEL;
+ }
+ done:
+ /* Reached on success and on the error paths above alike */
+ if (ieee->set_security)
+ ieee->set_security(ieee->dev, &sec);
+
+ /*
+ * Do not reset port if card is in Managed mode since resetting will
+ * generate new IEEE 802.11 authentication which may end up in looping
+ * with IEEE 802.1X. If your hardware requires a reset after WEP
+ * configuration (for example... Prism2), implement the reset_port in
+ * the callbacks structures used to initialize the 802.11 stack.
+ */
+ if (ieee->reset_on_keychange &&
+ ieee->iw_mode != IW_MODE_INFRA &&
+ ieee->reset_port && ieee->reset_port(dev)) {
+ IEEE80211_DEBUG_WX("%s: reset_port failed\n", dev->name);
+ return -EINVAL;
+ }
+
+ return ret;
+}
+
+/*
+ * ieee80211_wx_get_encodeext - report the extended encoding state of a key.
+ *
+ * encoding->flags selects the key slot (1-based IW_ENCODE_INDEX, 0 meaning
+ * the current TX key) and encoding->length must be at least
+ * sizeof(struct iw_encode_ext).  As in ieee80211_wx_set_encodeext(), a
+ * request without IW_ENCODE_EXT_GROUP_KEY is only valid for slot 0 while in
+ * IW_MODE_INFRA.
+ *
+ * On success *ext is rewritten with the algorithm and key recorded in
+ * ieee->sec, and encoding->flags with the 1-based slot plus
+ * IW_ENCODE_ENABLED or IW_ENCODE_DISABLED.
+ *
+ * Returns 0, or -EINVAL for a short buffer, a bad index, a pairwise request
+ * outside slot 0 / infrastructure mode, or an unrecognised stored algorithm.
+ */
+int ieee80211_wx_get_encodeext(struct ieee80211_device *ieee,
+			       struct iw_request_info *info,
+			       union iwreq_data *wrqu, char *extra)
+{
+	struct iw_point *encoding = &wrqu->encoding;
+	struct iw_encode_ext *ext = (struct iw_encode_ext *)extra;
+	struct ieee80211_security *sec = &ieee->sec;
+	int idx, max_key_len;
+
+	/* NOTE(review): max_key_len only rejects a buffer smaller than the
+	 * header; it is not used to bound the key copy further down.
+	 * Confirm sec->key_sizes[] can never exceed the space behind *ext. */
+	max_key_len = encoding->length - sizeof(*ext);
+	if (max_key_len < 0)
+		return -EINVAL;
+
+	idx = encoding->flags & IW_ENCODE_INDEX;
+	if (idx) {
+		if (idx < 1 || idx > WEP_KEYS)
+			return -EINVAL;
+		idx--;
+	} else
+		idx = ieee->tx_keyidx;
+
+	/* The parentheses matter: '!' binds tighter than '&', so the
+	 * unparenthesised form tested (!ext_flags) & GROUP_KEY and did not
+	 * test the group-key bit at all. */
+	if (!(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY))
+		if (idx != 0 || ieee->iw_mode != IW_MODE_INFRA)
+			return -EINVAL;
+
+	encoding->flags = idx + 1;
+	memset(ext, 0, sizeof(*ext));
+
+	if (!sec->enabled) {
+		ext->alg = IW_ENCODE_ALG_NONE;
+		ext->key_len = 0;
+		encoding->flags |= IW_ENCODE_DISABLED;
+	} else {
+		if (sec->encode_alg[idx] == SEC_ALG_WEP)
+			ext->alg = IW_ENCODE_ALG_WEP;
+		else if (sec->encode_alg[idx] == SEC_ALG_TKIP)
+			ext->alg = IW_ENCODE_ALG_TKIP;
+		else if (sec->encode_alg[idx] == SEC_ALG_CCMP)
+			ext->alg = IW_ENCODE_ALG_CCMP;
+		else
+			return -EINVAL;
+
+		ext->key_len = sec->key_sizes[idx];
+		memcpy(ext->key, sec->keys[idx], ext->key_len);
+		encoding->flags |= IW_ENCODE_ENABLED;
+		/* TKIP and CCMP keys are flagged as carrying a TX sequence */
+		if (ext->key_len &&
+		    (ext->alg == IW_ENCODE_ALG_TKIP ||
+		     ext->alg == IW_ENCODE_ALG_CCMP))
+			ext->ext_flags |= IW_ENCODE_EXT_TX_SEQ_VALID;
+
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL(ieee80211_wx_set_encodeext);
+EXPORT_SYMBOL(ieee80211_wx_get_encodeext);
+
+EXPORT_SYMBOL(ieee80211_wx_get_scan);
+EXPORT_SYMBOL(ieee80211_wx_set_encode);
+EXPORT_SYMBOL(ieee80211_wx_get_encode);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 0b3d9f1d806..e55136ae09f 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -413,20 +413,19 @@ config INET_TUNNEL
If unsure, say Y.
-config IP_TCPDIAG
- tristate "IP: TCP socket monitoring interface"
+config INET_DIAG
+ tristate "INET: socket monitoring interface"
default y
---help---
- Support for TCP socket monitoring interface used by native Linux
- tools such as ss. ss is included in iproute2, currently downloadable
- at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support
- and have selected IPv6 as a module, you need to build this as a
- module too.
+ Support for INET (TCP, DCCP, etc) socket monitoring interface used by
+ native Linux tools such as ss. ss is included in iproute2, currently
+ downloadable at <http://developer.osdl.org/dev/iproute2>.
If unsure, say Y.
-config IP_TCPDIAG_IPV6
- def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
+config INET_TCP_DIAG
+ depends on INET_DIAG
+ def_tristate INET_DIAG
config TCP_CONG_ADVANCED
bool "TCP: advanced congestion control"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 55dc6cca1e7..f0435d00db6 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -4,11 +4,12 @@
obj-y := route.o inetpeer.o protocol.o \
ip_input.o ip_fragment.o ip_forward.o ip_options.o \
- ip_output.o ip_sockglue.o \
+ ip_output.o ip_sockglue.o inet_hashtables.o \
+ inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o \
datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
- sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
+ sysctl_net_ipv4.o fib_frontend.o fib_semantics.o netfilter.o
obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
@@ -29,8 +30,9 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_IP_VS) += ipvs/
-obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
+obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
+obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 163ae4068b5..a9d84f93442 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -99,6 +99,7 @@
#include <net/arp.h>
#include <net/route.h>
#include <net/ip_fib.h>
+#include <net/inet_connection_sock.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <linux/skbuff.h>
@@ -112,11 +113,7 @@
#include <linux/mroute.h>
#endif
-DEFINE_SNMP_STAT(struct linux_mib, net_statistics);
-
-#ifdef INET_REFCNT_DEBUG
-atomic_t inet_sock_nr;
-#endif
+DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
extern void ip_mc_drop_socket(struct sock *sk);
@@ -153,11 +150,7 @@ void inet_sock_destruct(struct sock *sk)
if (inet->opt)
kfree(inet->opt);
dst_release(sk->sk_dst_cache);
-#ifdef INET_REFCNT_DEBUG
- atomic_dec(&inet_sock_nr);
- printk(KERN_DEBUG "INET socket %p released, %d are still alive\n",
- sk, atomic_read(&inet_sock_nr));
-#endif
+ sk_refcnt_debug_dec(sk);
}
/*
@@ -210,7 +203,7 @@ int inet_listen(struct socket *sock, int backlog)
* we can only allow the backlog to be adjusted.
*/
if (old_state != TCP_LISTEN) {
- err = tcp_listen_start(sk);
+ err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
if (err)
goto out;
}
@@ -235,12 +228,14 @@ static int inet_create(struct socket *sock, int protocol)
struct proto *answer_prot;
unsigned char answer_flags;
char answer_no_check;
- int err;
+ int try_loading_module = 0;
+ int err = -ESOCKTNOSUPPORT;
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
answer = NULL;
+lookup_protocol:
rcu_read_lock();
list_for_each_rcu(p, &inetsw[sock->type]) {
answer = list_entry(p, struct inet_protosw, list);
@@ -261,9 +256,28 @@ static int inet_create(struct socket *sock, int protocol)
answer = NULL;
}
- err = -ESOCKTNOSUPPORT;
- if (!answer)
- goto out_rcu_unlock;
+ if (unlikely(answer == NULL)) {
+ if (try_loading_module < 2) {
+ rcu_read_unlock();
+ /*
+ * Be more specific, e.g. net-pf-2-proto-132-type-1
+ * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+ */
+ if (++try_loading_module == 1)
+ request_module("net-pf-%d-proto-%d-type-%d",
+ PF_INET, protocol, sock->type);
+ /*
+ * Fall back to generic, e.g. net-pf-2-proto-132
+ * (net-pf-PF_INET-proto-IPPROTO_SCTP)
+ */
+ else
+ request_module("net-pf-%d-proto-%d",
+ PF_INET, protocol);
+ goto lookup_protocol;
+ } else
+ goto out_rcu_unlock;
+ }
+
err = -EPERM;
if (answer->capability > 0 && !capable(answer->capability))
goto out_rcu_unlock;
@@ -317,9 +331,7 @@ static int inet_create(struct socket *sock, int protocol)
inet->mc_index = 0;
inet->mc_list = NULL;
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet_sock_nr);
-#endif
+ sk_refcnt_debug_inc(sk);
if (inet->num) {
/* It assumes that any protocol which allows
@@ -847,10 +859,6 @@ static struct net_proto_family inet_family_ops = {
.owner = THIS_MODULE,
};
-
-extern void tcp_init(void);
-extern void tcp_v4_init(struct net_proto_family *);
-
/* Upon startup we insert all the elements in inetsw_array[] into
* the linked list inetsw.
*/
@@ -961,6 +969,119 @@ void inet_unregister_protosw(struct inet_protosw *p)
}
}
+/*
+ * Shall we try to damage output packets if routing dev changes?
+ */
+
+int sysctl_ip_dynaddr;
+
+/*
+ * Look up a fresh route for the socket and, if that route uses a different
+ * source address, switch the socket over to it and rehash the socket.
+ *
+ * Returns 0 on success (whether or not the address changed) or the error
+ * reported by ip_route_connect().
+ */
+static int inet_sk_reselect_saddr(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	__u32 prev_src = inet->saddr;
+	__u32 dst = inet->daddr;
+	__u32 next_src;
+	struct rtable *route;
+	int rc;
+
+	/* With a source route option, route towards its first hop */
+	if (inet->opt && inet->opt->srr)
+		dst = inet->opt->faddr;
+
+	/* Query new route. */
+	rc = ip_route_connect(&route, dst, 0,
+			      RT_CONN_FLAGS(sk),
+			      sk->sk_bound_dev_if,
+			      sk->sk_protocol,
+			      inet->sport, inet->dport, sk);
+	if (rc)
+		return rc;
+
+	sk_setup_caps(sk, &route->u.dst);
+
+	next_src = route->rt_src;
+	if (next_src != prev_src) {
+		if (sysctl_ip_dynaddr > 1) {
+			printk(KERN_INFO "%s(): shifting inet->"
+					 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
+			       __FUNCTION__,
+			       NIPQUAD(prev_src),
+			       NIPQUAD(next_src));
+		}
+
+		inet->saddr = inet->rcv_saddr = next_src;
+
+		/*
+		 * XXX The one ugly spot where the identity of a socket
+		 * XXX really has to change after it has entered the
+		 * XXX hashes. -DaveM
+		 *
+		 * Connection uniqueness is not checked here either, so
+		 * expect trouble.
+		 */
+		__sk_prot_rehash(sk);
+	}
+
+	return 0;
+}
+
+/*
+ * inet_sk_rebuild_header - make sure the socket has a usable route.
+ *
+ * Nothing is done when the cached route is still valid.  Otherwise a new
+ * output route is looked up; if that fails, the socket's route capabilities
+ * are cleared and -- only when sysctl_ip_dynaddr is set, the socket is in
+ * TCP_SYN_SENT and its address is not locked by a bind -- a new source
+ * address is selected instead.
+ *
+ * Returns 0 on success, otherwise the routing error, which is also recorded
+ * (negated) in sk->sk_err_soft.
+ */
+int inet_sk_rebuild_header(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct rtable *route = (struct rtable *)__sk_dst_check(sk, 0);
+	u32 dst;
+	int rc;
+
+	/* Route is OK, nothing to do. */
+	if (route != NULL)
+		return 0;
+
+	/* Reroute; with a source route option aim at its first hop. */
+	dst = (inet->opt && inet->opt->srr) ? inet->opt->faddr : inet->daddr;
+
+	{
+		struct flowi fl = {
+			.oif = sk->sk_bound_dev_if,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = dst,
+					.saddr = inet->saddr,
+					.tos = RT_CONN_FLAGS(sk),
+				},
+			},
+			.proto = sk->sk_protocol,
+			.uli_u = {
+				.ports = {
+					.sport = inet->sport,
+					.dport = inet->dport,
+				},
+			},
+		};
+
+		rc = ip_route_output_flow(&route, &fl, sk, 0);
+	}
+
+	if (rc == 0) {
+		sk_setup_caps(sk, &route->u.dst);
+		return 0;
+	}
+
+	/* Routing failed... */
+	sk->sk_route_caps = 0;
+
+	/*
+	 * Other protocols have to map their equivalent state to
+	 * TCP_SYN_SENT.  DCCP maps its DCCP_REQUESTING state to
+	 * TCP_SYN_SENT. -acme
+	 */
+	if (!sysctl_ip_dynaddr ||
+	    sk->sk_state != TCP_SYN_SENT ||
+	    (sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
+		sk->sk_err_soft = -rc;
+	} else {
+		rc = inet_sk_reselect_saddr(sk);
+		if (rc != 0)
+			sk->sk_err_soft = -rc;
+	}
+
+	return rc;
+}
+
+EXPORT_SYMBOL(inet_sk_rebuild_header);
+
#ifdef CONFIG_IP_MULTICAST
static struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
@@ -1007,7 +1128,6 @@ static int __init init_ipv4_mibs(void)
}
static int ipv4_proc_init(void);
-extern void ipfrag_init(void);
/*
* IP protocol layer initialiser
@@ -1128,20 +1248,6 @@ module_init(inet_init);
/* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS
-extern int fib_proc_init(void);
-extern void fib_proc_exit(void);
-#ifdef CONFIG_IP_FIB_TRIE
-extern int fib_stat_proc_init(void);
-extern void fib_stat_proc_exit(void);
-#endif
-extern int ip_misc_proc_init(void);
-extern int raw_proc_init(void);
-extern void raw_proc_exit(void);
-extern int tcp4_proc_init(void);
-extern void tcp4_proc_exit(void);
-extern int udp4_proc_init(void);
-extern void udp4_proc_exit(void);
-
static int __init ipv4_proc_init(void)
{
int rc = 0;
@@ -1154,19 +1260,11 @@ static int __init ipv4_proc_init(void)
goto out_udp;
if (fib_proc_init())
goto out_fib;
-#ifdef CONFIG_IP_FIB_TRIE
- if (fib_stat_proc_init())
- goto out_fib_stat;
-#endif
if (ip_misc_proc_init())
goto out_misc;
out:
return rc;
out_misc:
-#ifdef CONFIG_IP_FIB_TRIE
- fib_stat_proc_exit();
-out_fib_stat:
-#endif
fib_proc_exit();
out_fib:
udp4_proc_exit();
@@ -1205,7 +1303,3 @@ EXPORT_SYMBOL(inet_stream_ops);
EXPORT_SYMBOL(inet_unregister_protosw);
EXPORT_SYMBOL(net_statistics);
EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
-
-#ifdef INET_REFCNT_DEBUG
-EXPORT_SYMBOL(inet_sock_nr);
-#endif
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 514c85b2631..035ad2c9e1b 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -263,10 +263,8 @@ static int ah_init_state(struct xfrm_state *x)
error:
if (ahp) {
- if (ahp->work_icv)
- kfree(ahp->work_icv);
- if (ahp->tfm)
- crypto_free_tfm(ahp->tfm);
+ kfree(ahp->work_icv);
+ crypto_free_tfm(ahp->tfm);
kfree(ahp);
}
return -EINVAL;
@@ -279,14 +277,10 @@ static void ah_destroy(struct xfrm_state *x)
if (!ahp)
return;
- if (ahp->work_icv) {
- kfree(ahp->work_icv);
- ahp->work_icv = NULL;
- }
- if (ahp->tfm) {
- crypto_free_tfm(ahp->tfm);
- ahp->tfm = NULL;
- }
+ kfree(ahp->work_icv);
+ ahp->work_icv = NULL;
+ crypto_free_tfm(ahp->tfm);
+ ahp->tfm = NULL;
kfree(ahp);
}
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index a642fd61285..b425748f02d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -241,7 +241,7 @@ static int arp_constructor(struct neighbour *neigh)
neigh->type = inet_addr_type(addr);
rcu_read_lock();
- in_dev = rcu_dereference(__in_dev_get(dev));
+ in_dev = __in_dev_get_rcu(dev);
if (in_dev == NULL) {
rcu_read_unlock();
return -EINVAL;
@@ -697,12 +697,6 @@ void arp_send(int type, int ptype, u32 dest_ip,
arp_xmit(skb);
}
-static void parp_redo(struct sk_buff *skb)
-{
- nf_reset(skb);
- arp_rcv(skb, skb->dev, NULL);
-}
-
/*
* Process an arp request.
*/
@@ -865,7 +859,7 @@ static int arp_process(struct sk_buff *skb)
if (n)
neigh_release(n);
- if (skb->stamp.tv_sec == LOCALLY_ENQUEUED ||
+ if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
skb->pkt_type == PACKET_HOST ||
in_dev->arp_parms->proxy_delay == 0) {
arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
@@ -922,12 +916,17 @@ out:
return 0;
}
+static void parp_redo(struct sk_buff *skb)
+{
+ arp_process(skb);
+}
+
/*
* Receive an arp request from the device layer.
*/
-int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct arphdr *arp;
@@ -948,6 +947,8 @@ int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
goto out_of_mem;
+ memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+
return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
freeskb:
@@ -988,8 +989,8 @@ static int arp_req_set(struct arpreq *r, struct net_device * dev)
ipv4_devconf.proxy_arp = 1;
return 0;
}
- if (__in_dev_get(dev)) {
- __in_dev_get(dev)->cnf.proxy_arp = 1;
+ if (__in_dev_get_rtnl(dev)) {
+ __in_dev_get_rtnl(dev)->cnf.proxy_arp = 1;
return 0;
}
return -ENXIO;
@@ -1094,8 +1095,8 @@ static int arp_req_delete(struct arpreq *r, struct net_device * dev)
ipv4_devconf.proxy_arp = 0;
return 0;
}
- if (__in_dev_get(dev)) {
- __in_dev_get(dev)->cnf.proxy_arp = 0;
+ if (__in_dev_get_rtnl(dev)) {
+ __in_dev_get_rtnl(dev)->cnf.proxy_arp = 0;
return 0;
}
return -ENXIO;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index b1db561f254..c1b42b5257f 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -16,9 +16,10 @@
#include <linux/module.h>
#include <linux/ip.h>
#include <linux/in.h>
+#include <net/ip.h>
#include <net/sock.h>
-#include <net/tcp.h>
#include <net/route.h>
+#include <net/tcp_states.h>
int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d8a10e3dd77..4ec4b2ca6ab 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -351,7 +351,7 @@ static int inet_insert_ifa(struct in_ifaddr *ifa)
static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
{
- struct in_device *in_dev = __in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
ASSERT_RTNL();
@@ -449,7 +449,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
goto out;
rc = -ENOBUFS;
- if ((in_dev = __in_dev_get(dev)) == NULL) {
+ if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
in_dev = inetdev_init(dev);
if (!in_dev)
goto out;
@@ -584,7 +584,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
if (colon)
*colon = ':';
- if ((in_dev = __in_dev_get(dev)) != NULL) {
+ if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
if (tryaddrmatch) {
/* Matthias Andree */
/* compare label and address (4.4BSD style) */
@@ -715,6 +715,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
break;
ret = 0;
if (ifa->ifa_mask != sin->sin_addr.s_addr) {
+ u32 old_mask = ifa->ifa_mask;
inet_del_ifa(in_dev, ifap, 0);
ifa->ifa_mask = sin->sin_addr.s_addr;
ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);
@@ -728,7 +729,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
if ((dev->flags & IFF_BROADCAST) &&
(ifa->ifa_prefixlen < 31) &&
(ifa->ifa_broadcast ==
- (ifa->ifa_local|~ifa->ifa_mask))) {
+ (ifa->ifa_local|~old_mask))) {
ifa->ifa_broadcast = (ifa->ifa_local |
~sin->sin_addr.s_addr);
}
@@ -748,7 +749,7 @@ rarok:
static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
{
- struct in_device *in_dev = __in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
struct in_ifaddr *ifa;
struct ifreq ifr;
int done = 0;
@@ -791,7 +792,7 @@ u32 inet_select_addr(const struct net_device *dev, u32 dst, int scope)
struct in_device *in_dev;
rcu_read_lock();
- in_dev = __in_dev_get(dev);
+ in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
goto no_in_dev;
@@ -818,7 +819,7 @@ no_in_dev:
read_lock(&dev_base_lock);
rcu_read_lock();
for (dev = dev_base; dev; dev = dev->next) {
- if ((in_dev = __in_dev_get(dev)) == NULL)
+ if ((in_dev = __in_dev_get_rcu(dev)) == NULL)
continue;
for_primary_ifa(in_dev) {
@@ -887,7 +888,7 @@ u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scop
if (dev) {
rcu_read_lock();
- if ((in_dev = __in_dev_get(dev)))
+ if ((in_dev = __in_dev_get_rcu(dev)))
addr = confirm_addr_indev(in_dev, dst, local, scope);
rcu_read_unlock();
@@ -897,7 +898,7 @@ u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scop
read_lock(&dev_base_lock);
rcu_read_lock();
for (dev = dev_base; dev; dev = dev->next) {
- if ((in_dev = __in_dev_get(dev))) {
+ if ((in_dev = __in_dev_get_rcu(dev))) {
addr = confirm_addr_indev(in_dev, dst, local, scope);
if (addr)
break;
@@ -957,7 +958,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
struct net_device *dev = ptr;
- struct in_device *in_dev = __in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
ASSERT_RTNL();
@@ -1078,7 +1079,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
if (idx > s_idx)
s_ip_idx = 0;
rcu_read_lock();
- if ((in_dev = __in_dev_get(dev)) == NULL) {
+ if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
rcu_read_unlock();
continue;
}
@@ -1111,13 +1112,12 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
if (!skb)
- netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, ENOBUFS);
else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
kfree_skb(skb);
- netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, EINVAL);
} else {
- NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL);
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV4_IFADDR, GFP_KERNEL);
}
}
@@ -1150,7 +1150,7 @@ void inet_forward_change(void)
for (dev = dev_base; dev; dev = dev->next) {
struct in_device *in_dev;
rcu_read_lock();
- in_dev = __in_dev_get(dev);
+ in_dev = __in_dev_get_rcu(dev);
if (in_dev)
in_dev->cnf.forwarding = on;
rcu_read_unlock();
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index ba57446d5d1..1b18ce66e7b 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -5,6 +5,7 @@
#include <net/esp.h>
#include <asm/scatterlist.h>
#include <linux/crypto.h>
+#include <linux/kernel.h>
#include <linux/pfkeyv2.h>
#include <linux/random.h>
#include <net/icmp.h>
@@ -42,10 +43,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
esp = x->data;
alen = esp->auth.icv_trunc_len;
tfm = esp->conf.tfm;
- blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3;
- clen = (clen + 2 + blksize-1)&~(blksize-1);
+ blksize = ALIGN(crypto_tfm_alg_blocksize(tfm), 4);
+ clen = ALIGN(clen + 2, blksize);
if (esp->conf.padlen)
- clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1);
+ clen = ALIGN(clen, esp->conf.padlen);
if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0)
goto error;
@@ -143,7 +144,7 @@ static int esp_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struc
struct ip_esp_hdr *esph;
struct esp_data *esp = x->data;
struct sk_buff *trailer;
- int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
+ int blksize = ALIGN(crypto_tfm_alg_blocksize(esp->conf.tfm), 4);
int alen = esp->auth.icv_trunc_len;
int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen;
int nfrags;
@@ -304,16 +305,16 @@ static int esp_post_input(struct xfrm_state *x, struct xfrm_decap_state *decap,
static u32 esp4_get_max_size(struct xfrm_state *x, int mtu)
{
struct esp_data *esp = x->data;
- u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
+ u32 blksize = ALIGN(crypto_tfm_alg_blocksize(esp->conf.tfm), 4);
if (x->props.mode) {
- mtu = (mtu + 2 + blksize-1)&~(blksize-1);
+ mtu = ALIGN(mtu + 2, blksize);
} else {
/* The worst case. */
- mtu += 2 + blksize;
+ mtu = ALIGN(mtu + 2, 4) + blksize - 4;
}
if (esp->conf.padlen)
- mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1);
+ mtu = ALIGN(mtu, esp->conf.padlen);
return mtu + x->props.header_len + esp->auth.icv_trunc_len;
}
@@ -331,8 +332,8 @@ static void esp4_err(struct sk_buff *skb, u32 info)
x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
if (!x)
return;
- NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
- ntohl(esph->spi), ntohl(iph->daddr)));
+ NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
+ ntohl(esph->spi), ntohl(iph->daddr));
xfrm_state_put(x);
}
@@ -343,22 +344,14 @@ static void esp_destroy(struct xfrm_state *x)
if (!esp)
return;
- if (esp->conf.tfm) {
- crypto_free_tfm(esp->conf.tfm);
- esp->conf.tfm = NULL;
- }
- if (esp->conf.ivec) {
- kfree(esp->conf.ivec);
- esp->conf.ivec = NULL;
- }
- if (esp->auth.tfm) {
- crypto_free_tfm(esp->auth.tfm);
- esp->auth.tfm = NULL;
- }
- if (esp->auth.work_icv) {
- kfree(esp->auth.work_icv);
- esp->auth.work_icv = NULL;
- }
+ crypto_free_tfm(esp->conf.tfm);
+ esp->conf.tfm = NULL;
+ kfree(esp->conf.ivec);
+ esp->conf.ivec = NULL;
+ crypto_free_tfm(esp->auth.tfm);
+ esp->auth.tfm = NULL;
+ kfree(esp->auth.work_icv);
+ esp->auth.work_icv = NULL;
kfree(esp);
}
@@ -395,10 +388,10 @@ static int esp_init_state(struct xfrm_state *x)
if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
crypto_tfm_alg_digestsize(esp->auth.tfm)) {
- NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n",
- x->aalg->alg_name,
- crypto_tfm_alg_digestsize(esp->auth.tfm),
- aalg_desc->uinfo.auth.icv_fullbits/8));
+ NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
+ x->aalg->alg_name,
+ crypto_tfm_alg_digestsize(esp->auth.tfm),
+ aalg_desc->uinfo.auth.icv_fullbits/8);
goto error;
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index cd8e45ab958..990633c09df 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -173,7 +173,7 @@ int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
no_addr = rpf = 0;
rcu_read_lock();
- in_dev = __in_dev_get(dev);
+ in_dev = __in_dev_get_rcu(dev);
if (in_dev) {
no_addr = in_dev->ifa_list == NULL;
rpf = IN_DEV_RPFILTER(in_dev);
@@ -558,16 +558,15 @@ static void nl_fib_input(struct sock *sk, int len)
nl_fib_lookup(frn, tb);
pid = nlh->nlmsg_pid; /*pid of sending process */
- NETLINK_CB(skb).groups = 0; /* not in mcast group */
NETLINK_CB(skb).pid = 0; /* from kernel */
NETLINK_CB(skb).dst_pid = pid;
- NETLINK_CB(skb).dst_groups = 0; /* unicast */
+ NETLINK_CB(skb).dst_group = 0; /* unicast */
netlink_unicast(sk, skb, pid, MSG_DONTWAIT);
}
static void nl_fib_lookup_init(void)
{
- netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input);
+ netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE);
}
static void fib_disable_ip(struct net_device *dev, int force)
@@ -592,7 +591,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
break;
case NETDEV_DOWN:
fib_del_ifaddr(ifa);
- if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) {
+ if (ifa->ifa_dev->ifa_list == NULL) {
/* Last address was deleted from this interface.
Disable IP.
*/
@@ -608,7 +607,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = ptr;
- struct in_device *in_dev = __in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
if (event == NETDEV_UNREGISTER) {
fib_disable_ip(dev, 2);
@@ -662,5 +661,4 @@ void __init ip_fib_init(void)
}
EXPORT_SYMBOL(inet_addr_type);
-EXPORT_SYMBOL(ip_dev_find);
EXPORT_SYMBOL(ip_rt_ioctl);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index b10d6bb5ef3..2a8c9afc369 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -45,8 +45,8 @@
#include "fib_lookup.h"
-static kmem_cache_t *fn_hash_kmem;
-static kmem_cache_t *fn_alias_kmem;
+static kmem_cache_t *fn_hash_kmem __read_mostly;
+static kmem_cache_t *fn_alias_kmem __read_mostly;
struct fib_node {
struct hlist_node fn_hash;
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index b729d97cfa9..ef6609ea0eb 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -7,6 +7,7 @@
struct fib_alias {
struct list_head fa_list;
+ struct rcu_head rcu;
struct fib_info *fa_info;
u8 fa_tos;
u8 fa_type;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c886b28ba9f..186f20c4a45 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -290,10 +290,10 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
kfree_skb(skb);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
+ NETLINK_CB(skb).dst_group = RTNLGRP_IPV4_ROUTE;
if (n->nlmsg_flags&NLM_F_ECHO)
atomic_inc(&skb->users);
- netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
+ netlink_broadcast(rtnl, skb, pid, RTNLGRP_IPV4_ROUTE, GFP_KERNEL);
if (n->nlmsg_flags&NLM_F_ECHO)
netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
}
@@ -593,10 +593,13 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
struct hlist_head *new_laddrhash,
unsigned int new_size)
{
+ struct hlist_head *old_info_hash, *old_laddrhash;
unsigned int old_size = fib_hash_size;
- unsigned int i;
+ unsigned int i, bytes;
write_lock(&fib_info_lock);
+ old_info_hash = fib_info_hash;
+ old_laddrhash = fib_info_laddrhash;
fib_hash_size = new_size;
for (i = 0; i < old_size; i++) {
@@ -636,6 +639,10 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
fib_info_laddrhash = new_laddrhash;
write_unlock(&fib_info_lock);
+
+ bytes = old_size * sizeof(struct hlist_head *);
+ fib_hash_free(old_info_hash, bytes);
+ fib_hash_free(old_laddrhash, bytes);
}
struct fib_info *
@@ -847,6 +854,7 @@ failure:
return NULL;
}
+/* Note! fib_semantic_match intentionally uses RCU list functions. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
struct fib_result *res, __u32 zone, __u32 mask,
int prefixlen)
@@ -854,7 +862,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
struct fib_alias *fa;
int nh_sel = 0;
- list_for_each_entry(fa, head, fa_list) {
+ list_for_each_entry_rcu(fa, head, fa_list) {
int err;
if (fa->fa_tos &&
@@ -1079,7 +1087,7 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
rta->rta_oif = &dev->ifindex;
if (colon) {
struct in_ifaddr *ifa;
- struct in_device *in_dev = __in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
if (!in_dev)
return -ENODEV;
*colon = ':';
@@ -1260,7 +1268,7 @@ int fib_sync_up(struct net_device *dev)
}
if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
continue;
- if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
+ if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
continue;
alive++;
spin_lock_bh(&fib_multipath_lock);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index a701405fab0..66247f38b37 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
* 2 of the License, or (at your option) any later version.
*/
-#define VERSION "0.325"
+#define VERSION "0.404"
#include <linux/config.h>
#include <asm/uaccess.h>
@@ -62,6 +62,7 @@
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
@@ -77,56 +78,55 @@
#undef CONFIG_IP_FIB_TRIE_STATS
#define MAX_CHILDS 16384
-#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
#define KEYLENGTH (8*sizeof(t_key))
#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
-static DEFINE_RWLOCK(fib_lock);
-
typedef unsigned int t_key;
#define T_TNODE 0
#define T_LEAF 1
#define NODE_TYPE_MASK 0x1UL
-#define NODE_PARENT(_node) \
- ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
-#define NODE_SET_PARENT(_node, _ptr) \
- ((_node)->_parent = (((unsigned long)(_ptr)) | \
- ((_node)->_parent & NODE_TYPE_MASK)))
-#define NODE_INIT_PARENT(_node, _type) \
- ((_node)->_parent = (_type))
-#define NODE_TYPE(_node) \
- ((_node)->_parent & NODE_TYPE_MASK)
-
-#define IS_TNODE(n) (!(n->_parent & T_LEAF))
-#define IS_LEAF(n) (n->_parent & T_LEAF)
+#define NODE_PARENT(node) \
+ ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
+
+#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
+
+#define NODE_SET_PARENT(node, ptr) \
+ rcu_assign_pointer((node)->parent, \
+ ((unsigned long)(ptr)) | NODE_TYPE(node))
+
+#define IS_TNODE(n) (!(n->parent & T_LEAF))
+#define IS_LEAF(n) (n->parent & T_LEAF)
struct node {
- t_key key;
- unsigned long _parent;
+ t_key key;
+ unsigned long parent;
};
struct leaf {
- t_key key;
- unsigned long _parent;
+ t_key key;
+ unsigned long parent;
struct hlist_head list;
+ struct rcu_head rcu;
};
struct leaf_info {
struct hlist_node hlist;
+ struct rcu_head rcu;
int plen;
struct list_head falh;
};
struct tnode {
- t_key key;
- unsigned long _parent;
- unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
- unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
- unsigned short full_children; /* KEYLENGTH bits needed */
- unsigned short empty_children; /* KEYLENGTH bits needed */
- struct node *child[0];
+ t_key key;
+ unsigned long parent;
+ unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
+ unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
+ unsigned short full_children; /* KEYLENGTH bits needed */
+ unsigned short empty_children; /* KEYLENGTH bits needed */
+ struct rcu_head rcu;
+ struct node *child[0];
};
#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -150,77 +150,44 @@ struct trie_stat {
};
struct trie {
- struct node *trie;
+ struct node *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats stats;
#endif
- int size;
+ int size;
unsigned int revision;
};
-static int trie_debug = 0;
-
-static int tnode_full(struct tnode *tn, struct node *n);
static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
-static int tnode_child_length(struct tnode *tn);
static struct node *resize(struct trie *t, struct tnode *tn);
-static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
-static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
+static struct tnode *inflate(struct trie *t, struct tnode *tn);
+static struct tnode *halve(struct trie *t, struct tnode *tn);
static void tnode_free(struct tnode *tn);
-static void trie_dump_seq(struct seq_file *seq, struct trie *t);
-extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
-extern int fib_detect_death(struct fib_info *fi, int order,
- struct fib_info **last_resort, int *last_idx, int *dflt);
-extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
- struct nlmsghdr *n, struct netlink_skb_parms *req);
-
-static kmem_cache_t *fn_alias_kmem;
+static kmem_cache_t *fn_alias_kmem __read_mostly;
static struct trie *trie_local = NULL, *trie_main = NULL;
-static void trie_bug(char *err)
-{
- printk("Trie Bug: %s\n", err);
- BUG();
-}
+
+/* rcu_read_lock needs to be held by the caller on the read side */
static inline struct node *tnode_get_child(struct tnode *tn, int i)
{
- if (i >= 1<<tn->bits)
- trie_bug("tnode_get_child");
+ BUG_ON(i >= 1 << tn->bits);
- return tn->child[i];
+ return rcu_dereference(tn->child[i]);
}
-static inline int tnode_child_length(struct tnode *tn)
+static inline int tnode_child_length(const struct tnode *tn)
{
- return 1<<tn->bits;
+ return 1 << tn->bits;
}
-/*
- _________________________________________________________________
- | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
- ----------------------------------------------------------------
- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-
- _________________________________________________________________
- | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
- -----------------------------------------------------------------
- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
-
- tp->pos = 7
- tp->bits = 3
- n->pos = 15
- n->bits=4
- KEYLENGTH=32
-*/
-
static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
{
- if (offset < KEYLENGTH)
+ if (offset < KEYLENGTH)
return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
- else
+ else
return 0;
}
@@ -233,8 +200,8 @@ static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
if (bits == 0 || offset >= KEYLENGTH)
return 1;
- bits = bits > KEYLENGTH ? KEYLENGTH : bits;
- return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
+ bits = bits > KEYLENGTH ? KEYLENGTH : bits;
+ return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
}
static inline int tkey_mismatch(t_key a, int offset, t_key b)
@@ -249,14 +216,6 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b)
return i;
}
-/* Candiate for fib_semantics */
-
-static void fn_free_alias(struct fib_alias *fa)
-{
- fib_release_info(fa->fa_info);
- kmem_cache_free(fn_alias_kmem, fa);
-}
-
/*
To understand this stuff, an understanding of keys and all their bits is
necessary. Every node in the trie has a key associated with it, but not
@@ -265,7 +224,7 @@ static void fn_free_alias(struct fib_alias *fa)
Consider a node 'n' and its parent 'tp'.
If n is a leaf, every bit in its key is significant. Its presence is
- necessitaded by path compression, since during a tree traversal (when
+ necessitated by path compression, since during a tree traversal (when
searching for a leaf - unless we are doing an insertion) we will completely
ignore all skipped bits we encounter. Thus we need to verify, at the end of
a potentially successful search, that we have indeed been walking the
@@ -295,7 +254,7 @@ static void fn_free_alias(struct fib_alias *fa)
tp->pos = 7
tp->bits = 3
n->pos = 15
- n->bits=4
+ n->bits = 4
First, let's just ignore the bits that come before the parent tp, that is
the bits from 0 to (tp->pos-1). They are *known* but at this point we do
@@ -320,60 +279,67 @@ static void fn_free_alias(struct fib_alias *fa)
*/
-static void check_tnode(struct tnode *tn)
+static inline void check_tnode(const struct tnode *tn)
{
- if (tn && tn->pos+tn->bits > 32) {
- printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
- }
+ WARN_ON(tn && tn->pos+tn->bits > 32);
}
static int halve_threshold = 25;
static int inflate_threshold = 50;
+static int halve_threshold_root = 15;
+static int inflate_threshold_root = 25;
-static struct leaf *leaf_new(void)
+
+static void __alias_free_mem(struct rcu_head *head)
{
- struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
- if (l) {
- NODE_INIT_PARENT(l, T_LEAF);
- INIT_HLIST_HEAD(&l->list);
- }
- return l;
+ struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+ kmem_cache_free(fn_alias_kmem, fa);
}
-static struct leaf_info *leaf_info_new(int plen)
+static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
- struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
- if (li) {
- li->plen = plen;
- INIT_LIST_HEAD(&li->falh);
- }
- return li;
+ call_rcu(&fa->rcu, __alias_free_mem);
+}
+
+static void __leaf_free_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct leaf, rcu));
+}
+
+static inline void free_leaf(struct leaf *leaf)
+{
+ call_rcu(&leaf->rcu, __leaf_free_rcu);
}
-static inline void free_leaf(struct leaf *l)
+static void __leaf_info_free_rcu(struct rcu_head *head)
{
- kfree(l);
+ kfree(container_of(head, struct leaf_info, rcu));
}
-static inline void free_leaf_info(struct leaf_info *li)
+static inline void free_leaf_info(struct leaf_info *leaf)
{
- kfree(li);
+ call_rcu(&leaf->rcu, __leaf_info_free_rcu);
}
static struct tnode *tnode_alloc(unsigned int size)
{
- if (size <= PAGE_SIZE) {
- return kmalloc(size, GFP_KERNEL);
- } else {
- return (struct tnode *)
- __get_free_pages(GFP_KERNEL, get_order(size));
- }
+ struct page *pages;
+
+ if (size <= PAGE_SIZE)
+ return kcalloc(size, 1, GFP_KERNEL);
+
+ pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
+ if (!pages)
+ return NULL;
+
+ return page_address(pages);
}
-static void __tnode_free(struct tnode *tn)
+static void __tnode_free_rcu(struct rcu_head *head)
{
+ struct tnode *tn = container_of(head, struct tnode, rcu);
unsigned int size = sizeof(struct tnode) +
- (1<<tn->bits) * sizeof(struct node *);
+ (1 << tn->bits) * sizeof(struct node *);
if (size <= PAGE_SIZE)
kfree(tn);
@@ -381,15 +347,40 @@ static void __tnode_free(struct tnode *tn)
free_pages((unsigned long)tn, get_order(size));
}
+static inline void tnode_free(struct tnode *tn)
+{
+ call_rcu(&tn->rcu, __tnode_free_rcu);
+}
+
+static struct leaf *leaf_new(void)
+{
+ struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
+ if (l) {
+ l->parent = T_LEAF;
+ INIT_HLIST_HEAD(&l->list);
+ }
+ return l;
+}
+
+static struct leaf_info *leaf_info_new(int plen)
+{
+ struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
+ if (li) {
+ li->plen = plen;
+ INIT_LIST_HEAD(&li->falh);
+ }
+ return li;
+}
+
static struct tnode* tnode_new(t_key key, int pos, int bits)
{
int nchildren = 1<<bits;
int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
struct tnode *tn = tnode_alloc(sz);
- if (tn) {
+ if (tn) {
memset(tn, 0, sz);
- NODE_INIT_PARENT(tn, T_TNODE);
+ tn->parent = T_TNODE;
tn->pos = pos;
tn->bits = bits;
tn->key = key;
@@ -397,38 +388,17 @@ static struct tnode* tnode_new(t_key key, int pos, int bits)
tn->empty_children = 1<<bits;
}
- if (trie_debug > 0)
- printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
- (unsigned int) (sizeof(struct node) * 1<<bits));
+ pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
+ (unsigned int) (sizeof(struct node) * 1<<bits));
return tn;
}
-static void tnode_free(struct tnode *tn)
-{
- if (!tn) {
- trie_bug("tnode_free\n");
- }
- if (IS_LEAF(tn)) {
- free_leaf((struct leaf *)tn);
- if (trie_debug > 0 )
- printk("FL %p \n", tn);
- }
- else if (IS_TNODE(tn)) {
- __tnode_free(tn);
- if (trie_debug > 0 )
- printk("FT %p \n", tn);
- }
- else {
- trie_bug("tnode_free\n");
- }
-}
-
/*
* Check whether a tnode 'n' is "full", i.e. it is an internal node
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
-static inline int tnode_full(struct tnode *tn, struct node *n)
+static inline int tnode_full(const struct tnode *tn, const struct node *n)
{
if (n == NULL || IS_LEAF(n))
return 0;
@@ -448,15 +418,11 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, struct nod
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
{
- struct node *chi;
+ struct node *chi = tn->child[i];
int isfull;
- if (i >= 1<<tn->bits) {
- printk("bits=%d, i=%d\n", tn->bits, i);
- trie_bug("tnode_put_child_reorg bits");
- }
- write_lock_bh(&fib_lock);
- chi = tn->child[i];
+ BUG_ON(i >= 1<<tn->bits);
+
/* update emptyChildren */
if (n == NULL && chi != NULL)
@@ -465,33 +431,34 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
tn->empty_children--;
/* update fullChildren */
- if (wasfull == -1)
+ if (wasfull == -1)
wasfull = tnode_full(tn, chi);
isfull = tnode_full(tn, n);
if (wasfull && !isfull)
tn->full_children--;
-
else if (!wasfull && isfull)
tn->full_children++;
+
if (n)
NODE_SET_PARENT(n, tn);
- tn->child[i] = n;
- write_unlock_bh(&fib_lock);
+ rcu_assign_pointer(tn->child[i], n);
}
static struct node *resize(struct trie *t, struct tnode *tn)
{
int i;
int err = 0;
+ struct tnode *old_tn;
+ int inflate_threshold_use;
+ int halve_threshold_use;
if (!tn)
return NULL;
- if (trie_debug)
- printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
- tn, inflate_threshold, halve_threshold);
+ pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
+ tn, inflate_threshold, halve_threshold);
/* No children */
if (tn->empty_children == tnode_child_length(tn)) {
@@ -501,20 +468,16 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* One child */
if (tn->empty_children == tnode_child_length(tn) - 1)
for (i = 0; i < tnode_child_length(tn); i++) {
+ struct node *n;
- write_lock_bh(&fib_lock);
- if (tn->child[i] != NULL) {
-
- /* compress one level */
- struct node *n = tn->child[i];
- if (n)
- NODE_INIT_PARENT(n, NODE_TYPE(n));
+ n = tn->child[i];
+ if (!n)
+ continue;
- write_unlock_bh(&fib_lock);
- tnode_free(tn);
- return n;
- }
- write_unlock_bh(&fib_lock);
+ /* compress one level */
+ NODE_SET_PARENT(n, NULL);
+ tnode_free(tn);
+ return n;
}
/*
* Double as long as the resulting node has a number of
@@ -566,30 +529,38 @@ static struct node *resize(struct trie *t, struct tnode *tn)
*
* expand not_to_be_doubled and to_be_doubled, and shorten:
* 100 * (tnode_child_length(tn) - tn->empty_children +
- * tn->full_children ) >= inflate_threshold * new_child_length
+ * tn->full_children) >= inflate_threshold * new_child_length
*
* expand new_child_length:
* 100 * (tnode_child_length(tn) - tn->empty_children +
- * tn->full_children ) >=
+ * tn->full_children) >=
* inflate_threshold * tnode_child_length(tn) * 2
*
* shorten again:
* 50 * (tn->full_children + tnode_child_length(tn) -
- * tn->empty_children ) >= inflate_threshold *
+ * tn->empty_children) >= inflate_threshold *
* tnode_child_length(tn)
*
*/
check_tnode(tn);
+ /* Keep root node larger */
+
+ if(!tn->parent)
+ inflate_threshold_use = inflate_threshold_root;
+ else
+ inflate_threshold_use = inflate_threshold;
+
err = 0;
while ((tn->full_children > 0 &&
50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
- inflate_threshold * tnode_child_length(tn))) {
-
- tn = inflate(t, tn, &err);
+ inflate_threshold_use * tnode_child_length(tn))) {
- if (err) {
+ old_tn = tn;
+ tn = inflate(t, tn);
+ if (IS_ERR(tn)) {
+ tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.resize_node_skipped++;
#endif
@@ -604,14 +575,23 @@ static struct node *resize(struct trie *t, struct tnode *tn)
* node is above threshold.
*/
+
+ /* Keep root node larger */
+
+ if(!tn->parent)
+ halve_threshold_use = halve_threshold_root;
+ else
+ halve_threshold_use = halve_threshold;
+
err = 0;
while (tn->bits > 1 &&
100 * (tnode_child_length(tn) - tn->empty_children) <
- halve_threshold * tnode_child_length(tn)) {
-
- tn = halve(t, tn, &err);
+ halve_threshold_use * tnode_child_length(tn)) {
- if (err) {
+ old_tn = tn;
+ tn = halve(t, tn);
+ if (IS_ERR(tn)) {
+ tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.resize_node_skipped++;
#endif
@@ -621,44 +601,37 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* Only one child remains */
-
if (tn->empty_children == tnode_child_length(tn) - 1)
for (i = 0; i < tnode_child_length(tn); i++) {
-
- write_lock_bh(&fib_lock);
- if (tn->child[i] != NULL) {
- /* compress one level */
- struct node *n = tn->child[i];
+ struct node *n;
- if (n)
- NODE_INIT_PARENT(n, NODE_TYPE(n));
+ n = tn->child[i];
+ if (!n)
+ continue;
- write_unlock_bh(&fib_lock);
- tnode_free(tn);
- return n;
- }
- write_unlock_bh(&fib_lock);
+ /* compress one level */
+
+ NODE_SET_PARENT(n, NULL);
+ tnode_free(tn);
+ return n;
}
return (struct node *) tn;
}
-static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
+static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
struct tnode *inode;
struct tnode *oldtnode = tn;
int olen = tnode_child_length(tn);
int i;
- if (trie_debug)
- printk("In inflate\n");
+ pr_debug("In inflate\n");
tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
- if (!tn) {
- *err = -ENOMEM;
- return oldtnode;
- }
+ if (!tn)
+ return ERR_PTR(-ENOMEM);
/*
* Preallocate and store tnodes before the actual work so we
@@ -666,8 +639,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
* fails. In case of failure we return the oldnode and inflate
* of tnode is ignored.
*/
-
- for(i = 0; i < olen; i++) {
+
+ for (i = 0; i < olen; i++) {
struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
if (inode &&
@@ -675,46 +648,30 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
inode->pos == oldtnode->pos + oldtnode->bits &&
inode->bits > 1) {
struct tnode *left, *right;
-
t_key m = TKEY_GET_MASK(inode->pos, 1);
left = tnode_new(inode->key&(~m), inode->pos + 1,
inode->bits - 1);
+ if (!left)
+ goto nomem;
- if (!left) {
- *err = -ENOMEM;
- break;
- }
-
right = tnode_new(inode->key|m, inode->pos + 1,
inode->bits - 1);
- if (!right) {
- *err = -ENOMEM;
- break;
- }
+ if (!right) {
+ tnode_free(left);
+ goto nomem;
+ }
put_child(t, tn, 2*i, (struct node *) left);
put_child(t, tn, 2*i+1, (struct node *) right);
}
}
- if (*err) {
- int size = tnode_child_length(tn);
- int j;
-
- for(j = 0; j < size; j++)
- if (tn->child[j])
- tnode_free((struct tnode *)tn->child[j]);
-
- tnode_free(tn);
-
- *err = -ENOMEM;
- return oldtnode;
- }
-
- for(i = 0; i < olen; i++) {
+ for (i = 0; i < olen; i++) {
struct node *node = tnode_get_child(oldtnode, i);
+ struct tnode *left, *right;
+ int size, j;
/* An empty child */
if (node == NULL)
@@ -740,76 +697,82 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
put_child(t, tn, 2*i+1, inode->child[1]);
tnode_free(inode);
+ continue;
}
- /* An internal node with more than two children */
- else {
- struct tnode *left, *right;
- int size, j;
-
- /* We will replace this node 'inode' with two new
- * ones, 'left' and 'right', each with half of the
- * original children. The two new nodes will have
- * a position one bit further down the key and this
- * means that the "significant" part of their keys
- * (see the discussion near the top of this file)
- * will differ by one bit, which will be "0" in
- * left's key and "1" in right's key. Since we are
- * moving the key position by one step, the bit that
- * we are moving away from - the bit at position
- * (inode->pos) - is the one that will differ between
- * left and right. So... we synthesize that bit in the
- * two new keys.
- * The mask 'm' below will be a single "one" bit at
- * the position (inode->pos)
- */
-
- /* Use the old key, but set the new significant
- * bit to zero.
- */
-
- left = (struct tnode *) tnode_get_child(tn, 2*i);
- put_child(t, tn, 2*i, NULL);
+ /* An internal node with more than two children */
+
+ /* We will replace this node 'inode' with two new
+ * ones, 'left' and 'right', each with half of the
+ * original children. The two new nodes will have
+ * a position one bit further down the key and this
+ * means that the "significant" part of their keys
+ * (see the discussion near the top of this file)
+ * will differ by one bit, which will be "0" in
+ * left's key and "1" in right's key. Since we are
+ * moving the key position by one step, the bit that
+ * we are moving away from - the bit at position
+ * (inode->pos) - is the one that will differ between
+ * left and right. So... we synthesize that bit in the
+ * two new keys.
+ * The mask 'm' below will be a single "one" bit at
+ * the position (inode->pos)
+ */
- if (!left)
- BUG();
+ /* Use the old key, but set the new significant
+ * bit to zero.
+ */
- right = (struct tnode *) tnode_get_child(tn, 2*i+1);
- put_child(t, tn, 2*i+1, NULL);
+ left = (struct tnode *) tnode_get_child(tn, 2*i);
+ put_child(t, tn, 2*i, NULL);
- if (!right)
- BUG();
+ BUG_ON(!left);
- size = tnode_child_length(left);
- for(j = 0; j < size; j++) {
- put_child(t, left, j, inode->child[j]);
- put_child(t, right, j, inode->child[j + size]);
- }
- put_child(t, tn, 2*i, resize(t, left));
- put_child(t, tn, 2*i+1, resize(t, right));
+ right = (struct tnode *) tnode_get_child(tn, 2*i+1);
+ put_child(t, tn, 2*i+1, NULL);
- tnode_free(inode);
+ BUG_ON(!right);
+
+ size = tnode_child_length(left);
+ for (j = 0; j < size; j++) {
+ put_child(t, left, j, inode->child[j]);
+ put_child(t, right, j, inode->child[j + size]);
}
+ put_child(t, tn, 2*i, resize(t, left));
+ put_child(t, tn, 2*i+1, resize(t, right));
+
+ tnode_free(inode);
}
tnode_free(oldtnode);
return tn;
+nomem:
+ {
+ int size = tnode_child_length(tn);
+ int j;
+
+ for (j = 0; j < size; j++)
+ if (tn->child[j])
+ tnode_free((struct tnode *)tn->child[j]);
+
+ tnode_free(tn);
+
+ return ERR_PTR(-ENOMEM);
+ }
}
-static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
+static struct tnode *halve(struct trie *t, struct tnode *tn)
{
struct tnode *oldtnode = tn;
struct node *left, *right;
int i;
int olen = tnode_child_length(tn);
- if (trie_debug) printk("In halve\n");
+ pr_debug("In halve\n");
tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
- if (!tn) {
- *err = -ENOMEM;
- return oldtnode;
- }
+ if (!tn)
+ return ERR_PTR(-ENOMEM);
/*
* Preallocate and store tnodes before the actual work so we
@@ -818,38 +781,27 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
* of tnode is ignored.
*/
- for(i = 0; i < olen; i += 2) {
+ for (i = 0; i < olen; i += 2) {
left = tnode_get_child(oldtnode, i);
right = tnode_get_child(oldtnode, i+1);
/* Two nonempty children */
- if (left && right) {
- struct tnode *newBinNode =
- tnode_new(left->key, tn->pos + tn->bits, 1);
+ if (left && right) {
+ struct tnode *newn;
- if (!newBinNode) {
- *err = -ENOMEM;
- break;
- }
- put_child(t, tn, i/2, (struct node *)newBinNode);
- }
- }
+ newn = tnode_new(left->key, tn->pos + tn->bits, 1);
- if (*err) {
- int size = tnode_child_length(tn);
- int j;
+ if (!newn)
+ goto nomem;
- for(j = 0; j < size; j++)
- if (tn->child[j])
- tnode_free((struct tnode *)tn->child[j]);
+ put_child(t, tn, i/2, (struct node *)newn);
+ }
- tnode_free(tn);
-
- *err = -ENOMEM;
- return oldtnode;
}
- for(i = 0; i < olen; i += 2) {
+ for (i = 0; i < olen; i += 2) {
+ struct tnode *newBinNode;
+
left = tnode_get_child(oldtnode, i);
right = tnode_get_child(oldtnode, i+1);
@@ -858,88 +810,100 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
if (right == NULL) /* Both are empty */
continue;
put_child(t, tn, i/2, right);
- } else if (right == NULL)
+ continue;
+ }
+
+ if (right == NULL) {
put_child(t, tn, i/2, left);
+ continue;
+ }
/* Two nonempty children */
- else {
- struct tnode *newBinNode =
- (struct tnode *) tnode_get_child(tn, i/2);
- put_child(t, tn, i/2, NULL);
-
- if (!newBinNode)
- BUG();
-
- put_child(t, newBinNode, 0, left);
- put_child(t, newBinNode, 1, right);
- put_child(t, tn, i/2, resize(t, newBinNode));
- }
+ newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
+ put_child(t, tn, i/2, NULL);
+ put_child(t, newBinNode, 0, left);
+ put_child(t, newBinNode, 1, right);
+ put_child(t, tn, i/2, resize(t, newBinNode));
}
tnode_free(oldtnode);
return tn;
+nomem:
+ {
+ int size = tnode_child_length(tn);
+ int j;
+
+ for (j = 0; j < size; j++)
+ if (tn->child[j])
+ tnode_free((struct tnode *)tn->child[j]);
+
+ tnode_free(tn);
+
+ return ERR_PTR(-ENOMEM);
+ }
}
-static void *trie_init(struct trie *t)
+static void trie_init(struct trie *t)
{
- if (t) {
- t->size = 0;
- t->trie = NULL;
- t->revision = 0;
+ if (!t)
+ return;
+
+ t->size = 0;
+ rcu_assign_pointer(t->trie, NULL);
+ t->revision = 0;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- memset(&t->stats, 0, sizeof(struct trie_use_stats));
+ memset(&t->stats, 0, sizeof(struct trie_use_stats));
#endif
- }
- return t;
}
-static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
+/* The read side must hold rcu_read_lock; currently this is the dump routines,
+ via get_fa_head and dump */
+
+static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
{
+ struct hlist_head *head = &l->list;
struct hlist_node *node;
struct leaf_info *li;
- hlist_for_each_entry(li, node, head, hlist) {
+ hlist_for_each_entry_rcu(li, node, head, hlist)
if (li->plen == plen)
return li;
- }
+
return NULL;
}
static inline struct list_head * get_fa_head(struct leaf *l, int plen)
{
- struct list_head *fa_head = NULL;
- struct leaf_info *li = find_leaf_info(&l->list, plen);
+ struct leaf_info *li = find_leaf_info(l, plen);
- if (li)
- fa_head = &li->falh;
+ if (!li)
+ return NULL;
- return fa_head;
+ return &li->falh;
}
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
- struct leaf_info *li = NULL, *last = NULL;
- struct hlist_node *node, *tmp;
-
- write_lock_bh(&fib_lock);
-
- if (hlist_empty(head))
- hlist_add_head(&new->hlist, head);
- else {
- hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
-
- if (new->plen > li->plen)
- break;
-
- last = li;
- }
- if (last)
- hlist_add_after(&last->hlist, &new->hlist);
- else
- hlist_add_before(&new->hlist, &li->hlist);
- }
- write_unlock_bh(&fib_lock);
+ struct leaf_info *li = NULL, *last = NULL;
+ struct hlist_node *node;
+
+ if (hlist_empty(head)) {
+ hlist_add_head_rcu(&new->hlist, head);
+ } else {
+ hlist_for_each_entry(li, node, head, hlist) {
+ if (new->plen > li->plen)
+ break;
+
+ last = li;
+ }
+ if (last)
+ hlist_add_after_rcu(&last->hlist, &new->hlist);
+ else
+ hlist_add_before_rcu(&new->hlist, &li->hlist);
+ }
}
+/* rcu_read_lock needs to be held by the caller on the read side */
+
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
@@ -948,61 +912,43 @@ fib_find_node(struct trie *t, u32 key)
struct node *n;
pos = 0;
- n = t->trie;
+ n = rcu_dereference(t->trie);
while (n != NULL && NODE_TYPE(n) == T_TNODE) {
tn = (struct tnode *) n;
-
+
check_tnode(tn);
-
+
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
- pos=tn->pos + tn->bits;
+ pos = tn->pos + tn->bits;
n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
- }
- else
+ } else
break;
}
/* Case we have found a leaf. Compare prefixes */
- if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
- struct leaf *l = (struct leaf *) n;
- return l;
- }
+ if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
+ return (struct leaf *)n;
+
return NULL;
}
static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
{
- int i = 0;
int wasfull;
t_key cindex, key;
struct tnode *tp = NULL;
- if (!tn)
- BUG();
-
key = tn->key;
- i = 0;
while (tn != NULL && NODE_PARENT(tn) != NULL) {
- if (i > 10) {
- printk("Rebalance tn=%p \n", tn);
- if (tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
-
- printk("Rebalance tp=%p \n", tp);
- if (tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
- }
-
- if (i > 12) BUG();
- i++;
-
tp = NODE_PARENT(tn);
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
tn = (struct tnode *) resize (t, (struct tnode *)tn);
tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
-
+
if (!NODE_PARENT(tn))
break;
@@ -1015,6 +961,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
return (struct node*) tn;
}
+/* only used from updater-side */
+
static struct list_head *
fib_insert_node(struct trie *t, int *err, u32 key, int plen)
{
@@ -1050,20 +998,16 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
while (n != NULL && NODE_TYPE(n) == T_TNODE) {
tn = (struct tnode *) n;
-
+
check_tnode(tn);
-
+
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
tp = tn;
- pos=tn->pos + tn->bits;
+ pos = tn->pos + tn->bits;
n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
- if (n && NODE_PARENT(n) != tn) {
- printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
- BUG();
- }
- }
- else
+ BUG_ON(n && NODE_PARENT(n) != tn);
+ } else
break;
}
@@ -1073,17 +1017,15 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
* tp is n's (parent) ----> NULL or TNODE
*/
- if (tp && IS_LEAF(tp))
- BUG();
-
+ BUG_ON(tp && IS_LEAF(tp));
/* Case 1: n is a leaf. Compare prefixes */
if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
- struct leaf *l = ( struct leaf *) n;
-
+ struct leaf *l = (struct leaf *) n;
+
li = leaf_info_new(plen);
-
+
if (!li) {
*err = -ENOMEM;
goto err;
@@ -1113,35 +1055,29 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
fa_head = &li->falh;
insert_leaf_info(&l->list, li);
- /* Case 2: n is NULL, and will just insert a new leaf */
if (t->trie && n == NULL) {
+ /* Case 2: n is NULL, and will just insert a new leaf */
NODE_SET_PARENT(l, tp);
-
- if (!tp)
- BUG();
- else {
- cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
- }
- }
- /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
- else {
+ cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+ put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
+ } else {
+ /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
/*
* Add a new tnode here
* first tnode need some special handling
*/
if (tp)
- pos=tp->pos+tp->bits;
+ pos = tp->pos+tp->bits;
else
- pos=0;
+ pos = 0;
+
if (n) {
newpos = tkey_mismatch(key, pos, n->key);
tn = tnode_new(n->key, newpos, 1);
- }
- else {
+ } else {
newpos = 0;
tn = tnode_new(key, newpos, 1); /* First tnode */
}
@@ -1151,32 +1087,33 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
tnode_free((struct tnode *) l);
*err = -ENOMEM;
goto err;
- }
-
+ }
+
NODE_SET_PARENT(tn, tp);
- missbit=tkey_extract_bits(key, newpos, 1);
+ missbit = tkey_extract_bits(key, newpos, 1);
put_child(t, tn, missbit, (struct node *)l);
put_child(t, tn, 1-missbit, n);
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
- }
- else {
- t->trie = (struct node*) tn; /* First tnode */
+ } else {
+ rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
tp = tn;
}
}
- if (tp && tp->pos+tp->bits > 32) {
- printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
+
+ if (tp && tp->pos + tp->bits > 32)
+ printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
tp, tp->pos, tp->bits, key, plen);
- }
+
/* Rebalance the trie */
- t->trie = trie_rebalance(t, tp);
+
+ rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
done:
t->revision++;
-err:;
+err:
return fa_head;
}
@@ -1204,17 +1141,18 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
key = ntohl(key);
- if (trie_debug)
- printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
+ pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
- mask = ntohl( inet_make_mask(plen) );
+ mask = ntohl(inet_make_mask(plen));
if (key & ~mask)
return -EINVAL;
key = key & mask;
- if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
+ fi = fib_create_info(r, rta, nlhdr, &err);
+
+ if (!fi)
goto err;
l = fib_find_node(t, key);
@@ -1236,8 +1174,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
* and we need to allocate a new one of those as well.
*/
- if (fa &&
- fa->fa_info->fib_priority == fi->fib_priority) {
+ if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
struct fib_alias *fa_orig;
err = -EEXIST;
@@ -1248,22 +1185,27 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
struct fib_info *fi_drop;
u8 state;
- write_lock_bh(&fib_lock);
+ err = -ENOBUFS;
+ new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
+ if (new_fa == NULL)
+ goto out;
fi_drop = fa->fa_info;
- fa->fa_info = fi;
- fa->fa_type = type;
- fa->fa_scope = r->rtm_scope;
+ new_fa->fa_tos = fa->fa_tos;
+ new_fa->fa_info = fi;
+ new_fa->fa_type = type;
+ new_fa->fa_scope = r->rtm_scope;
state = fa->fa_state;
- fa->fa_state &= ~FA_S_ACCESSED;
+ new_fa->fa_state &= ~FA_S_ACCESSED;
- write_unlock_bh(&fib_lock);
+ list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
+ alias_free_mem_rcu(fa);
fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
- rt_cache_flush(-1);
+ rt_cache_flush(-1);
- goto succeeded;
+ goto succeeded;
}
/* Error if we find a perfect match which
* uses the same scope, type, and nexthop
@@ -1285,7 +1227,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
fa = fa_orig;
}
err = -ENOENT;
- if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
+ if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
goto out;
err = -ENOBUFS;
@@ -1298,9 +1240,6 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
new_fa->fa_type = type;
new_fa->fa_scope = r->rtm_scope;
new_fa->fa_state = 0;
-#if 0
- new_fa->dst = NULL;
-#endif
/*
* Insert new entry to the list.
*/
@@ -1312,12 +1251,8 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
goto out_free_new_fa;
}
- write_lock_bh(&fib_lock);
-
- list_add_tail(&new_fa->fa_list,
- (fa ? &fa->fa_list : fa_head));
-
- write_unlock_bh(&fib_lock);
+ list_add_tail_rcu(&new_fa->fa_list,
+ (fa ? &fa->fa_list : fa_head));
rt_cache_flush(-1);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
@@ -1328,38 +1263,40 @@ out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
fib_release_info(fi);
-err:;
+err:
return err;
}
-static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
- struct fib_result *res, int *err)
+
+/* should be called with rcu_read_lock */
+static inline int check_leaf(struct trie *t, struct leaf *l,
+ t_key key, int *plen, const struct flowi *flp,
+ struct fib_result *res)
{
- int i;
+ int err, i;
t_key mask;
struct leaf_info *li;
struct hlist_head *hhead = &l->list;
struct hlist_node *node;
- hlist_for_each_entry(li, node, hhead, hlist) {
-
+ hlist_for_each_entry_rcu(li, node, hhead, hlist) {
i = li->plen;
mask = ntohl(inet_make_mask(i));
if (l->key != (key & mask))
continue;
- if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) {
+ if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
*plen = i;
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.semantic_match_passed++;
#endif
- return 1;
+ return err;
}
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.semantic_match_miss++;
#endif
}
- return 0;
+ return 1;
}
static int
@@ -1370,13 +1307,17 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
struct node *n;
struct tnode *pn;
int pos, bits;
- t_key key=ntohl(flp->fl4_dst);
+ t_key key = ntohl(flp->fl4_dst);
int chopped_off;
t_key cindex = 0;
int current_prefix_length = KEYLENGTH;
- n = t->trie;
+ struct tnode *cn;
+ t_key node_prefix, key_prefix, pref_mismatch;
+ int mp;
+
+ rcu_read_lock();
- read_lock(&fib_lock);
+ n = rcu_dereference(t->trie);
if (!n)
goto failed;
@@ -1386,15 +1327,14 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
/* Just a leaf? */
if (IS_LEAF(n)) {
- if (check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
+ if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
goto found;
goto failed;
}
pn = (struct tnode *) n;
chopped_off = 0;
- while (pn) {
-
+ while (pn) {
pos = pn->pos;
bits = pn->bits;
@@ -1410,130 +1350,129 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
goto backtrace;
}
- if (IS_TNODE(n)) {
+ if (IS_LEAF(n)) {
+ if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
+ goto found;
+ else
+ goto backtrace;
+ }
+
#define HL_OPTIMIZE
#ifdef HL_OPTIMIZE
- struct tnode *cn = (struct tnode *)n;
- t_key node_prefix, key_prefix, pref_mismatch;
- int mp;
-
- /*
- * It's a tnode, and we can do some extra checks here if we
- * like, to avoid descending into a dead-end branch.
- * This tnode is in the parent's child array at index
- * key[p_pos..p_pos+p_bits] but potentially with some bits
- * chopped off, so in reality the index may be just a
- * subprefix, padded with zero at the end.
- * We can also take a look at any skipped bits in this
- * tnode - everything up to p_pos is supposed to be ok,
- * and the non-chopped bits of the index (se previous
- * paragraph) are also guaranteed ok, but the rest is
- * considered unknown.
- *
- * The skipped bits are key[pos+bits..cn->pos].
- */
-
- /* If current_prefix_length < pos+bits, we are already doing
- * actual prefix matching, which means everything from
- * pos+(bits-chopped_off) onward must be zero along some
- * branch of this subtree - otherwise there is *no* valid
- * prefix present. Here we can only check the skipped
- * bits. Remember, since we have already indexed into the
- * parent's child array, we know that the bits we chopped of
- * *are* zero.
- */
-
- /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
-
- if (current_prefix_length < pos+bits) {
- if (tkey_extract_bits(cn->key, current_prefix_length,
- cn->pos - current_prefix_length) != 0 ||
- !(cn->child[0]))
- goto backtrace;
+ cn = (struct tnode *)n;
+
+ /*
+ * It's a tnode, and we can do some extra checks here if we
+ * like, to avoid descending into a dead-end branch.
+ * This tnode is in the parent's child array at index
+ * key[p_pos..p_pos+p_bits] but potentially with some bits
+ * chopped off, so in reality the index may be just a
+ * subprefix, padded with zero at the end.
+ * We can also take a look at any skipped bits in this
+ * tnode - everything up to p_pos is supposed to be ok,
+ * and the non-chopped bits of the index (see previous
+ * paragraph) are also guaranteed ok, but the rest is
+ * considered unknown.
+ *
+ * The skipped bits are key[pos+bits..cn->pos].
+ */
+
+ /* If current_prefix_length < pos+bits, we are already doing
+ * actual prefix matching, which means everything from
+ * pos+(bits-chopped_off) onward must be zero along some
+ * branch of this subtree - otherwise there is *no* valid
+ * prefix present. Here we can only check the skipped
+ * bits. Remember, since we have already indexed into the
+ * parent's child array, we know that the bits we chopped off
+ * *are* zero.
+ */
+
+ /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
+
+ if (current_prefix_length < pos+bits) {
+ if (tkey_extract_bits(cn->key, current_prefix_length,
+ cn->pos - current_prefix_length) != 0 ||
+ !(cn->child[0]))
+ goto backtrace;
+ }
+
+ /*
+ * If chopped_off=0, the index is fully validated and we
+ * only need to look at the skipped bits for this, the new,
+ * tnode. What we actually want to do is to find out if
+ * these skipped bits match our key perfectly, or if we will
+ * have to count on finding a matching prefix further down,
+ * because if we do, we would like to have some way of
+ * verifying the existence of such a prefix at this point.
+ */
+
+ /* The only thing we can do at this point is to verify that
+ * any such matching prefix can indeed be a prefix to our
+ * key, and if the bits in the node we are inspecting that
+ * do not match our key are not ZERO, this cannot be true.
+ * Thus, find out where there is a mismatch (before cn->pos)
+ * and verify that all the mismatching bits are zero in the
+ * new tnode's key.
+ */
+
+ /* Note: We aren't very concerned about the piece of the key
+ * that precede pn->pos+pn->bits, since these have already been
+ * checked. The bits after cn->pos aren't checked since these are
+ * by definition "unknown" at this point. Thus, what we want to
+ * see is if we are about to enter the "prefix matching" state,
+ * and in that case verify that the skipped bits that will prevail
+ * throughout this subtree are zero, as they have to be if we are
+ * to find a matching prefix.
+ */
+
+ node_prefix = MASK_PFX(cn->key, cn->pos);
+ key_prefix = MASK_PFX(key, cn->pos);
+ pref_mismatch = key_prefix^node_prefix;
+ mp = 0;
+
+ /* In short: If skipped bits in this node do not match the search
+ * key, enter the "prefix matching" state directly.
+ */
+ if (pref_mismatch) {
+ while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
+ mp++;
+ pref_mismatch = pref_mismatch <<1;
}
+ key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
- /*
- * If chopped_off=0, the index is fully validated and we
- * only need to look at the skipped bits for this, the new,
- * tnode. What we actually want to do is to find out if
- * these skipped bits match our key perfectly, or if we will
- * have to count on finding a matching prefix further down,
- * because if we do, we would like to have some way of
- * verifying the existence of such a prefix at this point.
- */
-
- /* The only thing we can do at this point is to verify that
- * any such matching prefix can indeed be a prefix to our
- * key, and if the bits in the node we are inspecting that
- * do not match our key are not ZERO, this cannot be true.
- * Thus, find out where there is a mismatch (before cn->pos)
- * and verify that all the mismatching bits are zero in the
- * new tnode's key.
- */
-
- /* Note: We aren't very concerned about the piece of the key
- * that precede pn->pos+pn->bits, since these have already been
- * checked. The bits after cn->pos aren't checked since these are
- * by definition "unknown" at this point. Thus, what we want to
- * see is if we are about to enter the "prefix matching" state,
- * and in that case verify that the skipped bits that will prevail
- * throughout this subtree are zero, as they have to be if we are
- * to find a matching prefix.
- */
-
- node_prefix = MASK_PFX(cn->key, cn->pos);
- key_prefix = MASK_PFX(key, cn->pos);
- pref_mismatch = key_prefix^node_prefix;
- mp = 0;
-
- /* In short: If skipped bits in this node do not match the search
- * key, enter the "prefix matching" state.directly.
- */
- if (pref_mismatch) {
- while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
- mp++;
- pref_mismatch = pref_mismatch <<1;
- }
- key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
-
- if (key_prefix != 0)
- goto backtrace;
-
- if (current_prefix_length >= cn->pos)
- current_prefix_length=mp;
- }
-#endif
- pn = (struct tnode *)n; /* Descend */
- chopped_off = 0;
- continue;
+ if (key_prefix != 0)
+ goto backtrace;
+
+ if (current_prefix_length >= cn->pos)
+ current_prefix_length = mp;
}
- if (IS_LEAF(n)) {
- if (check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
- goto found;
- }
+#endif
+ pn = (struct tnode *)n; /* Descend */
+ chopped_off = 0;
+ continue;
+
backtrace:
chopped_off++;
/* As zero don't change the child key (cindex) */
- while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
+ while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
chopped_off++;
- }
/* Decrease current_... with bits chopped off */
if (current_prefix_length > pn->pos + pn->bits - chopped_off)
current_prefix_length = pn->pos + pn->bits - chopped_off;
-
+
/*
* Either we do the actual chop off according or if we have
* chopped off all bits in this tnode walk up to our parent.
*/
- if (chopped_off <= pn->bits)
+ if (chopped_off <= pn->bits) {
cindex &= ~(1 << (chopped_off-1));
- else {
+ } else {
if (NODE_PARENT(pn) == NULL)
goto failed;
-
+
/* Get Child's index */
cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
pn = NODE_PARENT(pn);
@@ -1548,10 +1487,11 @@ backtrace:
failed:
ret = 1;
found:
- read_unlock(&fib_lock);
+ rcu_read_unlock();
return ret;
}
+/* only called from updater side */
static int trie_leaf_remove(struct trie *t, t_key key)
{
t_key cindex;
@@ -1559,24 +1499,20 @@ static int trie_leaf_remove(struct trie *t, t_key key)
struct node *n = t->trie;
struct leaf *l;
- if (trie_debug)
- printk("entering trie_leaf_remove(%p)\n", n);
+ pr_debug("entering trie_leaf_remove(%p)\n", n);
/* Note that in the case skipped bits, those bits are *not* checked!
* When we finish this, we will have NULL or a T_LEAF, and the
* T_LEAF may or may not match our key.
*/
- while (n != NULL && IS_TNODE(n)) {
+ while (n != NULL && IS_TNODE(n)) {
struct tnode *tn = (struct tnode *) n;
check_tnode(tn);
n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
- if (n && NODE_PARENT(n) != tn) {
- printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
- BUG();
- }
- }
+ BUG_ON(n && NODE_PARENT(n) != tn);
+ }
l = (struct leaf *) n;
if (!n || !tkey_equals(l->key, key))
@@ -1590,23 +1526,24 @@ static int trie_leaf_remove(struct trie *t, t_key key)
t->revision++;
t->size--;
+ preempt_disable();
tp = NODE_PARENT(n);
tnode_free((struct tnode *) n);
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
put_child(t, (struct tnode *)tp, cindex, NULL);
- t->trie = trie_rebalance(t, tp);
- }
- else
- t->trie = NULL;
+ rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
+ } else
+ rcu_assign_pointer(t->trie, NULL);
+ preempt_enable();
return 1;
}
static int
fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
- struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
+ struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
{
struct trie *t = (struct trie *) tb->tb_data;
u32 key, mask;
@@ -1615,6 +1552,8 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
struct fib_alias *fa, *fa_to_delete;
struct list_head *fa_head;
struct leaf *l;
+ struct leaf_info *li;
+
if (plen > 32)
return -EINVAL;
@@ -1624,7 +1563,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
memcpy(&key, rta->rta_dst, 4);
key = ntohl(key);
- mask = ntohl( inet_make_mask(plen) );
+ mask = ntohl(inet_make_mask(plen));
if (key & ~mask)
return -EINVAL;
@@ -1641,11 +1580,11 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
if (!fa)
return -ESRCH;
- if (trie_debug)
- printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
+ pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
fa_to_delete = NULL;
fa_head = fa->fa_list.prev;
+
list_for_each_entry(fa, fa_head, fa_list) {
struct fib_info *fi = fa->fa_info;
@@ -1664,39 +1603,31 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
}
}
- if (fa_to_delete) {
- int kill_li = 0;
- struct leaf_info *li;
-
- fa = fa_to_delete;
- rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
+ if (!fa_to_delete)
+ return -ESRCH;
- l = fib_find_node(t, key);
- li = find_leaf_info(&l->list, plen);
+ fa = fa_to_delete;
+ rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
- write_lock_bh(&fib_lock);
+ l = fib_find_node(t, key);
+ li = find_leaf_info(l, plen);
- list_del(&fa->fa_list);
+ list_del_rcu(&fa->fa_list);
- if (list_empty(fa_head)) {
- hlist_del(&li->hlist);
- kill_li = 1;
- }
- write_unlock_bh(&fib_lock);
-
- if (kill_li)
- free_leaf_info(li);
+ if (list_empty(fa_head)) {
+ hlist_del_rcu(&li->hlist);
+ free_leaf_info(li);
+ }
- if (hlist_empty(&l->list))
- trie_leaf_remove(t, key);
+ if (hlist_empty(&l->list))
+ trie_leaf_remove(t, key);
- if (fa->fa_state & FA_S_ACCESSED)
- rt_cache_flush(-1);
+ if (fa->fa_state & FA_S_ACCESSED)
+ rt_cache_flush(-1);
- fn_free_alias(fa);
- return 0;
- }
- return -ESRCH;
+ fib_release_info(fa->fa_info);
+ alias_free_mem_rcu(fa);
+ return 0;
}
static int trie_flush_list(struct trie *t, struct list_head *head)
@@ -1706,14 +1637,11 @@ static int trie_flush_list(struct trie *t, struct list_head *head)
list_for_each_entry_safe(fa, fa_node, head, fa_list) {
struct fib_info *fi = fa->fa_info;
-
- if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
-
- write_lock_bh(&fib_lock);
- list_del(&fa->fa_list);
- write_unlock_bh(&fib_lock);
- fn_free_alias(fa);
+ if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
+ list_del_rcu(&fa->fa_list);
+ fib_release_info(fa->fa_info);
+ alias_free_mem_rcu(fa);
found++;
}
}
@@ -1728,37 +1656,34 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l)
struct leaf_info *li = NULL;
hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
-
found += trie_flush_list(t, &li->falh);
if (list_empty(&li->falh)) {
-
- write_lock_bh(&fib_lock);
- hlist_del(&li->hlist);
- write_unlock_bh(&fib_lock);
-
+ hlist_del_rcu(&li->hlist);
free_leaf_info(li);
}
}
return found;
}
+/* rcu_read_lock needs to be held by the caller on the read side */
+
static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
{
struct node *c = (struct node *) thisleaf;
struct tnode *p;
int idx;
+ struct node *trie = rcu_dereference(t->trie);
if (c == NULL) {
- if (t->trie == NULL)
+ if (trie == NULL)
return NULL;
- if (IS_LEAF(t->trie)) /* trie w. just a leaf */
- return (struct leaf *) t->trie;
+ if (IS_LEAF(trie)) /* trie w. just a leaf */
+ return (struct leaf *) trie;
- p = (struct tnode*) t->trie; /* Start */
- }
- else
+ p = (struct tnode*) trie; /* Start */
+ } else
p = (struct tnode *) NODE_PARENT(c);
while (p) {
@@ -1771,29 +1696,31 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
pos = 0;
last = 1 << p->bits;
- for(idx = pos; idx < last ; idx++) {
- if (p->child[idx]) {
-
- /* Decend if tnode */
-
- while (IS_TNODE(p->child[idx])) {
- p = (struct tnode*) p->child[idx];
- idx = 0;
-
- /* Rightmost non-NULL branch */
- if (p && IS_TNODE(p))
- while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
-
- /* Done with this tnode? */
- if (idx >= (1 << p->bits) || p->child[idx] == NULL )
- goto up;
- }
- return (struct leaf*) p->child[idx];
+ for (idx = pos; idx < last ; idx++) {
+ c = rcu_dereference(p->child[idx]);
+
+ if (!c)
+ continue;
+
+ /* Descend if tnode */
+ while (IS_TNODE(c)) {
+ p = (struct tnode *) c;
+ idx = 0;
+
+ /* Rightmost non-NULL branch */
+ if (p && IS_TNODE(p))
+ while (!(c = rcu_dereference(p->child[idx]))
+ && idx < (1<<p->bits)) idx++;
+
+ /* Done with this tnode? */
+ if (idx >= (1 << p->bits) || !c)
+ goto up;
}
+ return (struct leaf *) c;
}
up:
/* No more children go up one step */
- c = (struct node*) p;
+ c = (struct node *) p;
p = (struct tnode *) NODE_PARENT(p);
}
return NULL; /* Ready. Root of trie */
@@ -1807,7 +1734,7 @@ static int fn_trie_flush(struct fib_table *tb)
t->revision++;
- for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
+ for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
found += trie_flush_leaf(t, l);
if (ll && hlist_empty(&ll->list))
@@ -1818,12 +1745,11 @@ static int fn_trie_flush(struct fib_table *tb)
if (ll && hlist_empty(&ll->list))
trie_leaf_remove(t, ll->key);
- if (trie_debug)
- printk("trie_flush found=%d\n", found);
+ pr_debug("trie_flush found=%d\n", found);
return found;
}
-static int trie_last_dflt=-1;
+static int trie_last_dflt = -1;
static void
fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
@@ -1840,7 +1766,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
last_resort = NULL;
order = -1;
- read_lock(&fib_lock);
+ rcu_read_lock();
l = fib_find_node(t, 0);
if (!l)
@@ -1853,20 +1779,20 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
if (list_empty(fa_head))
goto out;
- list_for_each_entry(fa, fa_head, fa_list) {
+ list_for_each_entry_rcu(fa, fa_head, fa_list) {
struct fib_info *next_fi = fa->fa_info;
-
+
if (fa->fa_scope != res->scope ||
fa->fa_type != RTN_UNICAST)
continue;
-
+
if (next_fi->fib_priority > res->fi->fib_priority)
break;
if (!next_fi->fib_nh[0].nh_gw ||
next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
continue;
fa->fa_state |= FA_S_ACCESSED;
-
+
if (fi == NULL) {
if (next_fi != res->fi)
break;
@@ -1904,7 +1830,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
}
trie_last_dflt = last_idx;
out:;
- read_unlock(&fib_lock);
+ rcu_read_unlock();
}
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
@@ -1913,26 +1839,19 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
int i, s_i;
struct fib_alias *fa;
- u32 xkey=htonl(key);
+ u32 xkey = htonl(key);
- s_i=cb->args[3];
+ s_i = cb->args[3];
i = 0;
- list_for_each_entry(fa, fah, fa_list) {
+ /* rcu_read_lock is held by caller */
+
+ list_for_each_entry_rcu(fa, fah, fa_list) {
if (i < s_i) {
i++;
continue;
}
- if (fa->fa_info->fib_nh == NULL) {
- printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
- i++;
- continue;
- }
- if (fa->fa_info == NULL) {
- printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
- i++;
- continue;
- }
+ BUG_ON(!fa->fa_info);
if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq,
@@ -1946,10 +1865,10 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
fa->fa_info, 0) < 0) {
cb->args[3] = i;
return -1;
- }
+ }
i++;
}
- cb->args[3]=i;
+ cb->args[3] = i;
return skb->len;
}
@@ -1959,10 +1878,10 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
int h, s_h;
struct list_head *fa_head;
struct leaf *l = NULL;
- s_h=cb->args[2];
- for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
+ s_h = cb->args[2];
+ for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
if (h < s_h)
continue;
if (h > s_h)
@@ -1970,7 +1889,7 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
sizeof(cb->args) - 3*sizeof(cb->args[0]));
fa_head = get_fa_head(l, plen);
-
+
if (!fa_head)
continue;
@@ -1978,11 +1897,11 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
continue;
if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
- cb->args[2]=h;
+ cb->args[2] = h;
return -1;
}
}
- cb->args[2]=h;
+ cb->args[2] = h;
return skb->len;
}
@@ -1993,25 +1912,24 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
s_m = cb->args[1];
- read_lock(&fib_lock);
- for (m=0; m<=32; m++) {
-
+ rcu_read_lock();
+ for (m = 0; m <= 32; m++) {
if (m < s_m)
continue;
if (m > s_m)
memset(&cb->args[2], 0,
- sizeof(cb->args) - 2*sizeof(cb->args[0]));
+ sizeof(cb->args) - 2*sizeof(cb->args[0]));
if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
cb->args[1] = m;
goto out;
}
}
- read_unlock(&fib_lock);
+ rcu_read_unlock();
cb->args[1] = m;
return skb->len;
- out:
- read_unlock(&fib_lock);
+out:
+ rcu_read_unlock();
return -1;
}
@@ -2051,383 +1969,147 @@ struct fib_table * __init fib_hash_init(int id)
trie_init(t);
if (id == RT_TABLE_LOCAL)
- trie_local = t;
+ trie_local = t;
else if (id == RT_TABLE_MAIN)
- trie_main = t;
+ trie_main = t;
if (id == RT_TABLE_LOCAL)
- printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
+ printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
return tb;
}
-/* Trie dump functions */
-
-static void putspace_seq(struct seq_file *seq, int n)
-{
- while (n--) seq_printf(seq, " ");
-}
+#ifdef CONFIG_PROC_FS
+/* Depth first Trie walk iterator */
+struct fib_trie_iter {
+ struct tnode *tnode;
+ struct trie *trie;
+ unsigned index;
+ unsigned depth;
+};
-static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
+static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
{
- while (bits--)
- seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
-}
+ struct tnode *tn = iter->tnode;
+ unsigned cindex = iter->index;
+ struct tnode *p;
-static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
- int pend, int cindex, int bits)
-{
- putspace_seq(seq, indent);
- if (IS_LEAF(n))
- seq_printf(seq, "|");
- else
- seq_printf(seq, "+");
- if (bits) {
- seq_printf(seq, "%d/", cindex);
- printbin_seq(seq, cindex, bits);
- seq_printf(seq, ": ");
- }
- else
- seq_printf(seq, "<root>: ");
- seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
-
- if (IS_LEAF(n))
- seq_printf(seq, "key=%d.%d.%d.%d\n",
- n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
- else {
- int plen = ((struct tnode *)n)->pos;
- t_key prf=MASK_PFX(n->key, plen);
- seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
- prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
- }
- if (IS_LEAF(n)) {
- struct leaf *l=(struct leaf *)n;
- struct fib_alias *fa;
- int i;
- for (i=32; i>=0; i--)
- if (find_leaf_info(&l->list, i)) {
-
- struct list_head *fa_head = get_fa_head(l, i);
-
- if (!fa_head)
- continue;
-
- if (list_empty(fa_head))
- continue;
-
- putspace_seq(seq, indent+2);
- seq_printf(seq, "{/%d...dumping}\n", i);
-
-
- list_for_each_entry(fa, fa_head, fa_list) {
- putspace_seq(seq, indent+2);
- if (fa->fa_info->fib_nh == NULL) {
- seq_printf(seq, "Error _fib_nh=NULL\n");
- continue;
- }
- if (fa->fa_info == NULL) {
- seq_printf(seq, "Error fa_info=NULL\n");
- continue;
- }
-
- seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
- fa->fa_type,
- fa->fa_scope,
- fa->fa_tos);
- }
- }
- }
- else if (IS_TNODE(n)) {
- struct tnode *tn = (struct tnode *)n;
- putspace_seq(seq, indent); seq_printf(seq, "| ");
- seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
- printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
- seq_printf(seq, "}\n");
- putspace_seq(seq, indent); seq_printf(seq, "| ");
- seq_printf(seq, "{pos=%d", tn->pos);
- seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
- seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
- putspace_seq(seq, indent); seq_printf(seq, "| ");
- seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
- }
-}
+ pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
+ iter->tnode, iter->index, iter->depth);
+rescan:
+ while (cindex < (1<<tn->bits)) {
+ struct node *n = tnode_get_child(tn, cindex);
-static void trie_dump_seq(struct seq_file *seq, struct trie *t)
-{
- struct node *n = t->trie;
- int cindex=0;
- int indent=1;
- int pend=0;
- int depth = 0;
-
- read_lock(&fib_lock);
-
- seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
- if (n) {
- printnode_seq(seq, indent, n, pend, cindex, 0);
- if (IS_TNODE(n)) {
- struct tnode *tn = (struct tnode *)n;
- pend = tn->pos+tn->bits;
- putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
- indent += 3;
- depth++;
-
- while (tn && cindex < (1 << tn->bits)) {
- if (tn->child[cindex]) {
-
- /* Got a child */
-
- printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
- if (IS_LEAF(tn->child[cindex])) {
- cindex++;
-
- }
- else {
- /*
- * New tnode. Decend one level
- */
-
- depth++;
- n = tn->child[cindex];
- tn = (struct tnode *)n;
- pend = tn->pos+tn->bits;
- putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
- indent+=3;
- cindex=0;
- }
- }
- else
- cindex++;
-
- /*
- * Test if we are done
- */
-
- while (cindex >= (1 << tn->bits)) {
-
- /*
- * Move upwards and test for root
- * pop off all traversed nodes
- */
-
- if (NODE_PARENT(tn) == NULL) {
- tn = NULL;
- n = NULL;
- break;
- }
- else {
- cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
- tn = NODE_PARENT(tn);
- cindex++;
- n = (struct node *)tn;
- pend = tn->pos+tn->bits;
- indent-=3;
- depth--;
- }
- }
+ if (n) {
+ if (IS_LEAF(n)) {
+ iter->tnode = tn;
+ iter->index = cindex + 1;
+ } else {
+ /* push down one level */
+ iter->tnode = (struct tnode *) n;
+ iter->index = 0;
+ ++iter->depth;
}
+ return n;
}
- else n = NULL;
- }
- else seq_printf(seq, "------ trie is empty\n");
-
- read_unlock(&fib_lock);
-}
-static struct trie_stat *trie_stat_new(void)
-{
- struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
- int i;
-
- if (s) {
- s->totdepth = 0;
- s->maxdepth = 0;
- s->tnodes = 0;
- s->leaves = 0;
- s->nullpointers = 0;
-
- for(i=0; i< MAX_CHILDS; i++)
- s->nodesizes[i] = 0;
+ ++cindex;
}
- return s;
-}
-static struct trie_stat *trie_collect_stats(struct trie *t)
-{
- struct node *n = t->trie;
- struct trie_stat *s = trie_stat_new();
- int cindex = 0;
- int indent = 1;
- int pend = 0;
- int depth = 0;
-
- read_lock(&fib_lock);
-
- if (s) {
- if (n) {
- if (IS_TNODE(n)) {
- struct tnode *tn = (struct tnode *)n;
- pend = tn->pos+tn->bits;
- indent += 3;
- s->nodesizes[tn->bits]++;
- depth++;
-
- while (tn && cindex < (1 << tn->bits)) {
- if (tn->child[cindex]) {
- /* Got a child */
-
- if (IS_LEAF(tn->child[cindex])) {
- cindex++;
-
- /* stats */
- if (depth > s->maxdepth)
- s->maxdepth = depth;
- s->totdepth += depth;
- s->leaves++;
- }
-
- else {
- /*
- * New tnode. Decend one level
- */
-
- s->tnodes++;
- s->nodesizes[tn->bits]++;
- depth++;
-
- n = tn->child[cindex];
- tn = (struct tnode *)n;
- pend = tn->pos+tn->bits;
-
- indent += 3;
- cindex = 0;
- }
- }
- else {
- cindex++;
- s->nullpointers++;
- }
-
- /*
- * Test if we are done
- */
-
- while (cindex >= (1 << tn->bits)) {
-
- /*
- * Move upwards and test for root
- * pop off all traversed nodes
- */
-
-
- if (NODE_PARENT(tn) == NULL) {
- tn = NULL;
- n = NULL;
- break;
- }
- else {
- cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
- tn = NODE_PARENT(tn);
- cindex++;
- n = (struct node *)tn;
- pend = tn->pos+tn->bits;
- indent -= 3;
- depth--;
- }
- }
- }
- }
- else n = NULL;
- }
+ /* Current node exhausted, pop back up */
+ p = NODE_PARENT(tn);
+ if (p) {
+ cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
+ tn = p;
+ --iter->depth;
+ goto rescan;
}
- read_unlock(&fib_lock);
- return s;
-}
-
-#ifdef CONFIG_PROC_FS
-
-static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
-{
+ /* got root? */
return NULL;
}
-static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
+static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
+ struct trie *t)
{
- return NULL;
-}
-
-static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
-{
- void *v = NULL;
+ struct node *n = rcu_dereference(t->trie);
- if (ip_fib_main_table)
- v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
- return v;
+ if (n && IS_TNODE(n)) {
+ iter->tnode = (struct tnode *) n;
+ iter->trie = t;
+ iter->index = 0;
+ iter->depth = 1;
+ return n;
+ }
+ return NULL;
}
-static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
- ++*pos;
- return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
-}
+ struct node *n;
+ struct fib_trie_iter iter;
-static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
-{
+ memset(s, 0, sizeof(*s));
+ rcu_read_lock();
+ for (n = fib_trie_get_first(&iter, t); n;
+ n = fib_trie_get_next(&iter)) {
+ if (IS_LEAF(n)) {
+ s->leaves++;
+ s->totdepth += iter.depth;
+ if (iter.depth > s->maxdepth)
+ s->maxdepth = iter.depth;
+ } else {
+ const struct tnode *tn = (const struct tnode *) n;
+ int i;
+
+ s->tnodes++;
+ s->nodesizes[tn->bits]++;
+ for (i = 0; i < (1<<tn->bits); i++)
+ if (!tn->child[i])
+ s->nullpointers++;
+ }
+ }
+ rcu_read_unlock();
}
/*
* This outputs /proc/net/fib_triestats
- *
- * It always works in backward compatibility mode.
- * The format of the file is not supposed to be changed.
*/
-
-static void collect_and_show(struct trie *t, struct seq_file *seq)
+static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
{
- int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
- int i, max, pointers;
- struct trie_stat *stat;
- int avdepth;
+ unsigned i, max, pointers, bytes, avdepth;
- stat = trie_collect_stats(t);
+ if (stat->leaves)
+ avdepth = stat->totdepth*100 / stat->leaves;
+ else
+ avdepth = 0;
- bytes=0;
- seq_printf(seq, "trie=%p\n", t);
+ seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
+ seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
- if (stat) {
- if (stat->leaves)
- avdepth=stat->totdepth*100 / stat->leaves;
- else
- avdepth=0;
- seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
- seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
-
- seq_printf(seq, "Leaves: %d\n", stat->leaves);
- bytes += sizeof(struct leaf) * stat->leaves;
- seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
- bytes += sizeof(struct tnode) * stat->tnodes;
-
- max = MAX_CHILDS-1;
-
- while (max >= 0 && stat->nodesizes[max] == 0)
- max--;
- pointers = 0;
-
- for (i = 1; i <= max; i++)
- if (stat->nodesizes[i] != 0) {
- seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
- pointers += (1<<i) * stat->nodesizes[i];
- }
- seq_printf(seq, "\n");
- seq_printf(seq, "Pointers: %d\n", pointers);
- bytes += sizeof(struct node *) * pointers;
- seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
- seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
+ seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
- kfree(stat);
- }
+ bytes = sizeof(struct leaf) * stat->leaves;
+ seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
+ bytes += sizeof(struct tnode) * stat->tnodes;
+
+ max = MAX_CHILDS-1;
+ while (max >= 0 && stat->nodesizes[max] == 0)
+ max--;
+
+ pointers = 0;
+ for (i = 1; i <= max; i++)
+ if (stat->nodesizes[i] != 0) {
+ seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
+ pointers += (1<<i) * stat->nodesizes[i];
+ }
+ seq_putc(seq, '\n');
+ seq_printf(seq, "\tPointers: %d\n", pointers);
+
+ bytes += sizeof(struct node *) * pointers;
+ seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
+ seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024);
#ifdef CONFIG_IP_FIB_TRIE_STATS
seq_printf(seq, "Counters:\n---------\n");
@@ -2445,168 +2127,378 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
- char bf[128];
+ struct trie_stat *stat;
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
- sizeof(struct leaf), sizeof(struct tnode));
- if (trie_local)
- collect_and_show(trie_local, seq);
+ stat = kmalloc(sizeof(*stat), GFP_KERNEL);
+ if (!stat)
+ return -ENOMEM;
+
+ seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
+ sizeof(struct leaf), sizeof(struct tnode));
- if (trie_main)
- collect_and_show(trie_main, seq);
+ if (trie_local) {
+ seq_printf(seq, "Local:\n");
+ trie_collect_stats(trie_local, stat);
+ trie_show_stats(seq, stat);
}
- else {
- snprintf(bf, sizeof(bf),
- "*\t%08X\t%08X", 200, 400);
-
- seq_printf(seq, "%-127s\n", bf);
+
+ if (trie_main) {
+ seq_printf(seq, "Main:\n");
+ trie_collect_stats(trie_main, stat);
+ trie_show_stats(seq, stat);
}
+ kfree(stat);
+
return 0;
}
-static struct seq_operations fib_triestat_seq_ops = {
- .start = fib_triestat_seq_start,
- .next = fib_triestat_seq_next,
- .stop = fib_triestat_seq_stop,
- .show = fib_triestat_seq_show,
-};
-
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
-
- rc = seq_open(file, &fib_triestat_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
-out:
- return rc;
-out_kfree:
- goto out;
+ return single_open(file, fib_triestat_seq_show, NULL);
}
-static struct file_operations fib_triestat_seq_fops = {
+static struct file_operations fib_triestat_fops = {
.owner = THIS_MODULE,
.open = fib_triestat_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = single_release,
};
-int __init fib_stat_proc_init(void)
+static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
+ loff_t pos)
{
- if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
- return -ENOMEM;
- return 0;
+ loff_t idx = 0;
+ struct node *n;
+
+ for (n = fib_trie_get_first(iter, trie_local);
+ n; ++idx, n = fib_trie_get_next(iter)) {
+ if (pos == idx)
+ return n;
+ }
+
+ for (n = fib_trie_get_first(iter, trie_main);
+ n; ++idx, n = fib_trie_get_next(iter)) {
+ if (pos == idx)
+ return n;
+ }
+ return NULL;
}
-void __init fib_stat_proc_exit(void)
+static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
{
- proc_net_remove("fib_triestat");
+ rcu_read_lock();
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+ return fib_trie_get_idx(seq->private, *pos - 1);
}
-static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
+static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ struct fib_trie_iter *iter = seq->private;
+ void *l = v;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return fib_trie_get_idx(iter, 0);
+
+ v = fib_trie_get_next(iter);
+ BUG_ON(v == l);
+ if (v)
+ return v;
+
+ /* continue scan in next trie */
+ if (iter->trie == trie_local)
+ return fib_trie_get_first(iter, trie_main);
+
return NULL;
}
-static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
+static void fib_trie_seq_stop(struct seq_file *seq, void *v)
{
- return NULL;
+ rcu_read_unlock();
}
-static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
+static void seq_indent(struct seq_file *seq, int n)
+{
+ while (n-- > 0) seq_puts(seq, " ");
+}
+
+static inline const char *rtn_scope(enum rt_scope_t s)
{
- void *v = NULL;
+ static char buf[32];
- if (ip_fib_main_table)
- v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
- return v;
+ switch(s) {
+ case RT_SCOPE_UNIVERSE: return "universe";
+ case RT_SCOPE_SITE: return "site";
+ case RT_SCOPE_LINK: return "link";
+ case RT_SCOPE_HOST: return "host";
+ case RT_SCOPE_NOWHERE: return "nowhere";
+ default:
+ snprintf(buf, sizeof(buf), "scope=%d", s);
+ return buf;
+ }
}
-static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static const char *rtn_type_names[__RTN_MAX] = {
+ [RTN_UNSPEC] = "UNSPEC",
+ [RTN_UNICAST] = "UNICAST",
+ [RTN_LOCAL] = "LOCAL",
+ [RTN_BROADCAST] = "BROADCAST",
+ [RTN_ANYCAST] = "ANYCAST",
+ [RTN_MULTICAST] = "MULTICAST",
+ [RTN_BLACKHOLE] = "BLACKHOLE",
+ [RTN_UNREACHABLE] = "UNREACHABLE",
+ [RTN_PROHIBIT] = "PROHIBIT",
+ [RTN_THROW] = "THROW",
+ [RTN_NAT] = "NAT",
+ [RTN_XRESOLVE] = "XRESOLVE",
+};
+
+static inline const char *rtn_type(unsigned t)
{
- ++*pos;
- return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
+ static char buf[32];
+
+ if (t < __RTN_MAX && rtn_type_names[t])
+ return rtn_type_names[t];
+ snprintf(buf, sizeof(buf), "type %d", t);
+ return buf;
}
-static void fib_trie_seq_stop(struct seq_file *seq, void *v)
+/* Pretty print the trie */
+static int fib_trie_seq_show(struct seq_file *seq, void *v)
+{
+ const struct fib_trie_iter *iter = seq->private;
+ struct node *n = v;
+
+ if (v == SEQ_START_TOKEN)
+ return 0;
+
+ if (IS_TNODE(n)) {
+ struct tnode *tn = (struct tnode *) n;
+ t_key prf = ntohl(MASK_PFX(tn->key, tn->pos));
+
+ if (!NODE_PARENT(n)) {
+ if (iter->trie == trie_local)
+ seq_puts(seq, "<local>:\n");
+ else
+ seq_puts(seq, "<main>:\n");
+ }
+ seq_indent(seq, iter->depth-1);
+ seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n",
+ NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
+ tn->empty_children);
+
+ } else {
+ struct leaf *l = (struct leaf *) n;
+ int i;
+ u32 val = ntohl(l->key);
+
+ seq_indent(seq, iter->depth);
+ seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
+ for (i = 32; i >= 0; i--) {
+ struct leaf_info *li = find_leaf_info(l, i);
+ if (li) {
+ struct fib_alias *fa;
+ list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+ seq_indent(seq, iter->depth+1);
+ seq_printf(seq, " /%d %s %s", i,
+ rtn_scope(fa->fa_scope),
+ rtn_type(fa->fa_type));
+ if (fa->fa_tos)
+ seq_printf(seq, "tos =%d\n",
+ fa->fa_tos);
+ seq_putc(seq, '\n');
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+static struct seq_operations fib_trie_seq_ops = {
+ .start = fib_trie_seq_start,
+ .next = fib_trie_seq_next,
+ .stop = fib_trie_seq_stop,
+ .show = fib_trie_seq_show,
+};
+
+static int fib_trie_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+
+ rc = seq_open(file, &fib_trie_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+ seq->private = s;
+ memset(s, 0, sizeof(*s));
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+static struct file_operations fib_trie_fops = {
+ .owner = THIS_MODULE,
+ .open = fib_trie_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi)
{
+ static unsigned type2flags[RTN_MAX + 1] = {
+ [7] = RTF_REJECT, [8] = RTF_REJECT,
+ };
+ unsigned flags = type2flags[type];
+ if (fi && fi->fib_nh->nh_gw)
+ flags |= RTF_GATEWAY;
+ if (mask == 0xFFFFFFFF)
+ flags |= RTF_HOST;
+ flags |= RTF_UP;
+ return flags;
}
/*
- * This outputs /proc/net/fib_trie.
- *
- * It always works in backward compatibility mode.
- * The format of the file is not supposed to be changed.
+ * This outputs /proc/net/route.
+ * The format of the file is not supposed to be changed
+ * and needs to be the same as fib_hash output to avoid breaking
+ * legacy utilities
*/
-
-static int fib_trie_seq_show(struct seq_file *seq, void *v)
+static int fib_route_seq_show(struct seq_file *seq, void *v)
{
+ struct leaf *l = v;
+ int i;
char bf[128];
if (v == SEQ_START_TOKEN) {
- if (trie_local)
- trie_dump_seq(seq, trie_local);
-
- if (trie_main)
- trie_dump_seq(seq, trie_main);
+ seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
+ "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
+ "\tWindow\tIRTT");
+ return 0;
}
- else {
- snprintf(bf, sizeof(bf),
- "*\t%08X\t%08X", 200, 400);
- seq_printf(seq, "%-127s\n", bf);
+ if (IS_TNODE(l))
+ return 0;
+
+ for (i=32; i>=0; i--) {
+ struct leaf_info *li = find_leaf_info(l, i);
+ struct fib_alias *fa;
+ u32 mask, prefix;
+
+ if (!li)
+ continue;
+
+ mask = inet_make_mask(li->plen);
+ prefix = htonl(l->key);
+
+ list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+ const struct fib_info *fi = fa->fa_info;
+ unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
+
+ if (fa->fa_type == RTN_BROADCAST
+ || fa->fa_type == RTN_MULTICAST)
+ continue;
+
+ if (fi)
+ snprintf(bf, sizeof(bf),
+ "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+ fi->fib_dev ? fi->fib_dev->name : "*",
+ prefix,
+ fi->fib_nh->nh_gw, flags, 0, 0,
+ fi->fib_priority,
+ mask,
+ (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
+ fi->fib_window,
+ fi->fib_rtt >> 3);
+ else
+ snprintf(bf, sizeof(bf),
+ "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+ prefix, 0, flags, 0, 0, 0,
+ mask, 0, 0, 0);
+
+ seq_printf(seq, "%-127s\n", bf);
+ }
}
return 0;
}
-static struct seq_operations fib_trie_seq_ops = {
- .start = fib_trie_seq_start,
- .next = fib_trie_seq_next,
- .stop = fib_trie_seq_stop,
- .show = fib_trie_seq_show,
+static struct seq_operations fib_route_seq_ops = {
+ .start = fib_trie_seq_start,
+ .next = fib_trie_seq_next,
+ .stop = fib_trie_seq_stop,
+ .show = fib_route_seq_show,
};
-static int fib_trie_seq_open(struct inode *inode, struct file *file)
+static int fib_route_seq_open(struct inode *inode, struct file *file)
{
struct seq_file *seq;
int rc = -ENOMEM;
+ struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
- rc = seq_open(file, &fib_trie_seq_ops);
+ if (!s)
+ goto out;
+
+ rc = seq_open(file, &fib_route_seq_ops);
if (rc)
goto out_kfree;
- seq = file->private_data;
+ seq = file->private_data;
+ seq->private = s;
+ memset(s, 0, sizeof(*s));
out:
return rc;
out_kfree:
+ kfree(s);
goto out;
}
-static struct file_operations fib_trie_seq_fops = {
- .owner = THIS_MODULE,
- .open = fib_trie_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release= seq_release_private,
+static struct file_operations fib_route_fops = {
+ .owner = THIS_MODULE,
+ .open = fib_route_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
};
int __init fib_proc_init(void)
{
- if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
- return -ENOMEM;
+ if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
+ goto out1;
+
+ if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
+ goto out2;
+
+ if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
+ goto out3;
+
return 0;
+
+out3:
+ proc_net_remove("fib_triestat");
+out2:
+ proc_net_remove("fib_trie");
+out1:
+ return -ENOMEM;
}
void __init fib_proc_exit(void)
{
proc_net_remove("fib_trie");
+ proc_net_remove("fib_triestat");
+ proc_net_remove("route");
}
#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 279f57abfec..175e093ec56 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -114,7 +114,7 @@ struct icmp_bxm {
/*
* Statistics
*/
-DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics);
+DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly;
/* An array of errno for error messages from dest unreach. */
/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
@@ -188,7 +188,7 @@ struct icmp_err icmp_err_convert[] = {
/* Control parameters for ECHO replies. */
int sysctl_icmp_echo_ignore_all;
-int sysctl_icmp_echo_ignore_broadcasts;
+int sysctl_icmp_echo_ignore_broadcasts = 1;
/* Control parameter - ignore bogus broadcast responses? */
int sysctl_icmp_ignore_bogus_error_responses;
@@ -349,12 +349,12 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
{
struct sk_buff *skb;
- ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
- icmp_param->data_len+icmp_param->head_len,
- icmp_param->head_len,
- ipc, rt, MSG_DONTWAIT);
-
- if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
+ if (ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
+ icmp_param->data_len+icmp_param->head_len,
+ icmp_param->head_len,
+ ipc, rt, MSG_DONTWAIT) < 0)
+ ip_flush_pending_frames(icmp_socket->sk);
+ else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
struct icmphdr *icmph = skb->h.icmph;
unsigned int csum = 0;
struct sk_buff *skb1;
@@ -627,11 +627,10 @@ static void icmp_unreach(struct sk_buff *skb)
break;
case ICMP_FRAG_NEEDED:
if (ipv4_config.no_pmtu_disc) {
- LIMIT_NETDEBUG(
- printk(KERN_INFO "ICMP: %u.%u.%u.%u: "
+ LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: "
"fragmentation needed "
"and DF set.\n",
- NIPQUAD(iph->daddr)));
+ NIPQUAD(iph->daddr));
} else {
info = ip_rt_frag_needed(iph,
ntohs(icmph->un.frag.mtu));
@@ -640,10 +639,9 @@ static void icmp_unreach(struct sk_buff *skb)
}
break;
case ICMP_SR_FAILED:
- LIMIT_NETDEBUG(
- printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
+ LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
"Route Failed.\n",
- NIPQUAD(iph->daddr)));
+ NIPQUAD(iph->daddr));
break;
default:
break;
@@ -936,8 +934,7 @@ int icmp_rcv(struct sk_buff *skb)
case CHECKSUM_HW:
if (!(u16)csum_fold(skb->csum))
break;
- NETDEBUG(if (net_ratelimit())
- printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n");
case CHECKSUM_NONE:
if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
goto error;
@@ -1111,12 +1108,9 @@ void __init icmp_init(struct net_proto_family *ops)
struct inet_sock *inet;
int i;
- for (i = 0; i < NR_CPUS; i++) {
+ for_each_cpu(i) {
int err;
- if (!cpu_possible(i))
- continue;
-
err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP,
&per_cpu(__icmp_socket, i));
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 5088f90835a..c6247fc8406 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -904,7 +904,7 @@ int igmp_rcv(struct sk_buff *skb)
case IGMP_MTRACE_RESP:
break;
default:
- NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type));
+ NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type);
}
in_dev_put(in_dev);
kfree_skb(skb);
@@ -1323,7 +1323,7 @@ static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
}
if (dev) {
imr->imr_ifindex = dev->ifindex;
- idev = __in_dev_get(dev);
+ idev = __in_dev_get_rtnl(dev);
}
return idev;
}
@@ -1603,7 +1603,7 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
}
pmc->sources = NULL;
pmc->sfmode = MCAST_EXCLUDE;
- pmc->sfcount[MCAST_EXCLUDE] = 0;
+ pmc->sfcount[MCAST_INCLUDE] = 0;
pmc->sfcount[MCAST_EXCLUDE] = 1;
}
@@ -1908,8 +1908,11 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
goto done;
}
- } else
+ } else {
newpsl = NULL;
+ (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
+ msf->imsf_fmode, 0, NULL, 0);
+ }
psl = pmc->sflist;
if (psl) {
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
new file mode 100644
index 00000000000..94468a76c5b
--- /dev/null
+++ b/net/ipv4/inet_connection_sock.c
@@ -0,0 +1,641 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Support for INET connection oriented protocols.
+ *
+ * Authors: See the TCP sources
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/jhash.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp_states.h>
+#include <net/xfrm.h>
+
+#ifdef INET_CSK_DEBUG
+const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
+EXPORT_SYMBOL(inet_csk_timer_bug_msg);
+#endif
+
+/*
+ * This array holds the first and last local port number.
+ * For high-usage systems, use sysctl to change this to
+ * 32768-61000
+ */
+int sysctl_local_port_range[2] = { 1024, 4999 };
+
+/*
+ * Check whether binding sk into bucket tb would clash with one of the
+ * sockets that already own the bucket.  Returns non-zero on conflict.
+ */
+static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
+{
+	const u32 our_addr = inet_rcv_saddr(sk);
+	const int reuse = sk->sk_reuse;
+	struct hlist_node *node;
+	struct sock *sk2;
+
+	sk_for_each_bound(sk2, node, &tb->owners) {
+		u32 their_addr;
+
+		/* Never conflict with ourselves or with v6-only sockets. */
+		if (sk == sk2 || inet_v6_ipv6only(sk2))
+			continue;
+
+		/* Both bound to a device, and the devices differ. */
+		if (sk->sk_bound_dev_if && sk2->sk_bound_dev_if &&
+		    sk->sk_bound_dev_if != sk2->sk_bound_dev_if)
+			continue;
+
+		/* Both sides allow reuse and the owner is not listening. */
+		if (reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN)
+			continue;
+
+		/* A wildcard on either side, or equal addresses, conflicts. */
+		their_addr = inet_rcv_saddr(sk2);
+		if (!their_addr || !our_addr || their_addr == our_addr)
+			break;
+	}
+
+	/* node is only non-NULL here if the loop was left via break. */
+	return node != NULL;
+}
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ *
+ * @hashinfo: hash tables holding the bind buckets, the port rover and
+ *            the port allocation lock
+ * @sk:       socket that wants the port
+ * @snum:     requested port, or 0 to pick one from sysctl_local_port_range
+ *
+ * Returns 0 on success and 1 on failure (port range exhausted, bind
+ * conflict, or bind bucket allocation failure).  Bottom halves are
+ * disabled for the duration of the call.
+ */
+int inet_csk_get_port(struct inet_hashinfo *hashinfo,
+ struct sock *sk, unsigned short snum)
+{
+ struct inet_bind_hashbucket *head;
+ struct hlist_node *node;
+ struct inet_bind_bucket *tb;
+ int ret;
+
+ local_bh_disable();
+ if (!snum) {
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ int remaining = (high - low) + 1;
+ int rover;
+
+ spin_lock(&hashinfo->portalloc_lock);
+ if (hashinfo->port_rover < low)
+ rover = low;
+ else
+ rover = hashinfo->port_rover;
+ /* Try each port after the rover, wrapping from high back to low. */
+ do {
+ rover++;
+ if (rover > high)
+ rover = low;
+ head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
+ spin_lock(&head->lock);
+ inet_bind_bucket_for_each(tb, node, &head->chain)
+ if (tb->port == rover)
+ goto next;
+ /* No bucket exists for this port: leave with head->lock held. */
+ break;
+ next:
+ spin_unlock(&head->lock);
+ } while (--remaining > 0);
+ hashinfo->port_rover = rover;
+ spin_unlock(&hashinfo->portalloc_lock);
+
+ /* Exhausted local port range during search? It is not
+ * possible for us to be holding one of the bind hash
+ * locks if this test triggers, because if 'remaining'
+ * drops to zero, we broke out of the do/while loop at
+ * the top level, not from the 'break;' statement.
+ */
+ ret = 1;
+ if (remaining <= 0)
+ goto fail;
+
+ /* OK, here is the one we will use. HEAD is
+ * non-NULL and we hold its lock.
+ */
+ snum = rover;
+ } else {
+ head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
+ spin_lock(&head->lock);
+ inet_bind_bucket_for_each(tb, node, &head->chain)
+ if (tb->port == snum)
+ goto tb_found;
+ }
+ tb = NULL;
+ goto tb_not_found;
+tb_found:
+ if (!hlist_empty(&tb->owners)) {
+ /* sk_reuse above 1 skips the conflict check entirely. */
+ if (sk->sk_reuse > 1)
+ goto success;
+ if (tb->fastreuse > 0 &&
+ sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
+ goto success;
+ } else {
+ ret = 1;
+ if (inet_csk_bind_conflict(sk, tb))
+ goto fail_unlock;
+ }
+ }
+tb_not_found:
+ ret = 1;
+ if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
+ goto fail_unlock;
+ /* fastreuse stays set only while every owner has sk_reuse set and
+ * none of them was listening when it bound.
+ */
+ if (hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
+ tb->fastreuse = 1;
+ else
+ tb->fastreuse = 0;
+ } else if (tb->fastreuse &&
+ (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+ tb->fastreuse = 0;
+success:
+ if (!inet_csk(sk)->icsk_bind_hash)
+ inet_bind_hash(sk, tb, snum);
+ BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
+ ret = 0;
+
+fail_unlock:
+ spin_unlock(&head->lock);
+fail:
+ local_bh_enable();
+ return ret;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_get_port);
+
+/*
+ * Wait for an incoming connection, avoid race conditions. This must be called
+ * with the socket locked.
+ *
+ * @sk:    listening socket
+ * @timeo: remaining time to wait, as accepted by schedule_timeout()
+ *
+ * Returns 0 once the accept queue is non-empty, -EINVAL if the socket
+ * has left TCP_LISTEN, sock_intr_errno(timeo) if a signal is pending,
+ * or -EAGAIN when the timeout has run out.  The socket lock is dropped
+ * while sleeping and re-taken before returning.
+ */
+static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ DEFINE_WAIT(wait);
+ int err;
+
+ /*
+ * True wake-one mechanism for incoming connections: only
+ * one process gets woken up, not the 'whole herd'.
+ * Since we do not 'race & poll' for established sockets
+ * anymore, the common case will execute the loop only once.
+ *
+ * Subtle issue: "add_wait_queue_exclusive()" will be added
+ * after any current non-exclusive waiters, and we know that
+ * it will always _stay_ after any new non-exclusive waiters
+ * because all non-exclusive waiters are added at the
+ * beginning of the wait-queue. As such, it's ok to "drop"
+ * our exclusiveness temporarily when we get woken up without
+ * having to remove and re-insert us on the wait queue.
+ */
+ for (;;) {
+ prepare_to_wait_exclusive(sk->sk_sleep, &wait,
+ TASK_INTERRUPTIBLE);
+ release_sock(sk);
+ /* Sleep only if nothing was queued before we got on the wait queue. */
+ if (reqsk_queue_empty(&icsk->icsk_accept_queue))
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ /* Each test below pre-loads err with the value returned if it fires. */
+ err = 0;
+ if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
+ break;
+ err = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ break;
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ break;
+ err = -EAGAIN;
+ if (!timeo)
+ break;
+ }
+ finish_wait(sk->sk_sleep, &wait);
+ return err;
+}
+
+/*
+ * Accept the next outstanding connection on a listening socket.
+ *
+ * Returns the child socket on success.  On failure returns NULL and
+ * stores a negative errno in *err; *err is left untouched on success.
+ */
+struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sock *child;
+	int rc;
+
+	lock_sock(sk);
+
+	/* Only a listening socket can accept. */
+	rc = -EINVAL;
+	if (sk->sk_state != TCP_LISTEN)
+		goto fail;
+
+	/* Nothing established yet: either bail out or wait for one. */
+	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+		/* A zero timeout means the caller must not sleep. */
+		rc = -EAGAIN;
+		if (!timeo)
+			goto fail;
+
+		rc = inet_csk_wait_for_connect(sk, timeo);
+		if (rc)
+			goto fail;
+	}
+
+	child = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
+	BUG_TRAP(child->sk_state != TCP_SYN_RECV);
+	release_sock(sk);
+	return child;
+
+fail:
+	*err = rc;
+	release_sock(sk);
+	return NULL;
+}
+
+EXPORT_SYMBOL(inet_csk_accept);
+
+/*
+ * Initialise the retransmit, delayed-ack and keepalive timers of a
+ * connection socket.  Each timer gets its own handler and is handed
+ * the socket pointer as its data argument; pending state is cleared.
+ *
+ * Separate timers are used for the three purposes; a single timer
+ * with a list of expiry times might be a future optimisation.
+ */
+void inet_csk_init_xmit_timers(struct sock *sk,
+			       void (*retransmit_handler)(unsigned long),
+			       void (*delack_handler)(unsigned long),
+			       void (*keepalive_handler)(unsigned long))
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const unsigned long cookie = (unsigned long)sk;
+
+	init_timer(&icsk->icsk_retransmit_timer);
+	init_timer(&icsk->icsk_delack_timer);
+	init_timer(&sk->sk_timer);
+
+	icsk->icsk_retransmit_timer.function = retransmit_handler;
+	icsk->icsk_retransmit_timer.data = cookie;
+
+	icsk->icsk_delack_timer.function = delack_handler;
+	icsk->icsk_delack_timer.data = cookie;
+
+	sk->sk_timer.function = keepalive_handler;
+	sk->sk_timer.data = cookie;
+
+	icsk->icsk_ack.pending = 0;
+	icsk->icsk_pending = 0;
+}
+
+EXPORT_SYMBOL(inet_csk_init_xmit_timers);
+
+/*
+ * Clear the pending/blocked timer state of a connection socket and
+ * stop its retransmit, delayed-ack and keepalive timers.
+ */
+void inet_csk_clear_xmit_timers(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	icsk->icsk_ack.blocked = 0;
+	icsk->icsk_ack.pending = 0;
+	icsk->icsk_pending = 0;
+
+	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+	sk_stop_timer(sk, &icsk->icsk_delack_timer);
+	sk_stop_timer(sk, &sk->sk_timer);
+}
+
+EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
+
+/* Stop the socket's keepalive timer (sk->sk_timer). */
+void inet_csk_delete_keepalive_timer(struct sock *sk)
+{
+	struct timer_list *keepalive = &sk->sk_timer;
+
+	sk_stop_timer(sk, keepalive);
+}
+
+EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
+
+/* Re-arm the socket's keepalive timer to fire len jiffies from now. */
+void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
+{
+	const unsigned long expires = jiffies + len;
+
+	sk_reset_timer(sk, &sk->sk_timer, expires);
+}
+
+EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
+
+/*
+ * Look up the output route for answering a connection request.
+ *
+ * The destination is the request's remote address unless its IP options
+ * carry a source route, in which case the option's faddr is used.
+ * Returns the route's dst entry, or NULL (after bumping the
+ * OUTNOROUTES counter) when no route exists or when a strict source
+ * route is requested but the destination is not the gateway itself.
+ */
+struct dst_entry* inet_csk_route_req(struct sock *sk,
+				     const struct request_sock *req)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct ip_options *opt = inet_rsk(req)->opt;
+	const int source_routed = opt && opt->srr;
+	struct rtable *rt;
+	struct flowi fl = {
+		.oif = sk->sk_bound_dev_if,
+		.nl_u = {
+			.ip4_u = {
+				.daddr = source_routed ? opt->faddr
+						       : ireq->rmt_addr,
+				.saddr = ireq->loc_addr,
+				.tos = RT_CONN_FLAGS(sk),
+			},
+		},
+		.proto = sk->sk_protocol,
+		.uli_u = {
+			.ports = {
+				.sport = inet_sk(sk)->sport,
+				.dport = ireq->rmt_port,
+			},
+		},
+	};
+
+	if (ip_route_output_flow(&rt, &fl, sk, 0))
+		goto no_route;
+
+	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
+		ip_rt_put(rt);
+		goto no_route;
+	}
+	return &rt->u.dst;
+
+no_route:
+	IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+	return NULL;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_route_req);
+
+/*
+ * Hash a remote address/port pair into a SYN table slot.  The mask
+ * (synq_hsize - 1) selects the slot, so this presumably expects
+ * synq_hsize to be a power of two -- TODO confirm with the allocator.
+ */
+static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
+				 const u32 rnd, const u16 synq_hsize)
+{
+	const u32 slot_mask = synq_hsize - 1;
+	const u32 hash = jhash_2words(raddr, (u32)rport, rnd);
+
+	return hash & slot_mask;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
+#else
+#define AF_INET_FAMILY(fam) 1
+#endif
+
+/*
+ * Find the pending connection request matching the given remote
+ * port/address and local address in sk's SYN table.
+ *
+ * On a match the request is returned and *prevp is set to the link
+ * that points at it.  Returns NULL, leaving *prevp untouched, when
+ * nothing matches.
+ */
+struct request_sock *inet_csk_search_req(const struct sock *sk,
+					 struct request_sock ***prevp,
+					 const __u16 rport, const __u32 raddr,
+					 const __u32 laddr)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+	struct request_sock **link;
+	struct request_sock *cur;
+
+	link = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
+					       lopt->nr_table_entries)];
+
+	while ((cur = *link) != NULL) {
+		const struct inet_request_sock *ireq = inet_rsk(cur);
+
+		if (ireq->rmt_port == rport &&
+		    ireq->rmt_addr == raddr &&
+		    ireq->loc_addr == laddr &&
+		    AF_INET_FAMILY(cur->rsk_ops->family)) {
+			BUG_TRAP(!cur->sk);
+			*prevp = link;
+			return cur;
+		}
+		link = &cur->dl_next;
+	}
+
+	return NULL;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_search_req);
+
+/*
+ * Insert a connection request into sk's SYN table, in the slot chosen
+ * by hashing its remote address and port, then account for the
+ * addition via inet_csk_reqsk_queue_added().
+ */
+void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+				   const unsigned timeout)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+	const u32 slot = inet_synq_hash(inet_rsk(req)->rmt_addr,
+					inet_rsk(req)->rmt_port,
+					lopt->hash_rnd,
+					lopt->nr_table_entries);
+
+	reqsk_queue_hash_req(&icsk->icsk_accept_queue, slot, req, timeout);
+	inet_csk_reqsk_queue_added(sk, timeout);
+}
+
+/* Only thing we need from tcp.h */
+extern int sysctl_tcp_synack_retries;
+
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
+
+/*
+ * Walk part of the listener's SYN table, starting at the saved clock
+ * hand.  Expired requests either get their SYN-ACK retransmitted with
+ * a doubled (capped) timeout, or are unlinked and freed once they have
+ * used up their retries.
+ *
+ * @parent:   listening socket that owns the request queue
+ * @interval: period at which this function is re-run via the keepalive
+ *            timer while requests remain queued
+ * @timeout:  base retransmit timeout, shifted left by the retransmit count
+ * @max_rto:  upper bound applied to the computed timeout
+ */
+void inet_csk_reqsk_queue_prune(struct sock *parent,
+ const unsigned long interval,
+ const unsigned long timeout,
+ const unsigned long max_rto)
+{
+ struct inet_connection_sock *icsk = inet_csk(parent);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+ struct listen_sock *lopt = queue->listen_opt;
+ int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+ int thresh = max_retries;
+ unsigned long now = jiffies;
+ struct request_sock **reqp, *req;
+ int i, budget;
+
+ if (lopt == NULL || lopt->qlen == 0)
+ return;
+
+ /* Normally all the openreqs are young and become mature
+ * (i.e. converted to established socket) for first timeout.
+ * If synack was not acknowledged for 3 seconds, it means
+ * one of the following things: synack was lost, ack was lost,
+ * rtt is high or nobody planned to ack (i.e. synflood).
+ * When server is a bit loaded, queue is populated with old
+ * open requests, reducing effective size of queue.
+ * When server is well loaded, queue size reduces to zero
+ * after several minutes of work. It is not synflood,
+ * it is normal operation. The solution is pruning
+ * too old entries overriding normal timeout, when
+ * situation becomes dangerous.
+ *
+ * Essentially, we reserve half of room for young
+ * embrions; and abort old ones without pity, if old
+ * ones are about to clog our table.
+ */
+ /* Queue at least half full: lower thresh (never below 2) while the
+ * queue length is not below the doubling "young" estimate.
+ */
+ if (lopt->qlen>>(lopt->max_qlen_log-1)) {
+ int young = (lopt->qlen_young<<1);
+
+ while (thresh > 2) {
+ if (lopt->qlen < young)
+ break;
+ thresh--;
+ young <<= 1;
+ }
+ }
+
+ if (queue->rskq_defer_accept)
+ max_retries = queue->rskq_defer_accept;
+
+ /* Number of table slots to scan during this run. */
+ budget = 2 * (lopt->nr_table_entries / (timeout / interval));
+ i = lopt->clock_hand;
+
+ do {
+ reqp=&lopt->syn_table[i];
+ while ((req = *reqp) != NULL) {
+ if (time_after_eq(now, req->expires)) {
+ /* Keep the request if it still has retries left (acked
+ * requests get up to max_retries) and the SYN-ACK
+ * retransmit succeeds.
+ */
+ if ((req->retrans < thresh ||
+ (inet_rsk(req)->acked && req->retrans < max_retries))
+ && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
+ unsigned long timeo;
+
+ /* First retransmit: the request is no longer young. */
+ if (req->retrans++ == 0)
+ lopt->qlen_young--;
+ timeo = min((timeout << req->retrans), max_rto);
+ req->expires = now + timeo;
+ reqp = &req->dl_next;
+ continue;
+ }
+
+ /* Drop this request */
+ inet_csk_reqsk_queue_unlink(parent, req, reqp);
+ reqsk_queue_removed(queue, req);
+ reqsk_free(req);
+ continue;
+ }
+ reqp = &req->dl_next;
+ }
+
+ /* Advance to the next slot, wrapping via the mask. */
+ i = (i + 1) & (lopt->nr_table_entries - 1);
+
+ } while (--budget > 0);
+
+ /* Remember where to resume on the next run. */
+ lopt->clock_hand = i;
+
+ if (lopt->qlen)
+ inet_csk_reset_keepalive_timer(parent, interval);
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
+
+/*
+ * Clone a listening socket for the connection request req.
+ *
+ * The clone starts in TCP_SYN_RECV with no bind bucket, takes its
+ * destination port from the request, and has its retransmit counters
+ * reset.  Returns NULL if sk_clone() fails.
+ */
+struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
+			    const gfp_t priority)
+{
+	struct sock *newsk = sk_clone(sk, priority);
+	struct inet_connection_sock *newicsk;
+
+	if (newsk == NULL)
+		return NULL;
+
+	newicsk = inet_csk(newsk);
+
+	newsk->sk_state = TCP_SYN_RECV;
+	newicsk->icsk_bind_hash = NULL;
+
+	inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
+	newsk->sk_write_space = sk_stream_write_space;
+
+	newicsk->icsk_retransmits = 0;
+	newicsk->icsk_backoff = 0;
+	newicsk->icsk_probes_out = 0;
+
+	/* Zero the accept queue so stray use of it on the child is caught. */
+	memset(&newicsk->icsk_accept_queue, 0,
+	       sizeof(newicsk->icsk_accept_queue));
+
+	return newsk;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_clone);
+
+/*
+ * At this point, there should be no process reference to this
+ * socket, and thus no user references at all. Therefore we
+ * can assume the socket waitqueue is inactive and nobody will
+ * try to jump onto it.
+ *
+ * Runs the protocol's destroy hook, purges the socket's queues, frees
+ * its xfrm policy, decrements the protocol's orphan count and drops
+ * one reference on the socket.
+ */
+void inet_csk_destroy_sock(struct sock *sk)
+{
+ /* Sanity checks only: BUG_TRAP reports but the teardown continues. */
+ BUG_TRAP(sk->sk_state == TCP_CLOSE);
+ BUG_TRAP(sock_flag(sk, SOCK_DEAD));
+
+ /* It cannot be in hash table! */
+ BUG_TRAP(sk_unhashed(sk));
+
+ /* If it has not 0 inet_sk(sk)->num, it must be bound */
+ BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash);
+
+ sk->sk_prot->destroy(sk);
+
+ sk_stream_kill_queues(sk);
+
+ xfrm_sk_free_policy(sk);
+
+ sk_refcnt_debug_release(sk);
+
+ atomic_dec(sk->sk_prot->orphan_count);
+ sock_put(sk);
+}
+
+EXPORT_SYMBOL(inet_csk_destroy_sock);
+
+/*
+ * Move a socket into the listening state.
+ *
+ * Allocates the request queue with nr_table_entries slots, resets the
+ * backlog counters, and asks the protocol to validate the local port.
+ * Returns 0 on success, the reqsk_queue_alloc() error if allocation
+ * fails, or -EADDRINUSE if get_port() refuses the port (the socket is
+ * then put back into TCP_CLOSE and the request queue destroyed).
+ */
+int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int rc;
+
+	rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+	if (rc != 0)
+		return rc;
+
+	sk->sk_max_ack_backlog = 0;
+	sk->sk_ack_backlog = 0;
+	inet_csk_delack_init(sk);
+
+	/* We announce ourselves as listening before get_port() has
+	 * validated the transition.  That window is harmless because
+	 * the socket is only hashed once validation has succeeded.
+	 */
+	sk->sk_state = TCP_LISTEN;
+	if (sk->sk_prot->get_port(sk, inet->num)) {
+		sk->sk_state = TCP_CLOSE;
+		__reqsk_queue_destroy(&icsk->icsk_accept_queue);
+		return -EADDRINUSE;
+	}
+
+	inet->sport = htons(inet->num);
+
+	sk_dst_reset(sk);
+	sk->sk_prot->hash(sk);
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+
+/*
+ * This routine closes sockets which have been at least partially
+ * opened, but not yet accepted.
+ *
+ * It stops the keepalive timer, detaches the accept queue, destroys
+ * the request queue, and then disconnects, orphans and destroys every
+ * child socket that was waiting to be accepted.
+ */
+void inet_csk_listen_stop(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock *acc_req;
+ struct request_sock *req;
+
+ inet_csk_delete_keepalive_timer(sk);
+
+ /* make all the listen_opt local to us */
+ acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+
+ /* Following specs, it would be better either to send FIN
+ * (and enter FIN-WAIT-1, it is normal close)
+ * or to send active reset (abort).
+ * Certainly, it is pretty dangerous while synflood, but it is
+ * bad justification for our negligence 8)
+ * To be honest, we are not able to make either
+ * of the variants now. --ANK
+ */
+ reqsk_queue_destroy(&icsk->icsk_accept_queue);
+
+ while ((req = acc_req) != NULL) {
+ struct sock *child = req->sk;
+
+ /* Fetch the next request first; req is freed at the end of the loop. */
+ acc_req = req->dl_next;
+
+ local_bh_disable();
+ bh_lock_sock(child);
+ BUG_TRAP(!sock_owned_by_user(child));
+ /* Extra reference, dropped by the sock_put() after unlocking below. */
+ sock_hold(child);
+
+ sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+ sock_orphan(child);
+
+ /* Balanced by the atomic_dec() in inet_csk_destroy_sock(). */
+ atomic_inc(sk->sk_prot->orphan_count);
+
+ inet_csk_destroy_sock(child);
+
+ bh_unlock_sock(child);
+ local_bh_enable();
+ sock_put(child);
+
+ sk_acceptq_removed(sk);
+ __reqsk_free(req);
+ }
+ BUG_TRAP(!sk->sk_ack_backlog);
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
new file mode 100644
index 00000000000..71f3c7350c6
--- /dev/null
+++ b/net/ipv4/inet_diag.c
@@ -0,0 +1,868 @@
+/*
+ * inet_diag.c Module for monitoring INET transport protocols sockets.
+ *
+ * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/random.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/time.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/inet6_hashtables.h>
+
+#include <linux/inet.h>
+#include <linux/stddef.h>
+
+#include <linux/inet_diag.h>
+
+static const struct inet_diag_handler **inet_diag_table;
+
+/* Per-socket values that the filter bytecode in inet_diag_bc_run() tests. */
+struct inet_diag_entry {
+ u32 *saddr; /* local address words (one for IPv4, four for IPv6) */
+ u32 *daddr; /* remote address words, same layout as saddr */
+ u16 sport; /* local port; filled from inet->num by the dump code */
+ u16 dport; /* remote port; filled via ntohs() by the dump code */
+ u16 family; /* sk->sk_family of the socket */
+ u16 userlocks; /* sk->sk_userlocks of the socket */
+};
+
+static struct sock *idiagnl;
+
+#define INET_DIAG_PUT(skb, attrtype, attrlen) \
+ RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
+
+/*
+ * Append one inet_diag_msg describing sk to skb, plus whichever of the
+ * optional attributes (meminfo, protocol info, congestion control
+ * name) are selected by the bits in ext.
+ *
+ * @skb:         reply buffer being filled
+ * @sk:          socket to describe; may be a time-wait socket
+ * @ext:         bitmask of requested INET_DIAG_* extensions
+ * @pid, @seq:   netlink addressing for the reply header
+ * @nlmsg_flags: flags stored in the reply header
+ * @unlh:        request header; its nlmsg_type selects the handler
+ *
+ * Returns skb->len on success.  On failure the skb is trimmed back to
+ * its length on entry and -1 is returned.
+ */
+static int inet_diag_fill(struct sk_buff *skb, struct sock *sk,
+ int ext, u32 pid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ void *info = NULL;
+ struct inet_diag_meminfo *minfo = NULL;
+ unsigned char *b = skb->tail;
+ const struct inet_diag_handler *handler;
+
+ handler = inet_diag_table[unlh->nlmsg_type];
+ BUG_ON(handler == NULL);
+
+ /* NOTE(review): NLMSG_PUT and __RTA_PUT presumably jump to the
+ * nlmsg_failure / rtattr_failure labels below when skb lacks room.
+ */
+ nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
+ nlh->nlmsg_flags = nlmsg_flags;
+
+ r = NLMSG_DATA(nlh);
+ /* Optional attributes are only emitted for non time-wait sockets. */
+ if (sk->sk_state != TCP_TIME_WAIT) {
+ if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
+ minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO,
+ sizeof(*minfo));
+ if (ext & (1 << (INET_DIAG_INFO - 1)))
+ info = INET_DIAG_PUT(skb, INET_DIAG_INFO,
+ handler->idiag_info_size);
+
+ if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
+ size_t len = strlen(icsk->icsk_ca_ops->name);
+ strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
+ icsk->icsk_ca_ops->name);
+ }
+ }
+ r->idiag_family = sk->sk_family;
+ r->idiag_state = sk->sk_state;
+ r->idiag_timer = 0;
+ r->idiag_retrans = 0;
+
+ r->id.idiag_if = sk->sk_bound_dev_if;
+ /* The cookie is the socket pointer split into two 32-bit halves; the
+ * upper half is shifted in two steps, presumably so the expression
+ * stays well defined when unsigned long is only 32 bits wide.
+ */
+ r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
+ r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+
+ /* Time-wait sockets are reported from their inet_timewait_sock. */
+ if (r->idiag_state == TCP_TIME_WAIT) {
+ const struct inet_timewait_sock *tw = inet_twsk(sk);
+ long tmo = tw->tw_ttd - jiffies;
+ if (tmo < 0)
+ tmo = 0;
+
+ r->id.idiag_sport = tw->tw_sport;
+ r->id.idiag_dport = tw->tw_dport;
+ r->id.idiag_src[0] = tw->tw_rcv_saddr;
+ r->id.idiag_dst[0] = tw->tw_daddr;
+ r->idiag_state = tw->tw_substate;
+ r->idiag_timer = 3;
+ /* Remaining time in milliseconds, rounded up. */
+ r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ;
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ if (r->idiag_family == AF_INET6) {
+ const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
+
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+ &tcp6tw->tw_v6_rcv_saddr);
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+ &tcp6tw->tw_v6_daddr);
+ }
+#endif
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+ }
+
+ r->id.idiag_sport = inet->sport;
+ r->id.idiag_dport = inet->dport;
+ r->id.idiag_src[0] = inet->rcv_saddr;
+ r->id.idiag_dst[0] = inet->daddr;
+
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ if (r->idiag_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+ &np->rcv_saddr);
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+ &np->daddr);
+ }
+#endif
+
+/* NOTE(review): the expansion is not wrapped in outer parentheses; that is
+ * harmless for the plain assignments below but fragile if reused.
+ */
+#define EXPIRES_IN_MS(tmo) ((tmo - jiffies) * 1000 + HZ - 1) / HZ
+
+ /* idiag_timer: 1 = retransmit, 4 = zero-window probe, 2 = sk_timer
+ * pending, 0 = none (3 is used for time-wait above).
+ */
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+ r->idiag_timer = 1;
+ r->idiag_retrans = icsk->icsk_retransmits;
+ r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+ } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+ r->idiag_timer = 4;
+ r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+ } else if (timer_pending(&sk->sk_timer)) {
+ r->idiag_timer = 2;
+ r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
+ } else {
+ r->idiag_timer = 0;
+ r->idiag_expires = 0;
+ }
+#undef EXPIRES_IN_MS
+
+ r->idiag_uid = sock_i_uid(sk);
+ r->idiag_inode = sock_i_ino(sk);
+
+ if (minfo) {
+ minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc);
+ minfo->idiag_wmem = sk->sk_wmem_queued;
+ minfo->idiag_fmem = sk->sk_forward_alloc;
+ minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc);
+ }
+
+ /* Protocol-specific part; info is NULL unless INET_DIAG_INFO was requested. */
+ handler->idiag_get_info(sk, r, info);
+
+ if (sk->sk_state < TCP_TIME_WAIT &&
+ icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
+ icsk->icsk_ca_ops->get_info(sk, ext, skb);
+
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+rtattr_failure:
+nlmsg_failure:
+ /* Discard everything appended since entry. */
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+/*
+ * Answer a request for exactly one socket, identified by the address,
+ * port and interface fields of the inet_diag_req carried in nlh.
+ *
+ * Returns 0 on success, -EINVAL for an unsupported address family,
+ * -ENOENT if no socket matches, -ESTALE if the request carries a
+ * cookie that does not match the socket found, -ENOMEM if the reply
+ * skb cannot be allocated, or the negative result of netlink_unicast().
+ */
+static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
+{
+ int err;
+ struct sock *sk;
+ struct inet_diag_req *req = NLMSG_DATA(nlh);
+ struct sk_buff *rep;
+ struct inet_hashinfo *hashinfo;
+ const struct inet_diag_handler *handler;
+
+ handler = inet_diag_table[nlh->nlmsg_type];
+ BUG_ON(handler == NULL);
+ hashinfo = handler->idiag_hashinfo;
+
+ if (req->idiag_family == AF_INET) {
+ sk = inet_lookup(hashinfo, req->id.idiag_dst[0],
+ req->id.idiag_dport, req->id.idiag_src[0],
+ req->id.idiag_sport, req->id.idiag_if);
+ }
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ else if (req->idiag_family == AF_INET6) {
+ sk = inet6_lookup(hashinfo,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if);
+ }
+#endif
+ else {
+ return -EINVAL;
+ }
+
+ if (sk == NULL)
+ return -ENOENT;
+
+ /* If the request carries a cookie, it must equal the socket pointer
+ * split into two 32-bit halves.
+ */
+ err = -ESTALE;
+ if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE ||
+ req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) &&
+ ((u32)(unsigned long)sk != req->id.idiag_cookie[0] ||
+ (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1]))
+ goto out;
+
+ /* Room for the message, meminfo, handler info and 64 spare bytes. */
+ err = -ENOMEM;
+ rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
+ sizeof(struct inet_diag_meminfo) +
+ handler->idiag_info_size + 64)),
+ GFP_KERNEL);
+ if (!rep)
+ goto out;
+
+ /* The reply skb was sized above, so a fill failure is treated as a bug. */
+ if (inet_diag_fill(rep, sk, req->idiag_ext,
+ NETLINK_CB(in_skb).pid,
+ nlh->nlmsg_seq, 0, nlh) <= 0)
+ BUG();
+
+ err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid,
+ MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+
+out:
+ /* Drop the reference on sk, using the put that matches its kind. */
+ if (sk) {
+ if (sk->sk_state == TCP_TIME_WAIT)
+ inet_twsk_put((struct inet_timewait_sock *)sk);
+ else
+ sock_put(sk);
+ }
+ return err;
+}
+
+/*
+ * Compare the leading `bits` bits of two arrays of 32-bit words.
+ * Whole words are compared with memcmp(); the remaining high-order
+ * bits of the next word are compared under a mask built with htonl().
+ * Returns 1 if the prefixes are equal, 0 otherwise.
+ */
+static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
+{
+	const int full_words = bits >> 5;
+	const int tail_bits = bits & 0x1f;
+
+	if (full_words && memcmp(a1, a2, full_words << 2))
+		return 0;
+
+	if (tail_bits) {
+		const __u32 w1 = a1[full_words];
+		const __u32 w2 = a2[full_words];
+		const __u32 mask = htonl((0xffffffff) << (32 - tail_bits));
+
+		if ((w1 ^ w2) & mask)
+			return 0;
+	}
+
+	return 1;
+}
+
+
+/*
+ * Run the filter bytecode bc (len bytes) against one socket entry.
+ *
+ * Each op is evaluated to a yes/no result; the program then advances
+ * by op->yes or op->no bytes accordingly.  Returns non-zero (accept)
+ * only if the program ends with exactly zero bytes left; it rejects
+ * when a jump overshoots the end.
+ *
+ * For the port comparison ops the operand is read from the `no` field
+ * of the following op slot (op[1].no).
+ */
+static int inet_diag_bc_run(const void *bc, int len,
+			    const struct inet_diag_entry *entry)
+{
+	while (len > 0) {
+		int yes = 1;
+		const struct inet_diag_bc_op *op = bc;
+
+		switch (op->code) {
+		case INET_DIAG_BC_NOP:
+			break;
+		case INET_DIAG_BC_JMP:
+			/* Unconditional jump: always take the "no" offset. */
+			yes = 0;
+			break;
+		case INET_DIAG_BC_S_GE:
+			yes = entry->sport >= op[1].no;
+			break;
+		case INET_DIAG_BC_S_LE:
+			/* Source-port test: compare sport, not dport. */
+			yes = entry->sport <= op[1].no;
+			break;
+		case INET_DIAG_BC_D_GE:
+			yes = entry->dport >= op[1].no;
+			break;
+		case INET_DIAG_BC_D_LE:
+			yes = entry->dport <= op[1].no;
+			break;
+		case INET_DIAG_BC_AUTO:
+			yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
+			break;
+		case INET_DIAG_BC_S_COND:
+		case INET_DIAG_BC_D_COND: {
+			struct inet_diag_hostcond *cond;
+			u32 *addr;
+
+			/* The host condition follows the op header. */
+			cond = (struct inet_diag_hostcond *)(op + 1);
+			if (cond->port != -1 &&
+			    cond->port != (op->code == INET_DIAG_BC_S_COND ?
+					   entry->sport : entry->dport)) {
+				yes = 0;
+				break;
+			}
+
+			/* Zero prefix length matches any address. */
+			if (cond->prefix_len == 0)
+				break;
+
+			if (op->code == INET_DIAG_BC_S_COND)
+				addr = entry->saddr;
+			else
+				addr = entry->daddr;
+
+			if (bitstring_match(addr, cond->addr, cond->prefix_len))
+				break;
+			/* IPv4 condition against an IPv6 socket: also accept
+			 * an address of the form ::ffff:a.b.c.d whose last
+			 * word matches.
+			 */
+			if (entry->family == AF_INET6 &&
+			    cond->family == AF_INET) {
+				if (addr[0] == 0 && addr[1] == 0 &&
+				    addr[2] == htonl(0xffff) &&
+				    bitstring_match(addr + 3, cond->addr,
+						    cond->prefix_len))
+					break;
+			}
+			yes = 0;
+			break;
+		}
+		}
+
+		if (yes) {
+			len -= op->yes;
+			bc += op->yes;
+		} else {
+			len -= op->no;
+			bc += op->no;
+		}
+	}
+	return (len == 0);
+}
+
+/*
+ * Check that a jump landing with cc bytes remaining hits an op
+ * boundary: follow the chain of op->yes offsets from the start of the
+ * bytecode and return 1 if some op begins with exactly cc bytes left,
+ * 0 otherwise.
+ */
+static int valid_cc(const void *bc, int len, int cc)
+{
+	for (; len >= 0; ) {
+		const struct inet_diag_bc_op *op = bc;
+
+		/* At or past the target: it is valid only on an exact hit. */
+		if (cc >= len)
+			return cc == len;
+
+		/* An op shorter than 4 bytes is rejected. */
+		if (op->yes < 4)
+			return 0;
+
+		bc += op->yes;
+		len -= op->yes;
+	}
+	return 0;
+}
+
+/*
+ * Validate user-supplied filter bytecode before it is run.
+ *
+ * Walks the ops along their "yes" offsets, checking that each opcode
+ * is known, that the yes/no offsets are at least 4 and at most
+ * len + 4, and that a "no" jump landing inside the program hits an op
+ * boundary.  Returns 0 if the walk ends exactly at the end of the
+ * bytecode, -EINVAL otherwise.
+ */
+static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
+{
+ const unsigned char *bc = bytecode;
+ int len = bytecode_len;
+
+ while (len > 0) {
+ struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc;
+
+//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
+ switch (op->code) {
+ case INET_DIAG_BC_AUTO:
+ case INET_DIAG_BC_S_COND:
+ case INET_DIAG_BC_D_COND:
+ case INET_DIAG_BC_S_GE:
+ case INET_DIAG_BC_S_LE:
+ case INET_DIAG_BC_D_GE:
+ case INET_DIAG_BC_D_LE:
+ if (op->yes < 4 || op->yes > len + 4)
+ return -EINVAL;
+ /* fall through: conditional ops also need their "no" offset checked */
+ case INET_DIAG_BC_JMP:
+ if (op->no < 4 || op->no > len + 4)
+ return -EINVAL;
+ /* A "no" target inside the program must start an op. */
+ if (op->no < len &&
+ !valid_cc(bytecode, bytecode_len, len - op->no))
+ return -EINVAL;
+ break;
+ case INET_DIAG_BC_NOP:
+ if (op->yes < 4 || op->yes > len + 4)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ bc += op->yes;
+ len -= op->yes;
+ }
+ return len == 0 ? 0 : -EINVAL;
+}
+
+/*
+ * Dump one socket as part of a multi-part reply.
+ *
+ * If the request is long enough to carry filter bytecode after the
+ * inet_diag_req, the socket is first run through inet_diag_bc_run()
+ * and silently skipped (return 0) when the filter rejects it.
+ * Otherwise returns the result of inet_diag_fill().
+ */
+static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk,
+ struct netlink_callback *cb)
+{
+ struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+
+ if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+ struct inet_diag_entry entry;
+ /* The bytecode attribute directly follows the request structure. */
+ struct rtattr *bc = (struct rtattr *)(r + 1);
+ struct inet_sock *inet = inet_sk(sk);
+
+ entry.family = sk->sk_family;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ if (entry.family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ entry.saddr = np->rcv_saddr.s6_addr32;
+ entry.daddr = np->daddr.s6_addr32;
+ } else
+#endif
+ {
+ entry.saddr = &inet->rcv_saddr;
+ entry.daddr = &inet->daddr;
+ }
+ /* sport is taken from inet->num; dport is converted with ntohs(). */
+ entry.sport = inet->num;
+ entry.dport = ntohs(inet->dport);
+ entry.userlocks = sk->sk_userlocks;
+
+ if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
+ return 0;
+ }
+
+ return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+}
+
+/*
+ * Append one inet_diag_msg describing a pending connection request
+ * (reported in state TCP_SYN_RECV) of listening socket sk to skb.
+ *
+ * Returns skb->len on success.  On failure the skb is trimmed back to
+ * its length on entry and -1 is returned.
+ */
+static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
+ struct request_sock *req,
+ u32 pid, u32 seq,
+ const struct nlmsghdr *unlh)
+{
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct inet_sock *inet = inet_sk(sk);
+ unsigned char *b = skb->tail;
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ /* NOTE(review): NLMSG_PUT presumably jumps to nlmsg_failure below
+ * when skb lacks room.
+ */
+ nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
+ nlh->nlmsg_flags = NLM_F_MULTI;
+ r = NLMSG_DATA(nlh);
+
+ r->idiag_family = sk->sk_family;
+ r->idiag_state = TCP_SYN_RECV;
+ r->idiag_timer = 1;
+ r->idiag_retrans = req->retrans;
+
+ r->id.idiag_if = sk->sk_bound_dev_if;
+ /* The cookie identifies the request, not the listening socket. */
+ r->id.idiag_cookie[0] = (u32)(unsigned long)req;
+ r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
+
+ /* Time left until the request expires, clamped at zero. */
+ tmo = req->expires - jiffies;
+ if (tmo < 0)
+ tmo = 0;
+
+ r->id.idiag_sport = inet->sport;
+ r->id.idiag_dport = ireq->rmt_port;
+ r->id.idiag_src[0] = ireq->loc_addr;
+ r->id.idiag_dst[0] = ireq->rmt_addr;
+ r->idiag_expires = jiffies_to_msecs(tmo);
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = sock_i_uid(sk);
+ r->idiag_inode = 0;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ if (r->idiag_family == AF_INET6) {
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+ &tcp6_rsk(req)->loc_addr);
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+ &tcp6_rsk(req)->rmt_addr);
+ }
+#endif
+ nlh->nlmsg_len = skb->tail - b;
+
+ return skb->len;
+
+nlmsg_failure:
+ /* Discard everything appended since entry. */
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
+ struct netlink_callback *cb)
+{ /*
+ * Dump the pending connection requests queued on listener 'sk'.
+ *
+ * The listener's syn_table is walked with syn_wait_lock read-held and
+ * bottom halves disabled. Each request that matches the requested
+ * destination port (if any) and the optional filter attribute is emitted
+ * with inet_diag_fill_req().
+ *
+ * Resume state: cb->args[3] holds the syn_table slot to restart from,
+ * stored as slot + 1, and cb->args[4] the index of the request inside
+ * that slot. Both are written only when a fill fails.
+ *
+ * Returns 0, or the negative value returned by inet_diag_fill_req().
+ */
+ struct inet_diag_entry entry;
+ struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt;
+ struct rtattr *bc = NULL;
+ struct inet_sock *inet = inet_sk(sk);
+ int j, s_j;
+ int reqnum, s_reqnum;
+ int err = 0;
+
+ s_j = cb->args[3];
+ s_reqnum = cb->args[4];
+
+ if (s_j > 0)
+ s_j--; /* undo the + 1 applied when the slot was saved */
+
+ entry.family = sk->sk_family;
+
+ read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+ lopt = icsk->icsk_accept_queue.listen_opt;
+ if (!lopt || !lopt->qlen)
+ goto out;
+
+ if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { /* request carries a filter attribute */
+ bc = (struct rtattr *)(r + 1);
+ entry.sport = inet->num;
+ entry.userlocks = sk->sk_userlocks;
+ }
+
+ for (j = s_j; j < lopt->nr_table_entries; j++) {
+ struct request_sock *req, *head = lopt->syn_table[j];
+
+ reqnum = 0;
+ for (req = head; req; reqnum++, req = req->dl_next) {
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ if (reqnum < s_reqnum)
+ continue;
+ if (r->id.idiag_dport != ireq->rmt_port &&
+ r->id.idiag_dport)
+ continue;
+
+ if (bc) {
+ entry.saddr =
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ (entry.family == AF_INET6) ?
+ tcp6_rsk(req)->loc_addr.s6_addr32 :
+#endif
+ &ireq->loc_addr;
+ entry.daddr =
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ (entry.family == AF_INET6) ?
+ tcp6_rsk(req)->rmt_addr.s6_addr32 :
+#endif
+ &ireq->rmt_addr;
+ entry.dport = ntohs(ireq->rmt_port);
+
+ if (!inet_diag_bc_run(RTA_DATA(bc),
+ RTA_PAYLOAD(bc), &entry))
+ continue;
+ }
+
+ err = inet_diag_fill_req(skb, sk, req,
+ NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, cb->nlh);
+ if (err < 0) {
+ cb->args[3] = j + 1; /* saved with + 1 so that 0 means "no saved slot" */
+ cb->args[4] = reqnum;
+ goto out;
+ }
+ }
+
+ s_reqnum = 0; /* the saved request index applies only to the first slot walked */
+ }
+
+out:
+ read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+ return err;
+}
+
+static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{ /*
+ * Netlink dump callback: walk the socket hash tables of the handler
+ * registered for this message type and emit every socket that matches
+ * the request's state mask and port selectors.
+ *
+ * Resume state kept in cb->args[]:
+ * [0] 0 while the listening hash is still being walked, 1 afterwards
+ * [1] hash bucket to restart from
+ * [2] position inside that bucket
+ * [3], [4] owned by inet_diag_dump_reqs() for the current listener
+ *
+ * Always returns skb->len.
+ */
+ int i, num;
+ int s_i, s_num;
+ struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+ const struct inet_diag_handler *handler;
+ struct inet_hashinfo *hashinfo;
+
+ handler = inet_diag_table[cb->nlh->nlmsg_type];
+ BUG_ON(handler == NULL);
+ hashinfo = handler->idiag_hashinfo;
+
+ s_i = cb->args[1];
+ s_num = num = cb->args[2];
+
+ if (cb->args[0] == 0) { /* phase 0: listening sockets and their pending requests */
+ if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
+ goto skip_listen_ht;
+
+ inet_listen_lock(hashinfo);
+ for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
+ struct sock *sk;
+ struct hlist_node *node;
+
+ num = 0;
+ sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num) {
+ num++;
+ continue;
+ }
+
+ if (r->id.idiag_sport != inet->sport &&
+ r->id.idiag_sport)
+ goto next_listen;
+
+ if (!(r->idiag_states & TCPF_LISTEN) ||
+ r->id.idiag_dport ||
+ cb->args[3] > 0) /* non-zero args[3]: a request dump of this listener is being resumed, so skip the listener entry itself */
+ goto syn_recv;
+
+ if (inet_diag_dump_sock(skb, sk, cb) < 0) {
+ inet_listen_unlock(hashinfo);
+ goto done;
+ }
+
+syn_recv:
+ if (!(r->idiag_states & TCPF_SYN_RECV))
+ goto next_listen;
+
+ if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
+ inet_listen_unlock(hashinfo);
+ goto done;
+ }
+
+next_listen:
+ cb->args[3] = 0;
+ cb->args[4] = 0;
+ ++num;
+ }
+
+ s_num = 0;
+ cb->args[3] = 0;
+ cb->args[4] = 0;
+ }
+ inet_listen_unlock(hashinfo);
+skip_listen_ht:
+ cb->args[0] = 1; /* listening phase finished; later calls go straight to the ehash walk */
+ s_i = num = s_num = 0;
+ }
+
+ if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) /* no state other than LISTEN/SYN_RECV requested */
+ return skb->len;
+
+ for (i = s_i; i < hashinfo->ehash_size; i++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+ struct sock *sk;
+ struct hlist_node *node;
+
+ if (i > s_i)
+ s_num = 0; /* the saved position applies only to the first bucket walked */
+
+ read_lock_bh(&head->lock);
+
+ num = 0;
+ sk_for_each(sk, node, &head->chain) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num)
+ goto next_normal;
+ if (!(r->idiag_states & (1 << sk->sk_state)))
+ goto next_normal;
+ if (r->id.idiag_sport != inet->sport &&
+ r->id.idiag_sport)
+ goto next_normal;
+ if (r->id.idiag_dport != inet->dport && r->id.idiag_dport)
+ goto next_normal;
+ if (inet_diag_dump_sock(skb, sk, cb) < 0) {
+ read_unlock_bh(&head->lock);
+ goto done;
+ }
+next_normal:
+ ++num;
+ }
+
+ if (r->idiag_states & TCPF_TIME_WAIT) { /* second chain at index i + ehash_size, walked under the same bucket lock; num keeps counting across both chains */
+ sk_for_each(sk, node,
+ &hashinfo->ehash[i + hashinfo->ehash_size].chain) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num)
+ goto next_dying;
+ if (r->id.idiag_sport != inet->sport &&
+ r->id.idiag_sport)
+ goto next_dying;
+ if (r->id.idiag_dport != inet->dport &&
+ r->id.idiag_dport)
+ goto next_dying;
+ if (inet_diag_dump_sock(skb, sk, cb) < 0) {
+ read_unlock_bh(&head->lock);
+ goto done;
+ }
+next_dying:
+ ++num;
+ }
+ }
+ read_unlock_bh(&head->lock);
+ }
+
+done:
+ cb->args[1] = i; /* remember bucket and in-bucket position for the next call */
+ cb->args[2] = num;
+ return skb->len;
+}
+
+static int inet_diag_dump_done(struct netlink_callback *cb)
+{ /* Completion callback handed to netlink_dump_start(); does nothing and returns 0. */
+ return 0;
+}
+
+
+static __inline__ int
+inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{ /*
+ * Validate and dispatch one inet_diag netlink message.
+ *
+ * Returns 0 for messages without NLM_F_REQUEST, -EINVAL when the type is
+ * out of range, the skb is too short for an inet_diag_req or the
+ * optional bytecode attribute is malformed, and -ENOENT when no handler
+ * is registered for the type. Otherwise returns the result of
+ * netlink_dump_start() (NLM_F_DUMP set) or inet_diag_get_exact().
+ */
+ if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
+ return 0;
+
+ if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX)
+ goto err_inval;
+
+ if (inet_diag_table[nlh->nlmsg_type] == NULL)
+ return -ENOENT;
+
+ if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len)
+ goto err_inval;
+
+ if (nlh->nlmsg_flags&NLM_F_DUMP) {
+ if (nlh->nlmsg_len >
+ (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) { /* data follows the request: it must be a well-formed bytecode attribute */
+ struct rtattr *rta = (void *)(NLMSG_DATA(nlh) +
+ sizeof(struct inet_diag_req));
+ if (rta->rta_type != INET_DIAG_REQ_BYTECODE ||
+ rta->rta_len < 8 ||
+ rta->rta_len >
+ (nlh->nlmsg_len -
+ NLMSG_SPACE(sizeof(struct inet_diag_req)))) /* attribute may not extend past the end of the message */
+ goto err_inval;
+ if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
+ goto err_inval;
+ }
+ return netlink_dump_start(idiagnl, skb, nlh,
+ inet_diag_dump,
+ inet_diag_dump_done);
+ } else {
+ return inet_diag_get_exact(skb, nlh);
+ }
+
+err_inval:
+ return -EINVAL;
+}
+
+
+static inline void inet_diag_rcv_skb(struct sk_buff *skb)
+{
+ struct nlmsghdr *hdr;
+ int rc;
+
+ /* Not even room for a bare netlink header: ignore the skb. */
+ if (skb->len < NLMSG_SPACE(0))
+ return;
+
+ hdr = (struct nlmsghdr *)skb->data;
+
+ /* Ignore messages whose claimed length cannot be right. */
+ if (hdr->nlmsg_len < sizeof(*hdr) || skb->len < hdr->nlmsg_len)
+ return;
+
+ /* Acknowledge on error, or whenever the sender asked for an ACK. */
+ rc = inet_diag_rcv_msg(skb, hdr);
+ if (rc || hdr->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(skb, hdr, rc);
+}
+
+static void inet_diag_rcv(struct sock *sk, int len)
+{
+ /*
+ * Process at most as many skbs as were queued on entry, so messages
+ * arriving while we run are left for the next invocation.
+ */
+ unsigned int budget = skb_queue_len(&sk->sk_receive_queue);
+
+ for (; budget != 0; budget--) {
+ struct sk_buff *skb = skb_dequeue(&sk->sk_receive_queue);
+
+ if (!skb)
+ break;
+
+ inet_diag_rcv_skb(skb);
+ kfree_skb(skb);
+ }
+}
+
+static DEFINE_SPINLOCK(inet_diag_register_lock);
+
+int inet_diag_register(const struct inet_diag_handler *h)
+{
+ /*
+ * Install 'h' in the handler table slot selected by h->idiag_type.
+ * Returns 0 on success, -EINVAL if the type is out of range and
+ * -EEXIST if the slot is already occupied.
+ */
+ const __u16 slot = h->idiag_type;
+ int rc;
+
+ if (slot >= INET_DIAG_GETSOCK_MAX)
+ return -EINVAL;
+
+ spin_lock(&inet_diag_register_lock);
+ if (inet_diag_table[slot] != NULL) {
+ rc = -EEXIST;
+ } else {
+ inet_diag_table[slot] = h;
+ rc = 0;
+ }
+ spin_unlock(&inet_diag_register_lock);
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(inet_diag_register);
+
+void inet_diag_unregister(const struct inet_diag_handler *h)
+{ /*
+ * Clear the handler table slot selected by h->idiag_type. The slot is
+ * cleared unconditionally; it is not checked that 'h' is the handler
+ * currently stored there. Out-of-range types are ignored.
+ */
+ const __u16 type = h->idiag_type;
+
+ if (type >= INET_DIAG_GETSOCK_MAX)
+ return;
+
+ spin_lock(&inet_diag_register_lock);
+ inet_diag_table[type] = NULL;
+ spin_unlock(&inet_diag_register_lock);
+
+ synchronize_rcu(); /* NOTE(review): presumably waits out users of the old handler; no rcu_read_lock() reader is visible in this part of the file - confirm */
+}
+EXPORT_SYMBOL_GPL(inet_diag_unregister);
+
+static int __init inet_diag_init(void)
+{
+ /*
+ * Module init: allocate a zeroed handler table with one pointer per
+ * message type, then create the NETLINK_INET_DIAG kernel socket.
+ * Returns 0 on success and -ENOMEM if either step fails.
+ */
+ const int table_bytes = (INET_DIAG_GETSOCK_MAX *
+ sizeof(struct inet_diag_handler *));
+
+ inet_diag_table = kmalloc(table_bytes, GFP_KERNEL);
+ if (inet_diag_table == NULL)
+ return -ENOMEM;
+ memset(inet_diag_table, 0, table_bytes);
+
+ idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv,
+ THIS_MODULE);
+ if (idiagnl != NULL)
+ return 0;
+
+ /* No socket: give the table back before reporting failure. */
+ kfree(inet_diag_table);
+ return -ENOMEM;
+}
+
+static void __exit inet_diag_exit(void)
+{ /* Module exit: release the netlink socket, then free the handler table. */
+ sock_release(idiagnl->sk_socket);
+ kfree(inet_diag_table);
+}
+
+module_init(inet_diag_init);
+module_exit(inet_diag_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
new file mode 100644
index 00000000000..e8d29fe736d
--- /dev/null
+++ b/net/ipv4/inet_hashtables.c
@@ -0,0 +1,165 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic INET transport hashtables
+ *
+ * Authors: Lotsa people, from code originally in tcp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+
+/*
+ * Allocate and initialize a new local port bind bucket.
+ * The bindhash mutex for snum's hash chain must be held here.
+ *
+ * The bucket is allocated from 'cachep' with SLAB_ATOMIC, starts with an
+ * empty owners list and fastreuse cleared, and is linked at the front of
+ * head->chain. Returns the new bucket, or NULL if the allocation failed.
+ */
+struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep,
+ struct inet_bind_hashbucket *head,
+ const unsigned short snum)
+{
+ struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC);
+
+ if (tb != NULL) {
+ tb->port = snum;
+ tb->fastreuse = 0;
+ INIT_HLIST_HEAD(&tb->owners);
+ hlist_add_head(&tb->node, &head->chain);
+ }
+ return tb;
+}
+
+EXPORT_SYMBOL(inet_bind_bucket_create);
+
+/*
+ * Caller must hold hashbucket lock for this tb with local BH disabled
+ *
+ * Unlinks 'tb' from its hash chain and returns it to 'cachep', but only
+ * when its owners list is empty; a bucket that still has owners is left
+ * untouched.
+ */
+void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb)
+{
+ if (hlist_empty(&tb->owners)) {
+ __hlist_del(&tb->node);
+ kmem_cache_free(cachep, tb);
+ }
+}
+
+void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
+ const unsigned short snum)
+{ /*
+ * Bind 'sk' to local port 'snum': record the port in the inet socket,
+ * add the socket to the bucket's owners list and remember the bucket in
+ * icsk_bind_hash. No lock is taken here.
+ */
+ inet_sk(sk)->num = snum;
+ sk_add_bind_node(sk, &tb->owners);
+ inet_csk(sk)->icsk_bind_hash = tb;
+}
+
+EXPORT_SYMBOL(inet_bind_hash);
+
+/*
+ * Get rid of any references to a local port held by the given sock.
+ *
+ * Under the bind hash bucket lock for the socket's port: remove the
+ * socket from its bind bucket's owners list, clear icsk_bind_hash and
+ * inet->num, and let inet_bind_bucket_destroy() free the bucket if it has
+ * no owners left. Uses a plain spin_lock(); the only caller in this file,
+ * inet_put_port(), disables bottom halves around the call.
+ */
+static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+{
+ const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
+ struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
+ struct inet_bind_bucket *tb;
+
+ spin_lock(&head->lock);
+ tb = inet_csk(sk)->icsk_bind_hash;
+ __sk_del_bind_node(sk);
+ inet_csk(sk)->icsk_bind_hash = NULL;
+ inet_sk(sk)->num = 0;
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ spin_unlock(&head->lock);
+}
+
+void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+{ /* Wrapper that runs __inet_put_port() with bottom halves disabled. */
+ local_bh_disable();
+ __inet_put_port(hashinfo, sk);
+ local_bh_enable();
+}
+
+EXPORT_SYMBOL(inet_put_port);
+
+/*
+ * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
+ * Look, when several writers sleep and reader wakes them up, all but one
+ * immediately hit write lock and grab all the cpus. Exclusive sleep solves
+ * this, _but_ remember, it adds useless work on UP machines (wake up each
+ * exclusive lock release). It should be ifdefed really.
+ *
+ * Takes lhash_lock for writing and then waits, as an exclusive
+ * uninterruptible sleeper on lhash_wait, until lhash_users reads zero.
+ * The write lock is dropped around each schedule() and is held on return.
+ *
+ * NOTE(review): the initial acquisition is a plain write_lock() while the
+ * wait loop uses write_unlock_bh()/write_lock_bh(). Presumably callers
+ * enter with bottom halves already disabled - confirm against the callers.
+ */
+void inet_listen_wlock(struct inet_hashinfo *hashinfo)
+{
+ write_lock(&hashinfo->lhash_lock);
+
+ if (atomic_read(&hashinfo->lhash_users)) {
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait_exclusive(&hashinfo->lhash_wait,
+ &wait, TASK_UNINTERRUPTIBLE);
+ if (!atomic_read(&hashinfo->lhash_users))
+ break;
+ write_unlock_bh(&hashinfo->lhash_lock);
+ schedule();
+ write_lock_bh(&hashinfo->lhash_lock);
+ }
+
+ finish_wait(&hashinfo->lhash_wait, &wait);
+ }
+}
+
+EXPORT_SYMBOL(inet_listen_wlock);
+
+/*
+ * Don't inline this cruft. Here are some nice properties to exploit here. The
+ * BSD API does not allow a listening sock to specify the remote port nor the
+ * remote address for the connection. So always assume those are both
+ * wildcarded during the search since they can never be otherwise.
+ */
+struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr,
+ const unsigned short hnum, const int dif)
+{
+ struct sock *best = NULL, *sk;
+ const struct hlist_node *node;
+ int best_score = -1;
+
+ /*
+ * Score every candidate on the chain that is bound to port hnum and is
+ * not IPv6-only: 1 point for a PF_INET socket, 2 for a matching bound
+ * address, 2 for a matching bound device. A bound address or device
+ * that does not match disqualifies the socket.
+ */
+ sk_for_each(sk, node, head) {
+ const struct inet_sock *inet = inet_sk(sk);
+ __u32 bound_addr;
+ int score;
+
+ if (inet->num != hnum || ipv6_only_sock(sk))
+ continue;
+
+ score = (sk->sk_family == PF_INET) ? 1 : 0;
+
+ bound_addr = inet->rcv_saddr;
+ if (bound_addr) {
+ if (bound_addr != daddr)
+ continue;
+ score += 2;
+ }
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ continue;
+ score += 2;
+ }
+
+ /* 1 + 2 + 2 is the maximum: nothing can beat it, stop here. */
+ if (score == 5)
+ return sk;
+ if (score > best_score) {
+ best_score = score;
+ best = sk;
+ }
+ }
+ return best;
+}
+
+EXPORT_SYMBOL_GPL(__inet_lookup_listener);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
new file mode 100644
index 00000000000..a010e9a6881
--- /dev/null
+++ b/net/ipv4/inet_timewait_sock.c
@@ -0,0 +1,385 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic TIME_WAIT sockets functions
+ *
+ * From code originally in TCP
+ */
+
+#include <linux/config.h>
+
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+
+/*
+ * Must be called with locally disabled BHs.
+ *
+ * Unhash a timewait socket: remove it from the established hash (returning
+ * early if it is already unhashed), detach it from its bind bucket, which
+ * is destroyed if it has no owners left, and finally drop one reference
+ * with inet_twsk_put().
+ */
+void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo)
+{
+ struct inet_bind_hashbucket *bhead;
+ struct inet_bind_bucket *tb;
+ /* Unlink from established hashes. */
+ struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash);
+
+ write_lock(&ehead->lock);
+ if (hlist_unhashed(&tw->tw_node)) {
+ write_unlock(&ehead->lock);
+ return;
+ }
+ __hlist_del(&tw->tw_node);
+ sk_node_init(&tw->tw_node);
+ write_unlock(&ehead->lock);
+
+ /* Disassociate with bind bucket. */
+ bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
+ spin_lock(&bhead->lock);
+ tb = tw->tw_tb;
+ __hlist_del(&tw->tw_bind_node);
+ tw->tw_tb = NULL;
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ spin_unlock(&bhead->lock);
+#ifdef SOCK_REFCNT_DEBUG
+ if (atomic_read(&tw->tw_refcnt) != 1) {
+ printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
+ tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
+ }
+#endif
+ inet_twsk_put(tw);
+}
+
+EXPORT_SYMBOL_GPL(__inet_twsk_kill);
+
+/*
+ * Enter the time wait state. This is called with locally disabled BH.
+ * Essentially we whip up a timewait bucket, copy the relevant info into it
+ * from the SK, and mess with hash chains and list linkage.
+ *
+ * The bind bucket lock and the established bucket lock are taken one after
+ * the other, never nested. An extra reference on 'tw' is taken once it has
+ * been added to the timewait chain.
+ */
+void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
+ struct inet_hashinfo *hashinfo)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
+ struct inet_bind_hashbucket *bhead;
+ /* Step 1: Put TW into bind hash. Original socket stays there too.
+ Note, that any socket with inet->num != 0 MUST be bound in
+ binding cache, even if it is closed.
+ */
+ bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)];
+ spin_lock(&bhead->lock);
+ tw->tw_tb = icsk->icsk_bind_hash;
+ BUG_TRAP(icsk->icsk_bind_hash);
+ inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
+ spin_unlock(&bhead->lock);
+
+ write_lock(&ehead->lock);
+
+ /* Step 2: Remove SK from established hash. */
+ if (__sk_del_node_init(sk))
+ sock_prot_dec_use(sk->sk_prot);
+
+ /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
+ inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain);
+ atomic_inc(&tw->tw_refcnt);
+
+ write_unlock(&ehead->lock);
+}
+
+EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
+
+struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
+{ /*
+ * Allocate a timewait socket from the creating protocol's twsk_slab
+ * (SLAB_ATOMIC) and copy the identity of 'sk' into it. The new object
+ * starts in TCP_TIME_WAIT with substate 'state' and a reference count of
+ * one, and a reference is taken on the protocol's owning module.
+ * Returns NULL if the allocation fails.
+ */
+ struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab,
+ SLAB_ATOMIC);
+ if (tw != NULL) {
+ const struct inet_sock *inet = inet_sk(sk);
+
+ /* Give us an identity. */
+ tw->tw_daddr = inet->daddr;
+ tw->tw_rcv_saddr = inet->rcv_saddr;
+ tw->tw_bound_dev_if = sk->sk_bound_dev_if;
+ tw->tw_num = inet->num;
+ tw->tw_state = TCP_TIME_WAIT;
+ tw->tw_substate = state;
+ tw->tw_sport = inet->sport;
+ tw->tw_dport = inet->dport;
+ tw->tw_family = sk->sk_family;
+ tw->tw_reuse = sk->sk_reuse;
+ tw->tw_hash = sk->sk_hash;
+ tw->tw_ipv6only = 0;
+ tw->tw_prot = sk->sk_prot_creator;
+ atomic_set(&tw->tw_refcnt, 1);
+ inet_twsk_dead_node_init(tw);
+ __module_get(tw->tw_prot->owner);
+ }
+
+ return tw;
+}
+
+EXPORT_SYMBOL_GPL(inet_twsk_alloc);
+
+/*
+ * Returns non-zero if quota exceeded.
+ *
+ * Kill the timewait sockets queued in twdr->cells[slot]. twdr->death_lock
+ * is released around each individual kill and re-taken afterwards, so the
+ * function is entered and left with that lock held. Each killed socket is
+ * subtracted from tw_count and added to the LINUX_MIB_TIMEWAITED counter.
+ */
+static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
+ const int slot)
+{
+ struct inet_timewait_sock *tw;
+ struct hlist_node *node;
+ unsigned int killed;
+ int ret;
+
+ /* NOTE: compare this to previous version where lock
+ * was released after detaching chain. It was racy,
+ * because tw buckets are scheduled in not serialized context
+ * in 2.3 (with netfilter), and with softnet it is common, because
+ * soft irqs are not sequenced.
+ */
+ killed = 0;
+ ret = 0;
+rescan:
+ inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
+ __inet_twsk_del_dead_node(tw);
+ spin_unlock(&twdr->death_lock);
+ __inet_twsk_kill(tw, twdr->hashinfo);
+ inet_twsk_put(tw);
+ killed++;
+ spin_lock(&twdr->death_lock);
+ if (killed > INET_TWDR_TWKILL_QUOTA) {
+ ret = 1;
+ break;
+ }
+
+ /* While we dropped twdr->death_lock, another cpu may have
+ * killed off the next TW bucket in the list, therefore
+ * do a fresh re-read of the hlist head node with the
+ * lock reacquired. We still use the hlist traversal
+ * macro in order to get the prefetches.
+ */
+ goto rescan;
+ }
+
+ twdr->tw_count -= killed;
+ NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
+
+ return ret;
+}
+
+void inet_twdr_hangman(unsigned long data)
+{ /*
+ * Timer handler for the slow timewait wheel; 'data' is the
+ * inet_timewait_death_row. Purges the current slot under death_lock.
+ * If the kill quota was exceeded, the slot is flagged in thread_slots
+ * and the rest is handed to twkill_work. The wheel then advances by one
+ * slot, and the timer is re-armed one period ahead while timewait
+ * sockets remain.
+ */
+ struct inet_timewait_death_row *twdr;
+ int unsigned need_timer;
+
+ twdr = (struct inet_timewait_death_row *)data;
+ spin_lock(&twdr->death_lock);
+
+ if (twdr->tw_count == 0)
+ goto out;
+
+ need_timer = 0;
+ if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
+ twdr->thread_slots |= (1 << twdr->slot);
+ mb();
+ schedule_work(&twdr->twkill_work);
+ need_timer = 1;
+ } else {
+ /* We purged the entire slot, anything left? */
+ if (twdr->tw_count)
+ need_timer = 1;
+ }
+ twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
+ if (need_timer)
+ mod_timer(&twdr->tw_timer, jiffies + twdr->period);
+out:
+ spin_unlock(&twdr->death_lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_twdr_hangman);
+
+extern void twkill_slots_invalid(void);
+
+void inet_twdr_twkill_work(void *data)
+{ /*
+ * Deferred-work counterpart of inet_twdr_hangman(); 'data' is the
+ * inet_timewait_death_row. For every slot flagged in thread_slots it
+ * keeps calling inet_twdr_do_twkill_work() until the slot is drained,
+ * dropping death_lock to reschedule when need_resched() says so, and
+ * then clears the slot's flag.
+ */
+ struct inet_timewait_death_row *twdr = data;
+ int i;
+
+ if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8))
+ twkill_slots_invalid(); /* NOTE(review): presumably left undefined so the build breaks if the bitmask is too narrow - confirm */
+
+ while (twdr->thread_slots) {
+ spin_lock_bh(&twdr->death_lock);
+ for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
+ if (!(twdr->thread_slots & (1 << i)))
+ continue;
+
+ while (inet_twdr_do_twkill_work(twdr, i) != 0) {
+ if (need_resched()) {
+ spin_unlock_bh(&twdr->death_lock);
+ schedule();
+ spin_lock_bh(&twdr->death_lock);
+ }
+ }
+
+ twdr->thread_slots &= ~(1 << i);
+ }
+ spin_unlock_bh(&twdr->death_lock);
+ }
+}
+
+EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
+
+/* These are always called from BH context. See callers in
+ * tcp_input.c to verify this.
+ */
+
+/*
+ * This is for handling early-kills of TIME_WAIT sockets.
+ *
+ * If 'tw' is still queued on a death list it is removed under death_lock,
+ * the reference held for that queueing is dropped, and tw_timer is stopped
+ * when this was the last queued timewait socket. __inet_twsk_kill() is
+ * then called unconditionally, after death_lock has been released.
+ */
+void inet_twsk_deschedule(struct inet_timewait_sock *tw,
+ struct inet_timewait_death_row *twdr)
+{
+ spin_lock(&twdr->death_lock);
+ if (inet_twsk_del_dead_node(tw)) {
+ inet_twsk_put(tw);
+ if (--twdr->tw_count == 0)
+ del_timer(&twdr->tw_timer);
+ }
+ spin_unlock(&twdr->death_lock);
+ __inet_twsk_kill(tw, twdr->hashinfo);
+}
+
+EXPORT_SYMBOL(inet_twsk_deschedule);
+
+void inet_twsk_schedule(struct inet_timewait_sock *tw,
+ struct inet_timewait_death_row *twdr,
+ const int timeo, const int timewait_len)
+{ /*
+ * Queue 'tw' to be killed after roughly 'timeo' jiffies, re-queueing it
+ * if it was already scheduled. Short timeouts (fewer than
+ * INET_TWDR_RECYCLE_SLOTS recycle ticks) go on the fine-grained
+ * twcal_row wheel driven by twcal_timer; everything else goes on the
+ * coarse cells wheel driven by tw_timer. Runs under death_lock.
+ */
+ struct hlist_head *list;
+ int slot;
+
+ /* timeout := RTO * 3.5
+ *
+ * 3.5 = 1+2+0.5 to wait for two retransmits.
+ *
+ * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
+ * our ACK acking that FIN can be lost. If N subsequent retransmitted
+ * FINs (or previous segments) are lost (probability of such event
+ * is p^(N+1), where p is probability to lose single packet and
+ * time to detect the loss is about RTO*(2^N - 1) with exponential
+ * backoff). Normal timewait length is calculated so, that we
+ * waited at least for one retransmitted FIN (maximal RTO is 120sec).
+ * [ BTW Linux, following BSD, violates this requirement waiting
+ * only for 60sec, we should wait at least for 240 secs.
+ * Well, 240 consumes too much of resources 8)
+ * ]
+ * This interval is not reduced to catch old duplicate and
+ * responses to our wandering segments living for two MSLs.
+ * However, if we use PAWS to detect
+ * old duplicates, we can reduce the interval to bounds required
+ * by RTO, rather than MSL. So, if peer understands PAWS, we
+ * kill tw bucket after 3.5*RTO (it is important that this number
+ * is greater than TS tick!) and detect old duplicates with help
+ * of PAWS.
+ */
+ slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK; /* timeo in recycle ticks, rounded up */
+
+ spin_lock(&twdr->death_lock);
+
+ /* Unlink it, if it was scheduled */
+ if (inet_twsk_del_dead_node(tw))
+ twdr->tw_count--;
+ else
+ atomic_inc(&tw->tw_refcnt); /* first time on a death list: the list holds its own reference */
+
+ if (slot >= INET_TWDR_RECYCLE_SLOTS) {
+ /* Schedule to slow timer */
+ if (timeo >= timewait_len) {
+ slot = INET_TWDR_TWKILL_SLOTS - 1;
+ } else {
+ slot = (timeo + twdr->period - 1) / twdr->period;
+ if (slot >= INET_TWDR_TWKILL_SLOTS)
+ slot = INET_TWDR_TWKILL_SLOTS - 1;
+ }
+ tw->tw_ttd = jiffies + timeo;
+ slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
+ list = &twdr->cells[slot];
+ } else {
+ tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
+
+ if (twdr->twcal_hand < 0) { /* negative hand: recycle wheel is idle, start it from slot 0 at the current time */
+ twdr->twcal_hand = 0;
+ twdr->twcal_jiffie = jiffies;
+ twdr->twcal_timer.expires = twdr->twcal_jiffie +
+ (slot << INET_TWDR_RECYCLE_TICK);
+ add_timer(&twdr->twcal_timer);
+ } else {
+ if (time_after(twdr->twcal_timer.expires,
+ jiffies + (slot << INET_TWDR_RECYCLE_TICK))) /* timer would fire too late for this entry: pull it in */
+ mod_timer(&twdr->twcal_timer,
+ jiffies + (slot << INET_TWDR_RECYCLE_TICK));
+ slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
+ }
+ list = &twdr->twcal_row[slot];
+ }
+
+ hlist_add_head(&tw->tw_death_node, list);
+
+ if (twdr->tw_count++ == 0)
+ mod_timer(&twdr->tw_timer, jiffies + twdr->period);
+ spin_unlock(&twdr->death_lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_twsk_schedule);
+
+void inet_twdr_twcal_tick(unsigned long data)
+{ /*
+ * Timer handler for the fine-grained recycle wheel; 'data' is the
+ * inet_timewait_death_row. Starting at twcal_hand, every slot whose
+ * time is not after 'now' has all of its timewait sockets killed. The
+ * first slot that is still in the future becomes the new hand, and the
+ * timer is re-armed for the first future slot that is not empty. If no
+ * such slot exists the wheel is marked idle (twcal_hand = -1).
+ * death_lock is held for the whole walk, including the kills.
+ */
+ struct inet_timewait_death_row *twdr;
+ int n, slot;
+ unsigned long j;
+ unsigned long now = jiffies;
+ int killed = 0;
+ int adv = 0;
+
+ twdr = (struct inet_timewait_death_row *)data;
+
+ spin_lock(&twdr->death_lock);
+ if (twdr->twcal_hand < 0)
+ goto out;
+
+ slot = twdr->twcal_hand;
+ j = twdr->twcal_jiffie;
+
+ for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
+ if (time_before_eq(j, now)) {
+ struct hlist_node *node, *safe;
+ struct inet_timewait_sock *tw;
+
+ inet_twsk_for_each_inmate_safe(tw, node, safe,
+ &twdr->twcal_row[slot]) {
+ __inet_twsk_del_dead_node(tw);
+ __inet_twsk_kill(tw, twdr->hashinfo);
+ inet_twsk_put(tw);
+ killed++;
+ }
+ } else {
+ if (!adv) { /* first slot still in the future: it becomes the new hand */
+ adv = 1;
+ twdr->twcal_jiffie = j;
+ twdr->twcal_hand = slot;
+ }
+
+ if (!hlist_empty(&twdr->twcal_row[slot])) {
+ mod_timer(&twdr->twcal_timer, j);
+ goto out;
+ }
+ }
+ j += 1 << INET_TWDR_RECYCLE_TICK;
+ slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
+ }
+ twdr->twcal_hand = -1;
+
+out:
+ if ((twdr->tw_count -= killed) == 0)
+ del_timer(&twdr->tw_timer);
+ NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
+ spin_unlock(&twdr->death_lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 95473953c40..2fc3fd38924 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -20,6 +20,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/net.h>
+#include <net/ip.h>
#include <net/inetpeer.h>
/*
@@ -72,7 +73,7 @@
/* Exported for inet_getid inline function. */
DEFINE_SPINLOCK(inet_peer_idlock);
-static kmem_cache_t *peer_cachep;
+static kmem_cache_t *peer_cachep __read_mostly;
#define node_height(x) x->avl_height
static struct inet_peer peer_fake_node = {
@@ -99,8 +100,7 @@ DEFINE_SPINLOCK(inet_peer_unused_lock);
#define PEER_MAX_CLEANUP_WORK 30
static void peer_check_expire(unsigned long dummy);
-static struct timer_list peer_periodic_timer =
- TIMER_INITIALIZER(peer_check_expire, 0, 0);
+static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
/* Exported for sysctl_net_ipv4. */
int inet_peer_gc_mintime = 10 * HZ,
@@ -450,11 +450,12 @@ static void peer_check_expire(unsigned long dummy)
/* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
* interval depending on the total number of entries (more entries,
* less interval). */
- peer_periodic_timer.expires = jiffies
- + inet_peer_gc_maxtime
- - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
- peer_total / inet_peer_threshold * HZ;
+ if (peer_total >= inet_peer_threshold)
+ peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
+ else
+ peer_periodic_timer.expires = jiffies
+ + inet_peer_gc_maxtime
+ - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
+ peer_total / inet_peer_threshold * HZ;
add_timer(&peer_periodic_timer);
}
-
-EXPORT_SYMBOL(inet_peer_idlock);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 77094aac6c2..0923add122b 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -76,16 +76,12 @@ int ip_forward(struct sk_buff *skb)
* that reaches zero, we must reply an ICMP control message telling
* that the packet's lifetime expired.
*/
-
- iph = skb->nh.iph;
-
- if (iph->ttl <= 1)
+ if (skb->nh.iph->ttl <= 1)
goto too_many_hops;
if (!xfrm4_route_forward(skb))
goto drop;
- iph = skb->nh.iph;
rt = (struct rtable*)skb->dst;
if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 7f68e27eb4e..e7d26d9943c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
return ip_frag_intern(hash, qp);
out_nomem:
- NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n"));
+ LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
return NULL;
}
@@ -457,7 +457,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (pskb_pull(skb, ihl) == NULL)
goto err;
- if (pskb_trim(skb, end-offset))
+ if (pskb_trim_rcsum(skb, end-offset))
goto err;
/* Find out which fragments are in front and at the back of us
@@ -533,7 +533,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (skb->dev)
qp->iif = skb->dev->ifindex;
skb->dev = NULL;
- qp->stamp = skb->stamp;
+ skb_get_timestamp(skb, &qp->stamp);
qp->meat += skb->len;
atomic_add(skb->truesize, &ip_frag_mem);
if (offset == 0)
@@ -615,7 +615,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
head->next = NULL;
head->dev = dev;
- head->stamp = qp->stamp;
+ skb_set_timestamp(head, &qp->stamp);
iph = head->nh.iph;
iph->frag_off = 0;
@@ -625,10 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
return head;
out_nomem:
- NETDEBUG(if (net_ratelimit())
- printk(KERN_ERR
- "IP: queue_glue: no memory for gluing queue %p\n",
- qp));
+ LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
+ "queue %p\n", qp);
goto out_fail;
out_oversize:
if (net_ratelimit())
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f0d5740d7e2..896ce3f8f53 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1104,10 +1104,10 @@ static int ipgre_open(struct net_device *dev)
return -EADDRNOTAVAIL;
dev = rt->u.dst.dev;
ip_rt_put(rt);
- if (__in_dev_get(dev) == NULL)
+ if (__in_dev_get_rtnl(dev) == NULL)
return -EADDRNOTAVAIL;
t->mlink = dev->ifindex;
- ip_mc_inc_group(__in_dev_get(dev), t->parms.iph.daddr);
+ ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
}
return 0;
}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index c703528e0bc..473d0f2b2e0 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -150,7 +150,7 @@
* SNMP management statistics
*/
-DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics);
+DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly;
/*
* Process Router Attention IP option
@@ -225,8 +225,8 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
/* If there maybe a raw socket we must check - if not we
* don't care less
*/
- if (raw_sk)
- raw_v4_input(skb, skb->nh.iph, hash);
+ if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash))
+ raw_sk = NULL;
if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
int ret;
@@ -279,18 +279,70 @@ int ip_local_deliver(struct sk_buff *skb)
ip_local_deliver_finish);
}
-static inline int ip_rcv_finish(struct sk_buff *skb)
+static inline int ip_rcv_options(struct sk_buff *skb)
{
+ struct ip_options *opt;
+ struct iphdr *iph;
struct net_device *dev = skb->dev;
+
+ /* It looks as overkill, because not all
+ IP options require packet mangling.
+ But it is the easiest for now, especially taking
+ into account that combination of IP options
+ and running sniffer is extremely rare condition.
+ --ANK (980813)
+ */
+ if (skb_cow(skb, skb_headroom(skb))) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+
+ iph = skb->nh.iph;
+
+ if (ip_options_compile(NULL, skb)) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
+ goto drop;
+ }
+
+ opt = &(IPCB(skb)->opt);
+ if (unlikely(opt->srr)) {
+ struct in_device *in_dev = in_dev_get(dev);
+ if (in_dev) {
+ if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
+ if (IN_DEV_LOG_MARTIANS(in_dev) &&
+ net_ratelimit())
+ printk(KERN_INFO "source route option "
+ "%u.%u.%u.%u -> %u.%u.%u.%u\n",
+ NIPQUAD(iph->saddr),
+ NIPQUAD(iph->daddr));
+ in_dev_put(in_dev);
+ goto drop;
+ }
+
+ in_dev_put(in_dev);
+ }
+
+ if (ip_options_rcv_srr(skb))
+ goto drop;
+ }
+
+ return 0;
+drop:
+ return -1;
+}
+
+static inline int ip_rcv_finish(struct sk_buff *skb)
+{
struct iphdr *iph = skb->nh.iph;
- int err;
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
- if (skb->dst == NULL) {
- if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
+ if (likely(skb->dst == NULL)) {
+ int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
+ skb->dev);
+ if (unlikely(err)) {
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
goto drop;
@@ -298,7 +350,7 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
}
#ifdef CONFIG_NET_CLS_ROUTE
- if (skb->dst->tclassid) {
+ if (unlikely(skb->dst->tclassid)) {
struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
u32 idx = skb->dst->tclassid;
st[idx&0xFF].o_packets++;
@@ -308,48 +360,11 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
}
#endif
- if (iph->ihl > 5) {
- struct ip_options *opt;
-
- /* It looks as overkill, because not all
- IP options require packet mangling.
- But it is the easiest for now, especially taking
- into account that combination of IP options
- and running sniffer is extremely rare condition.
- --ANK (980813)
- */
-
- if (skb_cow(skb, skb_headroom(skb))) {
- IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
- goto drop;
- }
- iph = skb->nh.iph;
-
- if (ip_options_compile(NULL, skb))
- goto inhdr_error;
-
- opt = &(IPCB(skb)->opt);
- if (opt->srr) {
- struct in_device *in_dev = in_dev_get(dev);
- if (in_dev) {
- if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
- if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
- printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n",
- NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
- in_dev_put(in_dev);
- goto drop;
- }
- in_dev_put(in_dev);
- }
- if (ip_options_rcv_srr(skb))
- goto drop;
- }
- }
+ if (iph->ihl > 5 && ip_rcv_options(skb))
+ goto drop;
return dst_input(skb);
-inhdr_error:
- IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
return NET_RX_DROP;
@@ -358,9 +373,10 @@ drop:
/*
* Main IP Receive routine.
*/
-int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct iphdr *iph;
+ u32 len;
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
@@ -392,29 +408,27 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
*/
if (iph->ihl < 5 || iph->version != 4)
- goto inhdr_error;
+ goto inhdr_error;
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
iph = skb->nh.iph;
- if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
- goto inhdr_error;
+ if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ goto inhdr_error;
- {
- __u32 len = ntohs(iph->tot_len);
- if (skb->len < len || len < (iph->ihl<<2))
- goto inhdr_error;
+ len = ntohs(iph->tot_len);
+ if (skb->len < len || len < (iph->ihl*4))
+ goto inhdr_error;
- /* Our transport medium may have padded the buffer out. Now we know it
- * is IP we can trim to the true length of the frame.
- * Note this now means skb->len holds ntohs(iph->tot_len).
- */
- if (pskb_trim_rcsum(skb, len)) {
- IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
- goto drop;
- }
+ /* Our transport medium may have padded the buffer out. Now we know it
+ * is IP we can trim to the true length of the frame.
+ * Note this now means skb->len holds ntohs(iph->tot_len).
+ */
+ if (pskb_trim_rcsum(skb, len)) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+ goto drop;
}
return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
@@ -428,5 +442,4 @@ out:
return NET_RX_DROP;
}
-EXPORT_SYMBOL(ip_rcv);
EXPORT_SYMBOL(ip_statistics);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 6d89f3f3e70..bce4e875193 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -489,23 +489,18 @@ void ip_options_undo(struct ip_options * opt)
}
}
-int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user)
+static struct ip_options *ip_options_get_alloc(const int optlen)
{
- struct ip_options *opt;
+ struct ip_options *opt = kmalloc(sizeof(*opt) + ((optlen + 3) & ~3),
+ GFP_KERNEL);
+ if (opt)
+ memset(opt, 0, sizeof(*opt));
+ return opt;
+}
- opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL);
- if (!opt)
- return -ENOMEM;
- memset(opt, 0, sizeof(struct ip_options));
- if (optlen) {
- if (user) {
- if (copy_from_user(opt->__data, data, optlen)) {
- kfree(opt);
- return -EFAULT;
- }
- } else
- memcpy(opt->__data, data, optlen);
- }
+static int ip_options_get_finish(struct ip_options **optp,
+ struct ip_options *opt, int optlen)
+{
while (optlen & 3)
opt->__data[optlen++] = IPOPT_END;
opt->optlen = optlen;
@@ -521,6 +516,30 @@ int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, in
return 0;
}
+/*
+ * Build an ip_options structure from optlen bytes of option data located
+ * in user space.
+ *
+ * Returns -ENOMEM if ip_options_get_alloc() fails, -EFAULT if
+ * copy_from_user() fails (the allocation is freed first), otherwise
+ * whatever ip_options_get_finish() returns.
+ */
+int ip_options_get_from_user(struct ip_options **optp, unsigned char __user *data, int optlen)
+{
+ struct ip_options *opt = ip_options_get_alloc(optlen);
+
+ if (!opt)
+ return -ENOMEM;
+ /* optlen == 0 is valid: nothing to copy, data is not touched. */
+ if (optlen && copy_from_user(opt->__data, data, optlen)) {
+ kfree(opt);
+ return -EFAULT;
+ }
+ return ip_options_get_finish(optp, opt, optlen);
+}
+
+/*
+ * Kernel-memory counterpart of ip_options_get_from_user(): build an
+ * ip_options structure from optlen bytes at data using memcpy().
+ *
+ * Returns -ENOMEM if ip_options_get_alloc() fails, otherwise whatever
+ * ip_options_get_finish() returns.
+ */
+int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen)
+{
+ struct ip_options *opt = ip_options_get_alloc(optlen);
+
+ if (!opt)
+ return -ENOMEM;
+ /* optlen == 0 is valid: nothing to copy, data is not touched. */
+ if (optlen)
+ memcpy(opt->__data, data, optlen);
+ return ip_options_get_finish(optp, opt, optlen);
+}
+
void ip_forward_options(struct sk_buff *skb)
{
struct ip_options * opt = &(IPCB(skb)->opt);
@@ -620,6 +639,3 @@ int ip_options_rcv_srr(struct sk_buff *skb)
}
return 0;
}
-
-EXPORT_SYMBOL(ip_options_compile);
-EXPORT_SYMBOL(ip_options_undo);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 80d13103b2b..17758234a3e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -69,13 +69,10 @@
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
-#include <net/tcp.h>
-#include <net/udp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
-#include <net/raw.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <net/checksum.h>
@@ -84,12 +81,8 @@
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
+#include <linux/tcp.h>
-/*
- * Shall we try to damage output packets if routing dev changes?
- */
-
-int sysctl_ip_dynaddr;
int sysctl_ip_default_ttl = IPDEFTTL;
/* Generate a checksum for an outgoing IP datagram. */
@@ -165,6 +158,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
dst_output);
}
+EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
+
static inline int ip_finish_output2(struct sk_buff *skb)
{
struct dst_entry *dst = skb->dst;
@@ -205,7 +200,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
return -EINVAL;
}
-int ip_finish_output(struct sk_buff *skb)
+static inline int ip_finish_output(struct sk_buff *skb)
{
struct net_device *dev = skb->dst->dev;
@@ -280,7 +275,8 @@ int ip_output(struct sk_buff *skb)
{
IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
- if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
+ if (skb->len > dst_mtu(skb->dst) &&
+ !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
return ip_fragment(skb, ip_finish_output);
else
return ip_finish_output(skb);
@@ -329,8 +325,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
if (ip_route_output_flow(&rt, &fl, sk, 0))
goto no_route;
}
- __sk_dst_set(sk, &rt->u.dst);
- tcp_v4_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->u.dst);
}
skb->dst = dst_clone(&rt->u.dst);
@@ -392,12 +387,14 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
#endif
#ifdef CONFIG_NETFILTER
to->nfmark = from->nfmark;
- to->nfcache = from->nfcache;
/* Connection association is same as pre-frag packet */
nf_conntrack_put(to->nfct);
to->nfct = from->nfct;
nf_conntrack_get(to->nfct);
to->nfctinfo = from->nfctinfo;
+#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+ to->ipvs_property = from->ipvs_property;
+#endif
#ifdef CONFIG_BRIDGE_NETFILTER
nf_bridge_put(to->nf_bridge);
to->nf_bridge = from->nf_bridge;
@@ -580,7 +577,7 @@ slow_path:
*/
if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
- NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
+ NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
err = -ENOMEM;
goto fail;
}
@@ -692,6 +689,60 @@ csum_page(struct page *page, int offset, int copy)
return csum;
}
+/*
+ * Append payload for a device that does UDP fragmentation offload: the
+ * whole datagram is kept in one skb, with the data attached as page
+ * fragments by skb_append_datato_frags().
+ *
+ * If the socket write queue is empty a new skb is allocated, its
+ * hardware/IP/transport header space is set up, and it is put on the
+ * queue once the data has been attached.  If the queue already has a
+ * tail skb, the data is appended to that skb, which is already queued
+ * and therefore must be neither queued again nor freed here.
+ *
+ * Returns 0 on success or a negative errno from sock_alloc_send_skb()
+ * or skb_append_datato_frags().
+ */
+inline int ip_ufo_append_data(struct sock *sk,
+			int getfrag(void *from, char *to, int offset, int len,
+			       int odd, struct sk_buff *skb),
+			void *from, int length, int hh_len, int fragheaderlen,
+			int transhdrlen, int mtu,unsigned int flags)
+{
+	struct sk_buff *skb;
+	int already_queued;
+	int err;
+
+	/* There is support for UDP fragmentation offload by network
+	 * device, so create one single skb packet containing complete
+	 * udp datagram
+	 */
+	skb = skb_peek_tail(&sk->sk_write_queue);
+	already_queued = (skb != NULL);
+	if (!already_queued) {
+		skb = sock_alloc_send_skb(sk,
+			hh_len + fragheaderlen + transhdrlen + 20,
+			(flags & MSG_DONTWAIT), &err);
+
+		if (skb == NULL)
+			return err;
+
+		/* reserve space for Hardware header */
+		skb_reserve(skb, hh_len);
+
+		/* create space for UDP/IP header */
+		skb_put(skb,fragheaderlen + transhdrlen);
+
+		/* initialize network header pointer */
+		skb->nh.raw = skb->data;
+
+		/* initialize protocol header pointer */
+		skb->h.raw = skb->data + fragheaderlen;
+
+		skb->ip_summed = CHECKSUM_HW;
+		skb->csum = 0;
+		sk->sk_sndmsg_off = 0;
+	}
+
+	err = skb_append_datato_frags(sk,skb, getfrag, from,
+			       (length - transhdrlen));
+	if (err) {
+		/* There is not enough support to do UFO.  Only an skb
+		 * allocated above is ours to free; a peeked skb is still
+		 * linked on sk_write_queue and freeing it would leave a
+		 * dangling entry in the queue.
+		 */
+		if (!already_queued)
+			kfree_skb(skb);
+		return err;
+	}
+
+	/* specify the length of each IP datagram fragment*/
+	skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
+
+	/* A peeked skb is already on the queue; linking it a second
+	 * time would corrupt the list.
+	 */
+	if (!already_queued)
+		__skb_queue_tail(&sk->sk_write_queue, skb);
+
+	return 0;
+}
+
/*
* ip_append_data() and ip_append_page() can make one large IP datagram
* from many pieces of data. Each pieces will be holded on the socket
@@ -781,6 +832,15 @@ int ip_append_data(struct sock *sk,
csummode = CHECKSUM_HW;
inet->cork.length += length;
+ if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
+ (rt->u.dst.dev->features & NETIF_F_UFO)) {
+
+ if(ip_ufo_append_data(sk, getfrag, from, length, hh_len,
+ fragheaderlen, transhdrlen, mtu, flags))
+ goto error;
+
+ return 0;
+ }
/* So, what's going on in the loop below?
*
@@ -1012,14 +1072,23 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
return -EINVAL;
inet->cork.length += size;
+ if ((sk->sk_protocol == IPPROTO_UDP) &&
+ (rt->u.dst.dev->features & NETIF_F_UFO))
+ skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
+
while (size > 0) {
int i;
- /* Check if the remaining data fits into current packet. */
- len = mtu - skb->len;
- if (len < size)
- len = maxfraglen - skb->len;
+ if (skb_shinfo(skb)->ufo_size)
+ len = size;
+ else {
+
+ /* Check if the remaining data fits into current packet. */
+ len = mtu - skb->len;
+ if (len < size)
+ len = maxfraglen - skb->len;
+ }
if (len <= 0) {
struct sk_buff *skb_prev;
char *data;
@@ -1027,10 +1096,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
int alloclen;
skb_prev = skb;
- if (skb_prev)
- fraggap = skb_prev->len - maxfraglen;
- else
- fraggap = 0;
+ fraggap = skb_prev->len - maxfraglen;
alloclen = fragheaderlen + hh_len + fraggap + 15;
skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
@@ -1329,12 +1395,7 @@ void __init ip_init(void)
#endif
}
-EXPORT_SYMBOL(ip_finish_output);
EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);
-
-#ifdef CONFIG_SYSCTL
-EXPORT_SYMBOL(sysctl_ip_default_ttl);
-#endif
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index fc7c481d0d7..2f0b47da5b3 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -153,7 +153,7 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
switch (cmsg->cmsg_type) {
case IP_RETOPTS:
err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
- err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0);
+ err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40);
if (err)
return err;
break;
@@ -425,7 +425,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
struct ip_options * opt = NULL;
if (optlen > 40 || optlen < 0)
goto e_inval;
- err = ip_options_get(&opt, optval, optlen, 1);
+ err = ip_options_get_from_user(&opt, optval, optlen);
if (err)
break;
if (sk->sk_type == SOCK_STREAM) {
@@ -614,7 +614,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
}
case IP_MSFILTER:
{
- extern int sysctl_optmem_max;
extern int sysctl_igmp_max_msf;
struct ip_msfilter *msf;
@@ -769,7 +768,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
}
case MCAST_MSFILTER:
{
- extern int sysctl_optmem_max;
extern int sysctl_igmp_max_msf;
struct sockaddr_in *psin;
struct ip_msfilter *msf = NULL;
@@ -848,6 +846,9 @@ mc_msf_out:
case IP_IPSEC_POLICY:
case IP_XFRM_POLICY:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ break;
err = xfrm_user_policy(sk, optname, optval, optlen);
break;
@@ -1087,7 +1088,5 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
EXPORT_SYMBOL(ip_cmsg_recv);
-#ifdef CONFIG_IP_SCTP_MODULE
EXPORT_SYMBOL(ip_getsockopt);
EXPORT_SYMBOL(ip_setsockopt);
-#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 2065944fd9e..fc718df17b4 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -214,8 +214,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
spi, IPPROTO_COMP, AF_INET);
if (!x)
return;
- NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
- spi, NIPQUAD(iph->daddr)));
+ NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
+ spi, NIPQUAD(iph->daddr));
xfrm_state_put(x);
}
@@ -345,8 +345,7 @@ static void ipcomp_free_tfms(struct crypto_tfm **tfms)
for_each_cpu(cpu) {
struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu);
- if (tfm)
- crypto_free_tfm(tfm);
+ crypto_free_tfm(tfm);
}
free_percpu(tfms);
}
@@ -358,7 +357,7 @@ static struct crypto_tfm **ipcomp_alloc_tfms(const char *alg_name)
int cpu;
/* This can be any valid CPU ID so we don't need locking. */
- cpu = smp_processor_id();
+ cpu = raw_smp_processor_id();
list_for_each_entry(pos, &ipcomp_tfms_list, list) {
struct crypto_tfm *tfm;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index d2bf8e1930a..e8674baaa8d 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -54,6 +54,7 @@
#include <linux/major.h>
#include <linux/root_dev.h>
#include <linux/delay.h>
+#include <linux/nfs_fs.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/ipconfig.h>
@@ -393,7 +394,7 @@ static int __init ic_defaults(void)
#ifdef IPCONFIG_RARP
-static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
+static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
static struct packet_type rarp_packet_type __initdata = {
.type = __constant_htons(ETH_P_RARP),
@@ -414,7 +415,7 @@ static inline void ic_rarp_cleanup(void)
* Process received RARP packet.
*/
static int __init
-ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct arphdr *rarp;
unsigned char *rarp_ptr;
@@ -555,7 +556,7 @@ struct bootp_pkt { /* BOOTP packet format */
#define DHCPRELEASE 7
#define DHCPINFORM 8
-static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
+static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
static struct packet_type bootp_packet_type __initdata = {
.type = __constant_htons(ETH_P_IP),
@@ -823,7 +824,7 @@ static void __init ic_do_bootp_ext(u8 *ext)
/*
* Receive BOOTP reply.
*/
-static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct bootp_pkt *b;
struct iphdr *h;
@@ -1102,10 +1103,8 @@ static int __init ic_dynamic(void)
#endif
jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout);
- while (time_before(jiffies, jiff) && !ic_got_reply) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(1);
- }
+ while (time_before(jiffies, jiff) && !ic_got_reply)
+ schedule_timeout_uninterruptible(1);
#ifdef IPCONFIG_DHCP
/* DHCP isn't done until we get a DHCPACK. */
if ((ic_got_reply & IC_BOOTP)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index dc806b57842..302b7eb507c 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -103,7 +103,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
In this case data path is free of exclusive locks at all.
*/
-static kmem_cache_t *mrt_cachep;
+static kmem_cache_t *mrt_cachep __read_mostly;
static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
@@ -149,7 +149,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v)
if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
dev->flags |= IFF_MULTICAST;
- in_dev = __in_dev_get(dev);
+ in_dev = __in_dev_get_rtnl(dev);
if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
goto failure;
in_dev->cnf.rp_filter = 0;
@@ -278,7 +278,7 @@ static int vif_delete(int vifi)
dev_set_allmulti(dev, -1);
- if ((in_dev = __in_dev_get(dev)) != NULL) {
+ if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
in_dev->cnf.mc_forwarding--;
ip_rt_multicast_event(in_dev);
}
@@ -421,7 +421,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock)
return -EINVAL;
}
- if ((in_dev = __in_dev_get(dev)) == NULL)
+ if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
return -EADDRNOTAVAIL;
in_dev->cnf.mc_forwarding++;
dev_set_allmulti(dev, +1);
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index d9212addd19..fc6f95aaa96 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -26,6 +26,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <net/protocol.h>
+#include <net/tcp.h>
#include <asm/system.h>
#include <linux/stat.h>
#include <linux/proc_fs.h>
@@ -603,7 +604,7 @@ static struct file_operations ip_vs_app_fops = {
/*
* Replace a segment of data with a new segment
*/
-int ip_vs_skb_replace(struct sk_buff *skb, int pri,
+int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
char *o_buf, int o_len, char *n_buf, int n_len)
{
struct iphdr *iph;
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index d0145a8b155..f828fa2eb7d 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -40,7 +40,7 @@
static struct list_head *ip_vs_conn_tab;
/* SLAB cache for IPVS connections */
-static kmem_cache_t *ip_vs_conn_cachep;
+static kmem_cache_t *ip_vs_conn_cachep __read_mostly;
/* counter for current IPVS connections */
static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
@@ -196,6 +196,7 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
if (s_addr==cp->caddr && s_port==cp->cport &&
d_port==cp->vport && d_addr==cp->vaddr &&
+ ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
protocol==cp->protocol) {
/* HIT */
atomic_inc(&cp->refcnt);
@@ -227,6 +228,40 @@ struct ip_vs_conn *ip_vs_conn_in_get
return cp;
}
+/*
+ * Get reference to connection template.
+ *
+ * Walks the ip_vs_conn_tab bucket selected by
+ * ip_vs_conn_hashkey(protocol, s_addr, s_port) under ct_read_lock() and
+ * returns the first entry whose addresses, ports and protocol match and
+ * which has IP_VS_CONN_F_TEMPLATE set.  On a hit cp->refcnt is
+ * incremented before the lock is dropped.  Returns NULL when nothing
+ * matches.
+ */
+struct ip_vs_conn *ip_vs_ct_in_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+ unsigned hash;
+ struct ip_vs_conn *cp;
+
+ hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
+
+ ct_read_lock(hash);
+
+ list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ /* '&' binds tighter than '&&', so the flags term below is the
+ * template-bit test on its own. */
+ if (s_addr==cp->caddr && s_port==cp->cport &&
+ d_port==cp->vport && d_addr==cp->vaddr &&
+ cp->flags & IP_VS_CONN_F_TEMPLATE &&
+ protocol==cp->protocol) {
+ /* HIT */
+ atomic_inc(&cp->refcnt);
+ goto out;
+ }
+ }
+ /* Loop ran to completion: cp no longer points at a real entry. */
+ cp = NULL;
+
+ out:
+ ct_read_unlock(hash);
+
+ IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+ ip_vs_proto_name(protocol),
+ NIPQUAD(s_addr), ntohs(s_port),
+ NIPQUAD(d_addr), ntohs(d_port),
+ cp?"hit":"not hit");
+
+ return cp;
+}
/*
* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
@@ -367,7 +402,7 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
atomic_read(&dest->refcnt));
/* Update the connection counters */
- if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
/* It is a normal connection, so increase the inactive
connection counter because it is in TCP SYNRECV
state (inactive) or other protocol inacive state */
@@ -406,7 +441,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
atomic_read(&dest->refcnt));
/* Update the connection counters */
- if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
/* It is a normal connection, so decrease the inactconns
or activeconns counter */
if (cp->flags & IP_VS_CONN_F_INACTIVE) {
@@ -467,7 +502,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
/*
* Invalidate the connection template
*/
- if (ct->cport) {
+ if (ct->vport != 65535) {
if (ip_vs_conn_unhash(ct)) {
ct->dport = 65535;
ct->vport = 65535;
@@ -776,7 +811,7 @@ void ip_vs_random_dropentry(void)
ct_write_lock_bh(hash);
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
/* connection template */
continue;
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 5fb257dd07c..981cc3244ef 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -22,6 +22,7 @@
*
* Changes:
* Paul `Rusty' Russell properly handle non-linear skbs
+ * Harald Welte don't use nfcache
*
*/
@@ -242,10 +243,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
if (ports[1] == svc->port) {
/* Check if a template already exists */
if (svc->port != FTPPORT)
- ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+ ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
iph->daddr, ports[1]);
else
- ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+ ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
iph->daddr, 0);
if (!ct || !ip_vs_check_template(ct)) {
@@ -271,14 +272,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
iph->daddr,
ports[1],
dest->addr, dest->port,
- 0,
+ IP_VS_CONN_F_TEMPLATE,
dest);
else
ct = ip_vs_conn_new(iph->protocol,
snet, 0,
iph->daddr, 0,
dest->addr, 0,
- 0,
+ IP_VS_CONN_F_TEMPLATE,
dest);
if (ct == NULL)
return NULL;
@@ -297,10 +298,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
*/
if (svc->fwmark)
- ct = ip_vs_conn_in_get(IPPROTO_IP, snet, 0,
+ ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0,
htonl(svc->fwmark), 0);
else
- ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+ ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
iph->daddr, 0);
if (!ct || !ip_vs_check_template(ct)) {
@@ -325,14 +326,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
snet, 0,
htonl(svc->fwmark), 0,
dest->addr, 0,
- 0,
+ IP_VS_CONN_F_TEMPLATE,
dest);
else
ct = ip_vs_conn_new(iph->protocol,
snet, 0,
iph->daddr, 0,
dest->addr, 0,
- 0,
+ IP_VS_CONN_F_TEMPLATE,
dest);
if (ct == NULL)
return NULL;
@@ -529,7 +530,7 @@ static unsigned int ip_vs_post_routing(unsigned int hooknum,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY))
+ if (!((*pskb)->ipvs_property))
return NF_ACCEPT;
/* The packet was sent from IPVS, exit this chain */
@@ -701,7 +702,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
/* do the statistics and put it back */
ip_vs_out_stats(cp, skb);
- skb->nfcache |= NFC_IPVS_PROPERTY;
+ skb->ipvs_property = 1;
verdict = NF_ACCEPT;
out:
@@ -739,7 +740,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
EnterFunction(11);
- if (skb->nfcache & NFC_IPVS_PROPERTY)
+ if (skb->ipvs_property)
return NF_ACCEPT;
iph = skb->nh.iph;
@@ -821,7 +822,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
ip_vs_conn_put(cp);
- skb->nfcache |= NFC_IPVS_PROPERTY;
+ skb->ipvs_property = 1;
LeaveFunction(11);
return NF_ACCEPT;
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 7d99ede2ef7..2d66848e7aa 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -1598,7 +1598,7 @@ static ctl_table vs_table[] = {
{ .ctl_name = 0 }
};
-static ctl_table ipv4_table[] = {
+static ctl_table ipvs_ipv4_table[] = {
{
.ctl_name = NET_IPV4,
.procname = "ipv4",
@@ -1613,7 +1613,7 @@ static ctl_table vs_root_table[] = {
.ctl_name = CTL_NET,
.procname = "net",
.mode = 0555,
- .child = ipv4_table,
+ .child = ipvs_ipv4_table,
},
{ .ctl_name = 0 }
};
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index c035838b780..561cda326fa 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -131,7 +131,7 @@ static ctl_table vs_table[] = {
{ .ctl_name = 0 }
};
-static ctl_table ipv4_table[] = {
+static ctl_table ipvs_ipv4_table[] = {
{
.ctl_name = NET_IPV4,
.procname = "ipv4",
@@ -146,7 +146,7 @@ static ctl_table lblc_root_table[] = {
.ctl_name = CTL_NET,
.procname = "net",
.mode = 0555,
- .child = ipv4_table
+ .child = ipvs_ipv4_table
},
{ .ctl_name = 0 }
};
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index 22b5dd55d27..ce456dbf09a 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -320,7 +320,7 @@ static ctl_table vs_table[] = {
{ .ctl_name = 0 }
};
-static ctl_table ipv4_table[] = {
+static ctl_table ipvs_ipv4_table[] = {
{
.ctl_name = NET_IPV4,
.procname = "ipv4",
@@ -335,7 +335,7 @@ static ctl_table lblcr_root_table[] = {
.ctl_name = CTL_NET,
.procname = "net",
.mode = 0555,
- .child = ipv4_table
+ .child = ipvs_ipv4_table
},
{ .ctl_name = 0 }
};
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index e65de675da7..c19408973c0 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -604,14 +604,14 @@ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
}
-static void tcp_init(struct ip_vs_protocol *pp)
+static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
{
IP_VS_INIT_HASH_TABLE(tcp_apps);
pp->timeout_table = tcp_timeouts;
}
-static void tcp_exit(struct ip_vs_protocol *pp)
+static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
{
}
@@ -621,8 +621,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
.protocol = IPPROTO_TCP,
.dont_defrag = 0,
.appcnt = ATOMIC_INIT(0),
- .init = tcp_init,
- .exit = tcp_exit,
+ .init = ip_vs_tcp_init,
+ .exit = ip_vs_tcp_exit,
.register_app = tcp_register_app,
.unregister_app = tcp_unregister_app,
.conn_schedule = tcp_conn_schedule,
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 574d1f509b4..2e5ced3d806 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -297,16 +297,24 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
for (i=0; i<m->nr_conns; i++) {
+ unsigned flags;
+
s = (struct ip_vs_sync_conn *)p;
- cp = ip_vs_conn_in_get(s->protocol,
- s->caddr, s->cport,
- s->vaddr, s->vport);
+ flags = ntohs(s->flags);
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
+ cp = ip_vs_conn_in_get(s->protocol,
+ s->caddr, s->cport,
+ s->vaddr, s->vport);
+ else
+ cp = ip_vs_ct_in_get(s->protocol,
+ s->caddr, s->cport,
+ s->vaddr, s->vport);
if (!cp) {
cp = ip_vs_conn_new(s->protocol,
s->caddr, s->cport,
s->vaddr, s->vport,
s->daddr, s->dport,
- ntohs(s->flags), NULL);
+ flags, NULL);
if (!cp) {
IP_VS_ERR("ip_vs_conn_new failed\n");
return;
@@ -315,11 +323,11 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
} else if (!cp->dest) {
/* it is an entry created by the synchronization */
cp->state = ntohs(s->state);
- cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED;
+ cp->flags = flags | IP_VS_CONN_F_HASHED;
} /* Note that we don't touch its state and flags
if it is a normal entry. */
- if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) {
+ if (flags & IP_VS_CONN_F_SEQ_MASK) {
opt = (struct ip_vs_sync_conn_options *)&s[1];
memcpy(&cp->in_seq, opt, sizeof(*opt));
p += FULL_CONN_SIZE;
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index a8512a3fd08..3b87482049c 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -127,7 +127,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
#define IP_VS_XMIT(skb, rt) \
do { \
- (skb)->nfcache |= NFC_IPVS_PROPERTY; \
+ (skb)->ipvs_property = 1; \
(skb)->ip_summed = CHECKSUM_NONE; \
NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
(rt)->u.dst.dev, dst_output); \
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
index c9cf8726051..db67373f9b3 100644
--- a/net/ipv4/multipath_drr.c
+++ b/net/ipv4/multipath_drr.c
@@ -107,7 +107,7 @@ static int drr_dev_event(struct notifier_block *this,
return NOTIFY_DONE;
}
-struct notifier_block drr_dev_notifier = {
+static struct notifier_block drr_dev_notifier = {
.notifier_call = drr_dev_event,
};
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
new file mode 100644
index 00000000000..ae0779d82c5
--- /dev/null
+++ b/net/ipv4/netfilter.c
@@ -0,0 +1,139 @@
+/* IPv4 specific functions of netfilter core */
+
+#include <linux/config.h>
+#ifdef CONFIG_NETFILTER
+
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <net/route.h>
+#include <linux/ip.h>
+
+/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue
+ *
+ * Re-does the routing decision for *pskb from its current IP header and
+ * replaces (*pskb)->dst with the result.  If the new output device needs
+ * more link-layer headroom than the skb has, the skb is reallocated and
+ * *pskb is updated to point at the new buffer (the old one is freed).
+ *
+ * Returns 0 on success, -1 on any failure (route lookup failed, the new
+ * dst carries an error, or the headroom reallocation failed).
+ */
+int ip_route_me_harder(struct sk_buff **pskb)
+{
+ struct iphdr *iph = (*pskb)->nh.iph;
+ struct rtable *rt;
+ struct flowi fl = {};
+ struct dst_entry *odst;
+ unsigned int hh_len;
+
+ /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
+ * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
+ */
+ if (inet_addr_type(iph->saddr) == RTN_LOCAL) {
+ /* Local source address: plain output route lookup keyed on the
+ * packet's own addresses, TOS, bound device and protocol. */
+ fl.nl_u.ip4_u.daddr = iph->daddr;
+ fl.nl_u.ip4_u.saddr = iph->saddr;
+ fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+ fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
+#endif
+ fl.proto = iph->protocol;
+ if (ip_route_output_key(&rt, &fl) != 0)
+ return -1;
+
+ /* Drop old route. */
+ dst_release((*pskb)->dst);
+ (*pskb)->dst = &rt->u.dst;
+ } else {
+ /* non-local src, find valid iif to satisfy
+ * rp-filter when calling ip_route_input. */
+ fl.nl_u.ip4_u.daddr = iph->saddr;
+ if (ip_route_output_key(&rt, &fl) != 0)
+ return -1;
+
+ /* Remember the old dst so it can be released once
+ * ip_route_input() has succeeded. */
+ odst = (*pskb)->dst;
+ if (ip_route_input(*pskb, iph->daddr, iph->saddr,
+ RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
+ dst_release(&rt->u.dst);
+ return -1;
+ }
+ /* rt was only needed for its device; release it either way. */
+ dst_release(&rt->u.dst);
+ dst_release(odst);
+ }
+
+ if ((*pskb)->dst->error)
+ return -1;
+
+ /* Change in oif may mean change in hh_len. */
+ hh_len = (*pskb)->dst->dev->hard_header_len;
+ if (skb_headroom(*pskb) < hh_len) {
+ struct sk_buff *nskb;
+
+ nskb = skb_realloc_headroom(*pskb, hh_len);
+ if (!nskb)
+ return -1;
+ /* Carry the owning socket over to the replacement skb. */
+ if ((*pskb)->sk)
+ skb_set_owner_w(nskb, (*pskb)->sk);
+ kfree_skb(*pskb);
+ *pskb = nskb;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ip_route_me_harder);
+
+/*
+ * Extra routing may be needed on local out, as the QUEUE target never
+ * returns control to the table.
+ *
+ * struct ip_rt_info holds the routing-relevant IP header fields of a
+ * packet as they were when it was queued (filled in by queue_save()).
+ */
+
+struct ip_rt_info {
+ u_int32_t daddr; /* copy of iph->daddr */
+ u_int32_t saddr; /* copy of iph->saddr */
+ u_int8_t tos; /* copy of iph->tos */
+};
+
+/*
+ * .save callback of ip_reroute: for packets queued on NF_IP_LOCAL_OUT,
+ * record tos/daddr/saddr from the IP header in the reroute area of info.
+ * Packets on any other hook are left alone.
+ */
+static void queue_save(const struct sk_buff *skb, struct nf_info *info)
+{
+ struct ip_rt_info *rt_info = nf_info_reroute(info);
+
+ if (info->hook == NF_IP_LOCAL_OUT) {
+ const struct iphdr *iph = skb->nh.iph;
+
+ rt_info->tos = iph->tos;
+ rt_info->daddr = iph->daddr;
+ rt_info->saddr = iph->saddr;
+ }
+}
+
+/*
+ * .reroute callback of ip_reroute: for NF_IP_LOCAL_OUT packets, compare
+ * the current tos/daddr/saddr with the values stored by queue_save() and
+ * call ip_route_me_harder() if any of them differs.
+ *
+ * Returns the result of ip_route_me_harder() when rerouting was needed,
+ * 0 otherwise.
+ */
+static int queue_reroute(struct sk_buff **pskb, const struct nf_info *info)
+{
+ const struct ip_rt_info *rt_info = nf_info_reroute(info);
+
+ if (info->hook == NF_IP_LOCAL_OUT) {
+ struct iphdr *iph = (*pskb)->nh.iph;
+
+ /* Any change to a routing key invalidates the old route. */
+ if (!(iph->tos == rt_info->tos
+ && iph->daddr == rt_info->daddr
+ && iph->saddr == rt_info->saddr))
+ return ip_route_me_harder(pskb);
+ }
+ return 0;
+}
+
+/* IPv4 queue rerouter: per-packet state size plus the save/reroute hooks. */
+static struct nf_queue_rerouter ip_reroute = {
+ .rer_size = sizeof(struct ip_rt_info),
+ .save = queue_save,
+ .reroute = queue_reroute,
+};
+
+/* Module init: register ip_reroute as the queue rerouter for PF_INET. */
+static int init(void)
+{
+ return nf_register_queue_rerouter(PF_INET, &ip_reroute);
+}
+
+/* Module exit: unregister the PF_INET queue rerouter. */
+static void fini(void)
+{
+ nf_unregister_queue_rerouter(PF_INET);
+}
+
+module_init(init);
+module_exit(fini);
+
+#endif /* CONFIG_NETFILTER */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 46d4cb1c06f..7d917e4ce1d 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -34,12 +34,31 @@ config IP_NF_CT_ACCT
config IP_NF_CONNTRACK_MARK
bool 'Connection mark tracking support'
+ depends on IP_NF_CONNTRACK
help
This option enables support for connection marks, used by the
`CONNMARK' target and `connmark' match. Similar to the mark value
of packets, but this mark value is kept in the conntrack session
instead of the individual packets.
+config IP_NF_CONNTRACK_EVENTS
+ bool "Connection tracking events"
+ depends on IP_NF_CONNTRACK
+ help
+ If this option is enabled, the connection tracking code will
+ provide a notifier chain that can be used by other kernel code
+ to get notified about changes in the connection tracking state.
+
+ If unsure, say `N'.
+
+config IP_NF_CONNTRACK_NETLINK
+ tristate 'Connection tracking netlink interface'
+ depends on IP_NF_CONNTRACK && NETFILTER_NETLINK
+ depends on IP_NF_CONNTRACK!=y || NETFILTER_NETLINK!=m
+ help
+ This option enables support for a netlink-based userspace interface.
+
+
config IP_NF_CT_PROTO_SCTP
tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)'
depends on IP_NF_CONNTRACK && EXPERIMENTAL
@@ -75,6 +94,25 @@ config IP_NF_IRC
To compile it as a module, choose M here. If unsure, say Y.
+config IP_NF_NETBIOS_NS
+ tristate "NetBIOS name service protocol support (EXPERIMENTAL)"
+ depends on IP_NF_CONNTRACK && EXPERIMENTAL
+ help
+ NetBIOS name service requests are sent as broadcast messages from an
+ unprivileged port and responded to with unicast messages to the
+ same port. This makes them hard to firewall properly because connection
+ tracking doesn't deal with broadcasts. This helper tracks locally
+ originating NetBIOS name service requests and the corresponding
+ responses. It relies on correct IP address configuration, specifically
+ netmask and broadcast address. When properly configured, the output
+ of "ip address show" should look similar to this:
+
+ $ ip -4 address show eth0
+ 4: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc pfifo_fast qlen 1000
+ inet 172.16.2.252/24 brd 172.16.2.255 scope global eth0
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config IP_NF_TFTP
tristate "TFTP protocol support"
depends on IP_NF_CONNTRACK
@@ -99,12 +137,33 @@ config IP_NF_AMANDA
To compile it as a module, choose M here. If unsure, say Y.
+config IP_NF_PPTP
+ tristate 'PPTP protocol support'
+ depends on IP_NF_CONNTRACK
+ help
+ This module adds support for PPTP (Point to Point Tunnelling
+ Protocol, RFC2637) connection tracking and NAT.
+
+ If you are running PPTP sessions over a stateful firewall or NAT
+ box, you may want to enable this feature.
+
+ Please note that not all PPTP modes of operation are supported yet.
+ For more info, read the top of the file
+ net/ipv4/netfilter/ip_conntrack_helper_pptp.c
+
+ If you want to compile it as a module, say M here and read
+ Documentation/modules.txt. If unsure, say `N'.
+
config IP_NF_QUEUE
- tristate "Userspace queueing via NETLINK"
+ tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
help
Netfilter has the ability to queue packets to user space: the
netlink device can be used to access them using this driver.
+ This option enables the old IPv4-only "ip_queue" implementation
+ which has been obsoleted by the new "nfnetlink_queue" code (see
+ CONFIG_NETFILTER_NETLINK_QUEUE).
+
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_IPTABLES
@@ -340,6 +399,17 @@ config IP_NF_MATCH_SCTP
If you want to compile it as a module, say M here and read
<file:Documentation/modules.txt>. If unsure, say `N'.
+config IP_NF_MATCH_DCCP
+ tristate 'DCCP protocol match support'
+ depends on IP_NF_IPTABLES
+ help
+ With this option enabled, you will be able to use the iptables
+ `dccp' match in order to match on DCCP source/destination ports
+ and DCCP flags.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/modules.txt>. If unsure, say `N'.
+
config IP_NF_MATCH_COMMENT
tristate 'comment match support'
depends on IP_NF_IPTABLES
@@ -361,6 +431,16 @@ config IP_NF_MATCH_CONNMARK
<file:Documentation/modules.txt>. The module will be called
ipt_connmark.o. If unsure, say `N'.
+config IP_NF_MATCH_CONNBYTES
+ tristate 'Connection byte/packet counter match support'
+ depends on IP_NF_CT_ACCT && IP_NF_IPTABLES
+ help
+ This option adds a `connbytes' match, which allows you to match the
+ number of bytes and/or packets for each direction within a connection.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/modules.txt>. If unsure, say `N'.
+
config IP_NF_MATCH_HASHLIMIT
tristate 'hashlimit match support'
depends on IP_NF_IPTABLES
@@ -375,6 +455,19 @@ config IP_NF_MATCH_HASHLIMIT
destination IP' or `500pps from any given source IP' with a single
IPtables rule.
+config IP_NF_MATCH_STRING
+ tristate 'string match support'
+ depends on IP_NF_IPTABLES
+ select TEXTSEARCH
+ select TEXTSEARCH_KMP
+ select TEXTSEARCH_BM
+ select TEXTSEARCH_FSM
+ help
+ This option adds a `string' match, which allows you to look for
+ pattern matchings in packets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
# `filter', generic and specific targets
config IP_NF_FILTER
tristate "Packet filtering"
@@ -406,9 +499,14 @@ config IP_NF_TARGET_LOG
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_TARGET_ULOG
- tristate "ULOG target support"
+ tristate "ULOG target support (OBSOLETE)"
depends on IP_NF_IPTABLES
---help---
+
+ This option enables the old IPv4-only "ipt_ULOG" implementation
+ which has been obsoleted by the new "nfnetlink_log" code (see
+ CONFIG_NETFILTER_NETLINK_LOG).
+
This option adds a `ULOG' target, which allows you to create rules in
any iptables table. The packet is passed to a userspace logging
daemon using netlink multicast sockets; unlike the LOG target
@@ -445,6 +543,17 @@ config IP_NF_TARGET_TCPMSS
To compile it as a module, choose M here. If unsure, say N.
+config IP_NF_TARGET_NFQUEUE
+ tristate "NFQUEUE Target Support"
+ depends on IP_NF_IPTABLES
+ help
+ This Target replaced the old obsolete QUEUE target.
+
+ As opposed to QUEUE, it supports 65535 different queues,
+ not just one.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
# NAT + specific targets
config IP_NF_NAT
tristate "Full NAT"
@@ -545,6 +654,12 @@ config IP_NF_NAT_AMANDA
default IP_NF_NAT if IP_NF_AMANDA=y
default m if IP_NF_AMANDA=m
+config IP_NF_NAT_PPTP
+ tristate
+ depends on IP_NF_NAT!=n && IP_NF_PPTP!=n
+ default IP_NF_NAT if IP_NF_PPTP=y
+ default m if IP_NF_PPTP=m
+
# mangle + specific targets
config IP_NF_MANGLE
tristate "Packet mangling"
@@ -616,6 +731,20 @@ config IP_NF_TARGET_CLASSIFY
To compile it as a module, choose M here. If unsure, say N.
+config IP_NF_TARGET_TTL
+ tristate 'TTL target support'
+ depends on IP_NF_MANGLE
+ help
+ This option adds a `TTL' target, which enables the user to modify
+ the TTL value of the IP header.
+
+ While it is safe to decrement/lower the TTL, this target also enables
+ functionality to increment and set the TTL value of the IP header to
+ arbitrary values. This is EXTREMELY DANGEROUS since you can easily
+ create immortal packets that loop forever on the network.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config IP_NF_TARGET_CONNMARK
tristate 'CONNMARK target support'
depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 45796d5924d..dab4b58dd31 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -4,21 +4,32 @@
# objects for the standalone - connection tracking / NAT
ip_conntrack-objs := ip_conntrack_standalone.o ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o
-iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
+ip_nat-objs := ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
+iptable_nat-objs := ip_nat_rule.o ip_nat_standalone.o
+
+ip_conntrack_pptp-objs := ip_conntrack_helper_pptp.o ip_conntrack_proto_gre.o
+ip_nat_pptp-objs := ip_nat_helper_pptp.o ip_nat_proto_gre.o
# connection tracking
obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
+# conntrack netlink interface
+obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o
+
+
# SCTP protocol connection tracking
obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
# connection tracking helpers
+obj-$(CONFIG_IP_NF_PPTP) += ip_conntrack_pptp.o
obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
+obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o
# NAT helpers
+obj-$(CONFIG_IP_NF_NAT_PPTP) += ip_nat_pptp.o
obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
@@ -30,7 +41,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
# the three instances of ip_tables
obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
-obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
+obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o ip_nat.o
obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
# matches
@@ -38,6 +49,7 @@ obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o
obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o
+obj-$(CONFIG_IP_NF_MATCH_DCCP) += ipt_dccp.o
obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
@@ -54,11 +66,13 @@ obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o
obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o
obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o
+obj-$(CONFIG_IP_NF_MATCH_CONNBYTES) += ipt_connbytes.o
obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o
obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o
+obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o
# targets
obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
@@ -78,6 +92,8 @@ obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o
obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
+obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o
+obj-$(CONFIG_IP_NF_TARGET_NFQUEUE) += ipt_NFQUEUE.o
# generic ARP tables
obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index fa163425668..3c2e9639bba 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -347,58 +347,106 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
return verdict;
}
-static inline void *find_inlist_lock_noload(struct list_head *head,
- const char *name,
- int *error,
- struct semaphore *mutex)
+/*
+ * These are weird, but module loading must not be done with mutex
+ * held (since they will register), and we have to have a single
+ * function to use try_then_request_module().
+ */
+
+/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
+static inline struct arpt_table *find_table_lock(const char *name)
{
- void *ret;
+ struct arpt_table *t;
- *error = down_interruptible(mutex);
- if (*error != 0)
- return NULL;
+ if (down_interruptible(&arpt_mutex) != 0)
+ return ERR_PTR(-EINTR);
- ret = list_named_find(head, name);
- if (!ret) {
- *error = -ENOENT;
- up(mutex);
- }
- return ret;
+ list_for_each_entry(t, &arpt_tables, list)
+ if (strcmp(t->name, name) == 0 && try_module_get(t->me))
+ return t;
+ up(&arpt_mutex);
+ return NULL;
}
-#ifndef CONFIG_KMOD
-#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m))
-#else
-static void *
-find_inlist_lock(struct list_head *head,
- const char *name,
- const char *prefix,
- int *error,
- struct semaphore *mutex)
+
+/* Find target, grabs ref. Returns ERR_PTR() on error. */
+static inline struct arpt_target *find_target(const char *name, u8 revision)
{
- void *ret;
+ struct arpt_target *t;
+ int err = 0;
- ret = find_inlist_lock_noload(head, name, error, mutex);
- if (!ret) {
- duprintf("find_inlist: loading `%s%s'.\n", prefix, name);
- request_module("%s%s", prefix, name);
- ret = find_inlist_lock_noload(head, name, error, mutex);
+ if (down_interruptible(&arpt_mutex) != 0)
+ return ERR_PTR(-EINTR);
+
+ list_for_each_entry(t, &arpt_target, list) {
+ if (strcmp(t->name, name) == 0) {
+ if (t->revision == revision) {
+ if (try_module_get(t->me)) {
+ up(&arpt_mutex);
+ return t;
+ }
+ } else
+ err = -EPROTOTYPE; /* Found something. */
+ }
}
+ up(&arpt_mutex);
+ return ERR_PTR(err);
+}
- return ret;
+struct arpt_target *arpt_find_target(const char *name, u8 revision)
+{
+ struct arpt_target *target;
+
+ target = try_then_request_module(find_target(name, revision),
+ "arpt_%s", name);
+ if (IS_ERR(target) || !target)
+ return NULL;
+ return target;
}
-#endif
-static inline struct arpt_table *arpt_find_table_lock(const char *name, int *error, struct semaphore *mutex)
+static int target_revfn(const char *name, u8 revision, int *bestp)
{
- return find_inlist_lock(&arpt_tables, name, "arptable_", error, mutex);
+ struct arpt_target *t;
+ int have_rev = 0;
+
+ list_for_each_entry(t, &arpt_target, list) {
+ if (strcmp(t->name, name) == 0) {
+ if (t->revision > *bestp)
+ *bestp = t->revision;
+ if (t->revision == revision)
+ have_rev =1;
+ }
+ }
+ return have_rev;
}
-static struct arpt_target *arpt_find_target_lock(const char *name, int *error, struct semaphore *mutex)
+/* Returns true or false (if no such extension at all) */
+static inline int find_revision(const char *name, u8 revision,
+ int (*revfn)(const char *, u8, int *),
+ int *err)
{
- return find_inlist_lock(&arpt_target, name, "arpt_", error, mutex);
+ int have_rev, best = -1;
+
+ if (down_interruptible(&arpt_mutex) != 0) {
+ *err = -EINTR;
+ return 1;
+ }
+ have_rev = revfn(name, revision, &best);
+ up(&arpt_mutex);
+
+ /* Nothing at all? Return 0 to try loading module. */
+ if (best == -1) {
+ *err = -ENOENT;
+ return 0;
+ }
+
+ *err = best;
+ if (!have_rev)
+ *err = -EPROTONOSUPPORT;
+ return 1;
}
+
/* All zeroes == unconditional rule. */
static inline int unconditional(const struct arpt_arp *arp)
{
@@ -544,17 +592,15 @@ static inline int check_entry(struct arpt_entry *e, const char *name, unsigned i
}
t = arpt_get_target(e);
- target = arpt_find_target_lock(t->u.user.name, &ret, &arpt_mutex);
- if (!target) {
+ target = try_then_request_module(find_target(t->u.user.name,
+ t->u.user.revision),
+ "arpt_%s", t->u.user.name);
+ if (IS_ERR(target) || !target) {
duprintf("check_entry: `%s' not found\n", t->u.user.name);
+ ret = target ? PTR_ERR(target) : -ENOENT;
goto out;
}
- if (!try_module_get((target->me))) {
- ret = -ENOENT;
- goto out_unlock;
- }
t->u.kernel.target = target;
- up(&arpt_mutex);
if (t->u.kernel.target == &arpt_standard_target) {
if (!standard_check(t, size)) {
@@ -576,8 +622,6 @@ static inline int check_entry(struct arpt_entry *e, const char *name, unsigned i
(*i)++;
return 0;
-out_unlock:
- up(&arpt_mutex);
out:
return ret;
}
@@ -716,8 +760,10 @@ static int translate_table(const char *name,
}
/* And one copy for every other CPU */
- for (i = 1; i < num_possible_cpus(); i++) {
- memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
+ for_each_cpu(i) {
+ if (i == 0)
+ continue;
+ memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
newinfo->entries,
SMP_ALIGN(newinfo->size));
}
@@ -767,7 +813,7 @@ static void get_counters(const struct arpt_table_info *t,
unsigned int cpu;
unsigned int i;
- for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+ for_each_cpu(cpu) {
i = 0;
ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
t->size,
@@ -844,8 +890,8 @@ static int get_entries(const struct arpt_get_entries *entries,
int ret;
struct arpt_table *t;
- t = arpt_find_table_lock(entries->name, &ret, &arpt_mutex);
- if (t) {
+ t = find_table_lock(entries->name);
+ if (t && !IS_ERR(t)) { /* NULL: not found; ERR_PTR: interrupted */
duprintf("t->private->number = %u\n",
t->private->number);
if (entries->size == t->private->size)
@@ -857,10 +903,10 @@ static int get_entries(const struct arpt_get_entries *entries,
entries->size);
ret = -EINVAL;
}
+ module_put(t->me);
up(&arpt_mutex);
} else
- duprintf("get_entries: Can't find %s!\n",
- entries->name);
+ ret = t ? PTR_ERR(t) : -ENOENT;
return ret;
}
@@ -885,7 +931,8 @@ static int do_replace(void __user *user, unsigned int len)
return -ENOMEM;
newinfo = vmalloc(sizeof(struct arpt_table_info)
- + SMP_ALIGN(tmp.size) * num_possible_cpus());
+ + SMP_ALIGN(tmp.size) *
+ (highest_possible_processor_id()+1));
if (!newinfo)
return -ENOMEM;
@@ -910,22 +957,19 @@ static int do_replace(void __user *user, unsigned int len)
duprintf("arp_tables: Translated table\n");
- t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex);
- if (!t)
+ t = try_then_request_module(find_table_lock(tmp.name),
+ "arptable_%s", tmp.name);
+ if (!t || IS_ERR(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
goto free_newinfo_counters_untrans;
+ }
/* You lied! */
if (tmp.valid_hooks != t->valid_hooks) {
duprintf("Valid hook crap: %08X vs %08X\n",
tmp.valid_hooks, t->valid_hooks);
ret = -EINVAL;
- goto free_newinfo_counters_untrans_unlock;
- }
-
- /* Get a reference in advance, we're not allowed fail later */
- if (!try_module_get(t->me)) {
- ret = -EBUSY;
- goto free_newinfo_counters_untrans_unlock;
+ goto put_module;
}
oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
@@ -956,7 +1000,6 @@ static int do_replace(void __user *user, unsigned int len)
put_module:
module_put(t->me);
- free_newinfo_counters_untrans_unlock:
up(&arpt_mutex);
free_newinfo_counters_untrans:
ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL);
@@ -986,7 +1029,7 @@ static int do_add_counters(void __user *user, unsigned int len)
unsigned int i;
struct arpt_counters_info tmp, *paddc;
struct arpt_table *t;
- int ret;
+ int ret = 0;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1003,9 +1046,11 @@ static int do_add_counters(void __user *user, unsigned int len)
goto free;
}
- t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex);
- if (!t)
+ t = find_table_lock(tmp.name);
+ if (!t || IS_ERR(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
goto free;
+ }
write_lock_bh(&t->lock);
if (t->private->number != paddc->num_counters) {
@@ -1022,6 +1067,7 @@ static int do_add_counters(void __user *user, unsigned int len)
unlock_up_free:
write_unlock_bh(&t->lock);
up(&arpt_mutex);
+ module_put(t->me);
free:
vfree(paddc);
@@ -1076,8 +1122,10 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
break;
}
name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
- t = arpt_find_table_lock(name, &ret, &arpt_mutex);
- if (t) {
+
+ t = try_then_request_module(find_table_lock(name),
+ "arptable_%s", name);
+ if (t && !IS_ERR(t)) {
struct arpt_getinfo info;
info.valid_hooks = t->valid_hooks;
@@ -1093,9 +1141,10 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
ret = -EFAULT;
else
ret = 0;
-
up(&arpt_mutex);
- }
+ module_put(t->me);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
}
break;
@@ -1116,6 +1165,24 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
break;
}
+ case ARPT_SO_GET_REVISION_TARGET: {
+ struct arpt_get_revision rev;
+
+ if (*len != sizeof(rev)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+
+ try_then_request_module(find_revision(rev.name, rev.revision,
+ target_revfn, &ret),
+ "arpt_%s", rev.name);
+ break;
+ }
+
default:
duprintf("do_arpt_get_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
@@ -1133,12 +1200,9 @@ int arpt_register_target(struct arpt_target *target)
if (ret != 0)
return ret;
- if (!list_named_insert(&arpt_target, target)) {
- duprintf("arpt_register_target: `%s' already in list!\n",
- target->name);
- ret = -EINVAL;
- }
+ list_add(&target->list, &arpt_target);
up(&arpt_mutex);
+
return ret;
}
@@ -1158,7 +1222,8 @@ int arpt_register_table(struct arpt_table *table,
= { 0, 0, 0, { 0 }, { 0 }, { } };
newinfo = vmalloc(sizeof(struct arpt_table_info)
- + SMP_ALIGN(repl->size) * num_possible_cpus());
+ + SMP_ALIGN(repl->size) *
+ (highest_possible_processor_id()+1));
if (!newinfo) {
ret = -ENOMEM;
return ret;
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index 01e1b58322a..fa3f914117e 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -40,7 +40,7 @@ MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
static char *conns[] = { "DATA ", "MESG ", "INDEX " };
/* This is slow, but it's simple. --RR */
-static char amanda_buffer[65536];
+static char *amanda_buffer;
static DEFINE_SPINLOCK(amanda_buffer_lock);
unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
@@ -65,7 +65,7 @@ static int help(struct sk_buff **pskb,
/* increase the UDP timeout of the master connection as replies from
* Amanda clients to the server can be quite delayed */
- ip_ct_refresh_acct(ct, ctinfo, NULL, master_timeout * HZ);
+ ip_ct_refresh(ct, *pskb, master_timeout * HZ);
/* No data? */
dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
@@ -108,6 +108,7 @@ static int help(struct sk_buff **pskb,
}
exp->expectfn = NULL;
+ exp->flags = 0;
exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
exp->tuple.src.u.tcp.port = 0;
@@ -153,11 +154,25 @@ static struct ip_conntrack_helper amanda_helper = {
static void __exit fini(void)
{
ip_conntrack_helper_unregister(&amanda_helper);
+ kfree(amanda_buffer);
}
static int __init init(void)
{
- return ip_conntrack_helper_register(&amanda_helper);
+ int ret;
+
+ amanda_buffer = kmalloc(65536, GFP_KERNEL);
+ if (!amanda_buffer)
+ return -ENOMEM;
+
+ ret = ip_conntrack_helper_register(&amanda_helper);
+ if (ret < 0) {
+ kfree(amanda_buffer);
+ return ret;
+ }
+ return 0;
+
+
}
module_init(init);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index a7f0c821a9b..422ab68ee7f 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -37,6 +37,7 @@
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
+#include <linux/notifier.h>
/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
registrations, conntrack timers*/
@@ -49,7 +50,7 @@
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>
-#define IP_CONNTRACK_VERSION "2.1"
+#define IP_CONNTRACK_VERSION "2.4"
#if 0
#define DEBUGP printk
@@ -69,35 +70,98 @@ static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
struct list_head *ip_conntrack_hash;
-static kmem_cache_t *ip_conntrack_cachep;
-static kmem_cache_t *ip_conntrack_expect_cachep;
+static kmem_cache_t *ip_conntrack_cachep __read_mostly;
+static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
struct ip_conntrack ip_conntrack_untracked;
unsigned int ip_ct_log_invalid;
static LIST_HEAD(unconfirmed);
static int ip_conntrack_vmalloc;
-DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
+static unsigned int ip_conntrack_next_id = 1;
+static unsigned int ip_conntrack_expect_next_id = 1;
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+struct notifier_block *ip_conntrack_chain;
+struct notifier_block *ip_conntrack_expect_chain;
+
+DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
+
+/* deliver cached events and clear cache entry - must be called with locally
+ * disabled softirqs */
+static inline void
+__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
+{
+ DEBUGP("ecache: delivering events for %p\n", ecache->ct);
+ if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
+ notifier_call_chain(&ip_conntrack_chain, ecache->events,
+ ecache->ct);
+ ecache->events = 0;
+ ip_conntrack_put(ecache->ct);
+ ecache->ct = NULL;
+}
+
+/* Deliver all cached events for a particular conntrack. This is called
+ * by code prior to async packet handling or freeing the skb */
+void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
+{
+ struct ip_conntrack_ecache *ecache;
+
+ local_bh_disable();
+ ecache = &__get_cpu_var(ip_conntrack_ecache);
+ if (ecache->ct == ct)
+ __ip_ct_deliver_cached_events(ecache);
+ local_bh_enable();
+}
-void
-ip_conntrack_put(struct ip_conntrack *ct)
+void __ip_ct_event_cache_init(struct ip_conntrack *ct)
{
- IP_NF_ASSERT(ct);
- nf_conntrack_put(&ct->ct_general);
+ struct ip_conntrack_ecache *ecache;
+
+ /* take care of delivering potentially old events */
+ ecache = &__get_cpu_var(ip_conntrack_ecache);
+ BUG_ON(ecache->ct == ct);
+ if (ecache->ct)
+ __ip_ct_deliver_cached_events(ecache);
+ /* initialize for this conntrack/packet */
+ ecache->ct = ct;
+ nf_conntrack_get(&ct->ct_general);
+}
+
+/* flush the event cache - touches other CPU's data and must not be called while
+ * packets are still passing through the code */
+static void ip_ct_event_cache_flush(void)
+{
+ struct ip_conntrack_ecache *ecache;
+ int cpu;
+
+ for_each_cpu(cpu) {
+ ecache = &per_cpu(ip_conntrack_ecache, cpu);
+ if (ecache->ct)
+ ip_conntrack_put(ecache->ct);
+ }
}
+#else
+static inline void ip_ct_event_cache_flush(void) {}
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+
+DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;
-static u_int32_t
-hash_conntrack(const struct ip_conntrack_tuple *tuple)
+static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
+ unsigned int size, unsigned int rnd)
{
-#if 0
- dump_tuple(tuple);
-#endif
return (jhash_3words(tuple->src.ip,
(tuple->dst.ip ^ tuple->dst.protonum),
(tuple->src.u.all | (tuple->dst.u.all << 16)),
- ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
+ rnd) % size);
+}
+
+static u_int32_t
+hash_conntrack(const struct ip_conntrack_tuple *tuple)
+{
+ return __hash_conntrack(tuple, ip_conntrack_htable_size,
+ ip_conntrack_hash_rnd);
}
int
@@ -137,13 +201,14 @@ ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
/* ip_conntrack_expect helper functions */
-static void unlink_expect(struct ip_conntrack_expect *exp)
+void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
{
ASSERT_WRITE_LOCK(&ip_conntrack_lock);
IP_NF_ASSERT(!timer_pending(&exp->timeout));
list_del(&exp->list);
CONNTRACK_STAT_INC(expect_delete);
exp->master->expecting--;
+ ip_conntrack_expect_put(exp);
}
static void expectation_timed_out(unsigned long ul_expect)
@@ -151,11 +216,38 @@ static void expectation_timed_out(unsigned long ul_expect)
struct ip_conntrack_expect *exp = (void *)ul_expect;
write_lock_bh(&ip_conntrack_lock);
- unlink_expect(exp);
+ ip_ct_unlink_expect(exp);
write_unlock_bh(&ip_conntrack_lock);
ip_conntrack_expect_put(exp);
}
+struct ip_conntrack_expect *
+__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
+{
+ struct ip_conntrack_expect *i;
+
+ list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+ if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
+ atomic_inc(&i->use);
+ return i;
+ }
+ }
+ return NULL;
+}
+
+/* Just find an expectation corresponding to a tuple. */
+struct ip_conntrack_expect *
+ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
+{
+ struct ip_conntrack_expect *i;
+
+ read_lock_bh(&ip_conntrack_lock);
+ i = __ip_conntrack_expect_find(tuple);
+ read_unlock_bh(&ip_conntrack_lock);
+
+ return i;
+}
+
/* If an expectation for this connection is found, it gets delete from
* global list then returned. */
static struct ip_conntrack_expect *
@@ -170,17 +262,21 @@ find_expectation(const struct ip_conntrack_tuple *tuple)
master ct never got confirmed, we'd hold a reference to it
and weird things would happen to future packets). */
if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
- && is_confirmed(i->master)
- && del_timer(&i->timeout)) {
- unlink_expect(i);
- return i;
+ && is_confirmed(i->master)) {
+ if (i->flags & IP_CT_EXPECT_PERMANENT) {
+ atomic_inc(&i->use);
+ return i;
+ } else if (del_timer(&i->timeout)) {
+ ip_ct_unlink_expect(i);
+ return i;
+ }
}
}
return NULL;
}
/* delete all expectations for this conntrack */
-static void remove_expectations(struct ip_conntrack *ct)
+void ip_ct_remove_expectations(struct ip_conntrack *ct)
{
struct ip_conntrack_expect *i, *tmp;
@@ -190,7 +286,7 @@ static void remove_expectations(struct ip_conntrack *ct)
list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
if (i->master == ct && del_timer(&i->timeout)) {
- unlink_expect(i);
+ ip_ct_unlink_expect(i);
ip_conntrack_expect_put(i);
}
}
@@ -210,7 +306,7 @@ clean_from_lists(struct ip_conntrack *ct)
LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
/* Destroy all pending expectations */
- remove_expectations(ct);
+ ip_ct_remove_expectations(ct);
}
static void
@@ -223,10 +319,13 @@ destroy_conntrack(struct nf_conntrack *nfct)
IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
IP_NF_ASSERT(!timer_pending(&ct->timeout));
+ ip_conntrack_event(IPCT_DESTROY, ct);
+ set_bit(IPS_DYING_BIT, &ct->status);
+
/* To make sure we don't get any weird locking issues here:
* destroy_conntrack() MUST NOT be called with a write lock
* to ip_conntrack_lock!!! -HW */
- proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
+ proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
if (proto && proto->destroy)
proto->destroy(ct);
@@ -238,7 +337,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
* except TFTP can create an expectation on the first packet,
* before connection is in the list, so we need to clean here,
* too. */
- remove_expectations(ct);
+ ip_ct_remove_expectations(ct);
/* We overload first tuple to link into unconfirmed list. */
if (!is_confirmed(ct)) {
@@ -253,8 +352,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
ip_conntrack_put(ct->master);
DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
- kmem_cache_free(ip_conntrack_cachep, ct);
- atomic_dec(&ip_conntrack_count);
+ ip_conntrack_free(ct);
}
static void death_by_timeout(unsigned long ul_conntrack)
@@ -280,7 +378,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
&& ip_ct_tuple_equal(tuple, &i->tuple);
}
-static struct ip_conntrack_tuple_hash *
+struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
const struct ip_conntrack *ignored_conntrack)
{
@@ -315,6 +413,29 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
return h;
}
+static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
+ unsigned int hash,
+ unsigned int repl_hash)
+{
+ ct->id = ++ip_conntrack_next_id;
+ list_prepend(&ip_conntrack_hash[hash],
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+ list_prepend(&ip_conntrack_hash[repl_hash],
+ &ct->tuplehash[IP_CT_DIR_REPLY].list);
+}
+
+void ip_conntrack_hash_insert(struct ip_conntrack *ct)
+{
+ unsigned int hash, repl_hash;
+
+ hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ write_lock_bh(&ip_conntrack_lock);
+ __ip_conntrack_hash_insert(ct, hash, repl_hash);
+ write_unlock_bh(&ip_conntrack_lock);
+}
+
/* Confirm a connection given skb; places it in hash table */
int
__ip_conntrack_confirm(struct sk_buff **pskb)
@@ -361,10 +482,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
/* Remove from unconfirmed list */
list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
- list_prepend(&ip_conntrack_hash[hash],
- &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
- list_prepend(&ip_conntrack_hash[repl_hash],
- &ct->tuplehash[IP_CT_DIR_REPLY]);
+ __ip_conntrack_hash_insert(ct, hash, repl_hash);
/* Timer relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
weird delay cases. */
@@ -374,6 +492,16 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
set_bit(IPS_CONFIRMED_BIT, &ct->status);
CONNTRACK_STAT_INC(insert);
write_unlock_bh(&ip_conntrack_lock);
+ if (ct->helper)
+ ip_conntrack_event_cache(IPCT_HELPER, *pskb);
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+ if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
+ test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
+ ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
+#endif
+ ip_conntrack_event_cache(master_ct(ct) ?
+ IPCT_RELATED : IPCT_NEW, *pskb);
+
return NF_ACCEPT;
}
@@ -438,34 +566,84 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i,
return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}
-static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
+static struct ip_conntrack_helper *
+__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
{
return LIST_FIND(&helpers, helper_cmp,
struct ip_conntrack_helper *,
tuple);
}
-/* Allocate a new conntrack: we return -ENOMEM if classification
- failed due to stress. Otherwise it really is unclassifiable. */
-static struct ip_conntrack_tuple_hash *
-init_conntrack(const struct ip_conntrack_tuple *tuple,
- struct ip_conntrack_protocol *protocol,
- struct sk_buff *skb)
+struct ip_conntrack_helper *
+ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
+{
+ struct ip_conntrack_helper *helper;
+
+ /* need ip_conntrack_lock to assure that helper exists until
+ * try_module_get() is called */
+ read_lock_bh(&ip_conntrack_lock);
+
+ helper = __ip_conntrack_helper_find(tuple);
+ if (helper) {
+ /* need to increase module usage count to assure helper will
+ * not go away while the caller is e.g. busy putting a
+ * conntrack in the hash that uses the helper */
+ if (!try_module_get(helper->me))
+ helper = NULL;
+ }
+
+ read_unlock_bh(&ip_conntrack_lock);
+
+ return helper;
+}
+
+void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
+{
+ module_put(helper->me);
+}
+
+struct ip_conntrack_protocol *
+__ip_conntrack_proto_find(u_int8_t protocol)
+{
+ return ip_ct_protos[protocol];
+}
+
+/* this is guaranteed to always return a valid protocol helper, since
+ * it falls back to generic_protocol */
+struct ip_conntrack_protocol *
+ip_conntrack_proto_find_get(u_int8_t protocol)
+{
+ struct ip_conntrack_protocol *p;
+
+ preempt_disable();
+ p = __ip_conntrack_proto_find(protocol);
+ if (p) {
+ if (!try_module_get(p->me))
+ p = &ip_conntrack_generic_protocol;
+ }
+ preempt_enable();
+
+ return p;
+}
+
+void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
+{
+ module_put(p->me);
+}
+
+struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
+ struct ip_conntrack_tuple *repl)
{
struct ip_conntrack *conntrack;
- struct ip_conntrack_tuple repl_tuple;
- size_t hash;
- struct ip_conntrack_expect *exp;
if (!ip_conntrack_hash_rnd_initted) {
get_random_bytes(&ip_conntrack_hash_rnd, 4);
ip_conntrack_hash_rnd_initted = 1;
}
- hash = hash_conntrack(tuple);
-
if (ip_conntrack_max
&& atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
+ unsigned int hash = hash_conntrack(orig);
/* Try dropping from this hash chain. */
if (!early_drop(&ip_conntrack_hash[hash])) {
if (net_ratelimit())
@@ -476,11 +654,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
}
}
- if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
- DEBUGP("Can't invert tuple.\n");
- return NULL;
- }
-
conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
if (!conntrack) {
DEBUGP("Can't allocate conntrack.\n");
@@ -490,17 +663,50 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
memset(conntrack, 0, sizeof(*conntrack));
atomic_set(&conntrack->ct_general.use, 1);
conntrack->ct_general.destroy = destroy_conntrack;
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
- conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
- if (!protocol->new(conntrack, skb)) {
- kmem_cache_free(ip_conntrack_cachep, conntrack);
- return NULL;
- }
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
+ conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
/* Don't set timer yet: wait for confirmation */
init_timer(&conntrack->timeout);
conntrack->timeout.data = (unsigned long)conntrack;
conntrack->timeout.function = death_by_timeout;
+ atomic_inc(&ip_conntrack_count);
+
+ return conntrack;
+}
+
+void
+ip_conntrack_free(struct ip_conntrack *conntrack)
+{
+ atomic_dec(&ip_conntrack_count);
+ kmem_cache_free(ip_conntrack_cachep, conntrack);
+}
+
+/* Allocate a new conntrack: we return -ENOMEM if classification
+ * failed due to stress. Otherwise it really is unclassifiable */
+static struct ip_conntrack_tuple_hash *
+init_conntrack(struct ip_conntrack_tuple *tuple,
+ struct ip_conntrack_protocol *protocol,
+ struct sk_buff *skb)
+{
+ struct ip_conntrack *conntrack;
+ struct ip_conntrack_tuple repl_tuple;
+ struct ip_conntrack_expect *exp;
+
+ if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
+ DEBUGP("Can't invert tuple.\n");
+ return NULL;
+ }
+
+ conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
+ if (conntrack == NULL || IS_ERR(conntrack))
+ return (struct ip_conntrack_tuple_hash *)conntrack;
+
+ if (!protocol->new(conntrack, skb)) {
+ ip_conntrack_free(conntrack);
+ return NULL;
+ }
+
write_lock_bh(&ip_conntrack_lock);
exp = find_expectation(tuple);
@@ -521,7 +727,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
nf_conntrack_get(&conntrack->master->ct_general);
CONNTRACK_STAT_INC(expect_new);
} else {
- conntrack->helper = ip_ct_find_helper(&repl_tuple);
+ conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
CONNTRACK_STAT_INC(new);
}
@@ -529,7 +735,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
/* Overload tuple linked list to put us in unconfirmed list. */
list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
- atomic_inc(&ip_conntrack_count);
write_unlock_bh(&ip_conntrack_lock);
if (exp) {
@@ -607,7 +812,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
struct ip_conntrack *ct;
enum ip_conntrack_info ctinfo;
struct ip_conntrack_protocol *proto;
- int set_reply;
+ int set_reply = 0;
int ret;
/* Previously seen (loopback or untracked)? Ignore. */
@@ -625,9 +830,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
return NF_DROP;
}
- /* FIXME: Do this right please. --RR */
- (*pskb)->nfcache |= NFC_UNKNOWN;
-
/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
/* Ignore broadcast: no `connection'. */
@@ -643,7 +845,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
}
#endif
- proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
+ proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
/* It may be an special packet, error, unclean...
* inverse of the return code tells to the netfilter
@@ -679,8 +881,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
return -ret;
}
- if (set_reply)
- set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
+ if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+ ip_conntrack_event_cache(IPCT_STATUS, *pskb);
return ret;
}
@@ -689,7 +891,7 @@ int invert_tuplepr(struct ip_conntrack_tuple *inverse,
const struct ip_conntrack_tuple *orig)
{
return ip_ct_invert_tuple(inverse, orig,
- ip_ct_find_proto(orig->dst.protonum));
+ __ip_conntrack_proto_find(orig->dst.protonum));
}
/* Would two expected things clash? */
@@ -725,7 +927,7 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
/* choose the the oldest expectation to evict */
list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
if (expect_matches(i, exp) && del_timer(&i->timeout)) {
- unlink_expect(i);
+ ip_ct_unlink_expect(i);
write_unlock_bh(&ip_conntrack_lock);
ip_conntrack_expect_put(i);
return;
@@ -734,6 +936,9 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
write_unlock_bh(&ip_conntrack_lock);
}
+/* We don't increase the master conntrack refcount for non-fulfilled
+ * expectations. During the conntrack destruction, the expectations are
+ * always killed before the conntrack itself */
struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
{
struct ip_conntrack_expect *new;
@@ -744,17 +949,14 @@ struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
return NULL;
}
new->master = me;
- atomic_inc(&new->master->ct_general.use);
atomic_set(&new->use, 1);
return new;
}
void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
- if (atomic_dec_and_test(&exp->use)) {
- ip_conntrack_put(exp->master);
+ if (atomic_dec_and_test(&exp->use))
kmem_cache_free(ip_conntrack_expect_cachep, exp);
- }
}
static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
@@ -769,6 +971,8 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
add_timer(&exp->timeout);
+ exp->id = ++ip_conntrack_expect_next_id;
+ atomic_inc(&exp->use);
CONNTRACK_STAT_INC(expect_create);
}
@@ -780,7 +984,7 @@ static void evict_oldest_expect(struct ip_conntrack *master)
list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
if (i->master == master) {
if (del_timer(&i->timeout)) {
- unlink_expect(i);
+ ip_ct_unlink_expect(i);
ip_conntrack_expect_put(i);
}
break;
@@ -827,6 +1031,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
evict_oldest_expect(expect->master);
ip_conntrack_expect_insert(expect);
+ ip_conntrack_expect_event(IPEXP_NEW, expect);
ret = 0;
out:
write_unlock_bh(&ip_conntrack_lock);
@@ -847,7 +1052,7 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
if (!conntrack->master && conntrack->expecting == 0)
- conntrack->helper = ip_ct_find_helper(newreply);
+ conntrack->helper = __ip_conntrack_helper_find(newreply);
write_unlock_bh(&ip_conntrack_lock);
}
@@ -861,11 +1066,26 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
return 0;
}
+struct ip_conntrack_helper *
+__ip_conntrack_helper_find_byname(const char *name)
+{
+ struct ip_conntrack_helper *h;
+
+ list_for_each_entry(h, &helpers, list) {
+ if (!strcmp(h->name, name))
+ return h;
+ }
+
+ return NULL;
+}
+
static inline int unhelp(struct ip_conntrack_tuple_hash *i,
const struct ip_conntrack_helper *me)
{
- if (tuplehash_to_ctrack(i)->helper == me)
+ if (tuplehash_to_ctrack(i)->helper == me) {
+ ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
tuplehash_to_ctrack(i)->helper = NULL;
+ }
return 0;
}
@@ -881,7 +1101,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
/* Get rid of expectations */
list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
if (exp->master->helper == me && del_timer(&exp->timeout)) {
- unlink_expect(exp);
+ ip_ct_unlink_expect(exp);
ip_conntrack_expect_put(exp);
}
}
@@ -896,42 +1116,83 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
synchronize_net();
}
-static inline void ct_add_counters(struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- const struct sk_buff *skb)
-{
-#ifdef CONFIG_IP_NF_CT_ACCT
- if (skb) {
- ct->counters[CTINFO2DIR(ctinfo)].packets++;
- ct->counters[CTINFO2DIR(ctinfo)].bytes +=
- ntohs(skb->nh.iph->tot_len);
- }
-#endif
-}
-
-/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
-void ip_ct_refresh_acct(struct ip_conntrack *ct,
+/* Refresh conntrack for this many jiffies and do accounting if do_acct is nonzero */
+void __ip_ct_refresh_acct(struct ip_conntrack *ct,
enum ip_conntrack_info ctinfo,
const struct sk_buff *skb,
- unsigned long extra_jiffies)
+ unsigned long extra_jiffies,
+ int do_acct)
{
+ int event = 0;
+
IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
+ IP_NF_ASSERT(skb);
+
+ write_lock_bh(&ip_conntrack_lock);
/* If not in hash table, timer will not be active yet */
if (!is_confirmed(ct)) {
ct->timeout.expires = extra_jiffies;
- ct_add_counters(ct, ctinfo, skb);
+ event = IPCT_REFRESH;
} else {
- write_lock_bh(&ip_conntrack_lock);
/* Need del_timer for race avoidance (may already be dying). */
if (del_timer(&ct->timeout)) {
ct->timeout.expires = jiffies + extra_jiffies;
add_timer(&ct->timeout);
+ event = IPCT_REFRESH;
}
- ct_add_counters(ct, ctinfo, skb);
- write_unlock_bh(&ip_conntrack_lock);
}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+ if (do_acct) {
+ ct->counters[CTINFO2DIR(ctinfo)].packets++;
+ ct->counters[CTINFO2DIR(ctinfo)].bytes +=
+ ntohs(skb->nh.iph->tot_len);
+ if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
+ || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
+ event |= IPCT_COUNTER_FILLING;
+ }
+#endif
+
+ write_unlock_bh(&ip_conntrack_lock);
+
+ /* must be unlocked when calling event cache */
+ if (event)
+ ip_conntrack_event_cache(event, skb);
+}
+
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+/* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
+ * in ip_conntrack_core, since we don't want the protocols to autoload
+ * or depend on ctnetlink */
+int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
+ const struct ip_conntrack_tuple *tuple)
+{
+ NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
+ &tuple->src.u.tcp.port);
+ NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
+ &tuple->dst.u.tcp.port);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
+ struct ip_conntrack_tuple *t)
+{
+ if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
+ return -EINVAL;
+
+ t->src.u.tcp.port =
+ *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
+ t->dst.u.tcp.port =
+ *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
+
+ return 0;
}
+#endif
/* Returns new sk_buff, or NULL */
struct sk_buff *
@@ -943,10 +1204,8 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
skb = ip_defrag(skb, user);
local_bh_enable();
- if (skb) {
+ if (skb)
ip_send_check(skb->nh.iph);
- skb->nfcache |= NFC_ALTERED;
- }
return skb;
}
@@ -1086,26 +1345,23 @@ static int kill_all(struct ip_conntrack *i, void *data)
return 1;
}
-static void free_conntrack_hash(void)
+static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
{
- if (ip_conntrack_vmalloc)
- vfree(ip_conntrack_hash);
+ if (vmalloced)
+ vfree(hash);
else
- free_pages((unsigned long)ip_conntrack_hash,
- get_order(sizeof(struct list_head)
- * ip_conntrack_htable_size));
+ free_pages((unsigned long)hash,
+ get_order(sizeof(struct list_head) * size));
}
-/* Mishearing the voices in his head, our hero wonders how he's
- supposed to kill the mall. */
-void ip_conntrack_cleanup(void)
+void ip_conntrack_flush()
{
- ip_ct_attach = NULL;
/* This makes sure all current packets have passed through
netfilter framework. Roll on, two-stage module
delete... */
synchronize_net();
-
+
+ ip_ct_event_cache_flush();
i_see_dead_people:
ip_ct_iterate_cleanup(kill_all, NULL);
if (atomic_read(&ip_conntrack_count) != 0) {
@@ -1115,15 +1371,93 @@ void ip_conntrack_cleanup(void)
/* wait until all references to ip_conntrack_untracked are dropped */
while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
schedule();
+}
+/* Mishearing the voices in his head, our hero wonders how he's
+ supposed to kill the mall. */
+void ip_conntrack_cleanup(void)
+{
+ ip_ct_attach = NULL;
+ ip_conntrack_flush();
kmem_cache_destroy(ip_conntrack_cachep);
kmem_cache_destroy(ip_conntrack_expect_cachep);
- free_conntrack_hash();
+ free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+ ip_conntrack_htable_size);
nf_unregister_sockopt(&so_getorigdst);
}
-static int hashsize;
-module_param(hashsize, int, 0400);
+static struct list_head *alloc_hashtable(int size, int *vmalloced)
+{
+ struct list_head *hash;
+ unsigned int i;
+
+ *vmalloced = 0;
+ hash = (void*)__get_free_pages(GFP_KERNEL,
+ get_order(sizeof(struct list_head)
+ * size));
+ if (!hash) {
+ *vmalloced = 1;
+ printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
+ hash = vmalloc(sizeof(struct list_head) * size);
+ }
+
+ if (hash)
+ for (i = 0; i < size; i++)
+ INIT_LIST_HEAD(&hash[i]);
+
+ return hash;
+}
+
+int set_hashsize(const char *val, struct kernel_param *kp)
+{
+ int i, bucket, hashsize, vmalloced;
+ int old_vmalloced, old_size;
+ int rnd;
+ struct list_head *hash, *old_hash;
+ struct ip_conntrack_tuple_hash *h;
+
+ /* On boot, we can set this without any fancy locking. */
+ if (!ip_conntrack_htable_size)
+ return param_set_int(val, kp);
+
+ hashsize = simple_strtol(val, NULL, 0);
+ if (!hashsize)
+ return -EINVAL;
+
+ hash = alloc_hashtable(hashsize, &vmalloced);
+ if (!hash)
+ return -ENOMEM;
+
+ /* We have to rehash for the new table anyway, so we also can
+ * use a new random seed */
+ get_random_bytes(&rnd, 4);
+
+ write_lock_bh(&ip_conntrack_lock);
+ for (i = 0; i < ip_conntrack_htable_size; i++) {
+ while (!list_empty(&ip_conntrack_hash[i])) {
+ h = list_entry(ip_conntrack_hash[i].next,
+ struct ip_conntrack_tuple_hash, list);
+ list_del(&h->list);
+ bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
+ list_add_tail(&h->list, &hash[bucket]);
+ }
+ }
+ old_size = ip_conntrack_htable_size;
+ old_vmalloced = ip_conntrack_vmalloc;
+ old_hash = ip_conntrack_hash;
+
+ ip_conntrack_htable_size = hashsize;
+ ip_conntrack_vmalloc = vmalloced;
+ ip_conntrack_hash = hash;
+ ip_conntrack_hash_rnd = rnd;
+ write_unlock_bh(&ip_conntrack_lock);
+
+ free_conntrack_hash(old_hash, old_vmalloced, old_size);
+ return 0;
+}
+
+module_param_call(hashsize, set_hashsize, param_get_uint,
+ &ip_conntrack_htable_size, 0600);
int __init ip_conntrack_init(void)
{
@@ -1132,9 +1466,7 @@ int __init ip_conntrack_init(void)
/* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
* machine has 256 buckets. >= 1GB machines have 8192 buckets. */
- if (hashsize) {
- ip_conntrack_htable_size = hashsize;
- } else {
+ if (!ip_conntrack_htable_size) {
ip_conntrack_htable_size
= (((num_physpages << PAGE_SHIFT) / 16384)
/ sizeof(struct list_head));
@@ -1156,20 +1488,8 @@ int __init ip_conntrack_init(void)
return ret;
}
- /* AK: the hash table is twice as big than needed because it
- uses list_head. it would be much nicer to caches to use a
- single pointer list head here. */
- ip_conntrack_vmalloc = 0;
- ip_conntrack_hash
- =(void*)__get_free_pages(GFP_KERNEL,
- get_order(sizeof(struct list_head)
- *ip_conntrack_htable_size));
- if (!ip_conntrack_hash) {
- ip_conntrack_vmalloc = 1;
- printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
- ip_conntrack_hash = vmalloc(sizeof(struct list_head)
- * ip_conntrack_htable_size);
- }
+ ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
+ &ip_conntrack_vmalloc);
if (!ip_conntrack_hash) {
printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
goto err_unreg_sockopt;
@@ -1201,9 +1521,6 @@ int __init ip_conntrack_init(void)
ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
write_unlock_bh(&ip_conntrack_lock);
- for (i = 0; i < ip_conntrack_htable_size; i++)
- INIT_LIST_HEAD(&ip_conntrack_hash[i]);
-
/* For use by ipt_REJECT */
ip_ct_attach = ip_conntrack_attach;
@@ -1218,7 +1535,8 @@ int __init ip_conntrack_init(void)
err_free_conntrack_slab:
kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
- free_conntrack_hash();
+ free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+ ip_conntrack_htable_size);
err_unreg_sockopt:
nf_unregister_sockopt(&so_getorigdst);
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index 7a3b773be3f..d77d6b3f5f8 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -25,14 +25,13 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
MODULE_DESCRIPTION("ftp connection tracking helper");
/* This is slow, but it's simple. --RR */
-static char ftp_buffer[65536];
-
+static char *ftp_buffer;
static DEFINE_SPINLOCK(ip_ftp_lock);
#define MAX_PORTS 8
-static int ports[MAX_PORTS];
+static short ports[MAX_PORTS];
static int ports_c;
-module_param_array(ports, int, &ports_c, 0400);
+module_param_array(ports, short, &ports_c, 0400);
static int loose;
module_param(loose, int, 0600);
@@ -262,7 +261,8 @@ static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
}
/* We don't update if it's older than what we have. */
-static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
+static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir,
+ struct sk_buff *skb)
{
unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
@@ -276,10 +276,13 @@ static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
oldest = i;
}
- if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER)
+ if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
- else if (oldest != NUM_SEQ_TO_REMEMBER)
+ ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+ } else if (oldest != NUM_SEQ_TO_REMEMBER) {
info->seq_aft_nl[dir][oldest] = nl_seq;
+ ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+ }
}
static int help(struct sk_buff **pskb,
@@ -418,6 +421,7 @@ static int help(struct sk_buff **pskb,
{ 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
exp->expectfn = NULL;
+ exp->flags = 0;
/* Now, NAT might want to mangle the packet, and register the
* (possibly changed) expectation itself. */
@@ -439,14 +443,14 @@ out_update_nl:
/* Now if this ends in \n, update ftp info. Seq may have been
* adjusted by NAT code. */
if (ends_in_nl)
- update_nl_seq(seq, ct_ftp_info,dir);
+ update_nl_seq(seq, ct_ftp_info,dir, *pskb);
out:
spin_unlock_bh(&ip_ftp_lock);
return ret;
}
static struct ip_conntrack_helper ftp[MAX_PORTS];
-static char ftp_names[MAX_PORTS][10];
+static char ftp_names[MAX_PORTS][sizeof("ftp-65535")];
/* Not __exit: called from init() */
static void fini(void)
@@ -457,6 +461,8 @@ static void fini(void)
ports[i]);
ip_conntrack_helper_unregister(&ftp[i]);
}
+
+ kfree(ftp_buffer);
}
static int __init init(void)
@@ -464,6 +470,10 @@ static int __init init(void)
int i, ret;
char *tmpname;
+ ftp_buffer = kmalloc(65536, GFP_KERNEL);
+ if (!ftp_buffer)
+ return -ENOMEM;
+
if (ports_c == 0)
ports[ports_c++] = FTP_PORT;
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
new file mode 100644
index 00000000000..926a6684643
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
@@ -0,0 +1,806 @@
+/*
+ * ip_conntrack_helper_pptp.c - Version 3.1
+ *
+ * Connection tracking support for PPTP (Point to Point Tunneling Protocol).
+ * PPTP is a protocol for creating virtual private networks.
+ * It is a specification defined by Microsoft and some vendors
+ * working with Microsoft. PPTP is built on top of a modified
+ * version of the Internet Generic Routing Encapsulation Protocol.
+ * GRE is defined in RFC 1701 and RFC 1702. Documentation of
+ * PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * Limitations:
+ * - We blindly assume that control connections are always
+ * established in PNS->PAC direction. This is a violation
+ * of RFC 2637
+ * - We can only support one single call within each session
+ *
+ * TODO:
+ * - testing of incoming PPTP calls
+ *
+ * Changes:
+ * 2002-02-05 - Version 1.3
+ * - Call ip_conntrack_unexpect_related() from
+ * pptp_destroy_siblings() to destroy expectations in case
+ * CALL_DISCONNECT_NOTIFY or tcp fin packet was seen
+ * (Philip Craig <philipc@snapgear.com>)
+ * - Add Version information at module loadtime
+ * 2002-02-10 - Version 1.6
+ * - move to C99 style initializers
+ * - remove second expectation if first arrives
+ * 2004-10-22 - Version 2.0
+ * - merge Mandrake's 2.6.x port with recent 2.6.x API changes
+ * - fix lots of linear skb assumptions from Mandrake's port
+ * 2005-06-10 - Version 2.1
+ * - use ip_conntrack_expect_free() instead of kfree() on the
+ * expect's (which are from the slab for quite some time)
+ * 2005-06-10 - Version 3.0
+ * - port helper to post-2.6.11 API changes,
+ * funded by Oxcoda NetBox Blue (http://www.netboxblue.com/)
+ * 2005-07-30 - Version 3.1
+ * - port helper to 2.6.13 API changes
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
+#include <linux/netfilter_ipv4/ip_conntrack_pptp.h>
+
+#define IP_CT_PPTP_VERSION "3.1"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter connection tracking helper module for PPTP");
+
+static DEFINE_SPINLOCK(ip_pptp_lock);
+
+int
+(*ip_nat_pptp_hook_outbound)(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq);
+
+int
+(*ip_nat_pptp_hook_inbound)(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq);
+
+int
+(*ip_nat_pptp_hook_exp_gre)(struct ip_conntrack_expect *expect_orig,
+ struct ip_conntrack_expect *expect_reply);
+
+void
+(*ip_nat_pptp_hook_expectfn)(struct ip_conntrack *ct,
+ struct ip_conntrack_expect *exp);
+
+#if 0
+/* PptpControlMessageType names */
+const char *pptp_msg_name[] = {
+ "UNKNOWN_MESSAGE",
+ "START_SESSION_REQUEST",
+ "START_SESSION_REPLY",
+ "STOP_SESSION_REQUEST",
+ "STOP_SESSION_REPLY",
+ "ECHO_REQUEST",
+ "ECHO_REPLY",
+ "OUT_CALL_REQUEST",
+ "OUT_CALL_REPLY",
+ "IN_CALL_REQUEST",
+ "IN_CALL_REPLY",
+ "IN_CALL_CONNECT",
+ "CALL_CLEAR_REQUEST",
+ "CALL_DISCONNECT_NOTIFY",
+ "WAN_ERROR_NOTIFY",
+ "SET_LINK_INFO"
+};
+EXPORT_SYMBOL(pptp_msg_name);
+#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+#define SECS *HZ
+#define MINS * 60 SECS
+#define HOURS * 60 MINS
+
+#define PPTP_GRE_TIMEOUT (10 MINS)
+#define PPTP_GRE_STREAM_TIMEOUT (5 HOURS)
+
+static void pptp_expectfn(struct ip_conntrack *ct,
+ struct ip_conntrack_expect *exp)
+{
+ DEBUGP("increasing timeouts\n");
+
+ /* increase timeout of GRE data channel conntrack entry */
+ ct->proto.gre.timeout = PPTP_GRE_TIMEOUT;
+ ct->proto.gre.stream_timeout = PPTP_GRE_STREAM_TIMEOUT;
+
+ /* Can you see how rusty this code is, compared with the pre-2.6.11
+ * one? That's what happened to my shiny newnat of 2002 ;( -HW */
+
+ if (!ip_nat_pptp_hook_expectfn) {
+ struct ip_conntrack_tuple inv_t;
+ struct ip_conntrack_expect *exp_other;
+
+ /* obviously this tuple inversion only works until you do NAT */
+ invert_tuplepr(&inv_t, &exp->tuple);
+ DEBUGP("trying to unexpect other dir: ");
+ DUMP_TUPLE(&inv_t);
+
+ exp_other = ip_conntrack_expect_find(&inv_t);
+ if (exp_other) {
+ /* delete other expectation. */
+ DEBUGP("found\n");
+ ip_conntrack_unexpect_related(exp_other);
+ ip_conntrack_expect_put(exp_other);
+ } else {
+ DEBUGP("not found\n");
+ }
+ } else {
+ /* we need more than simple inversion */
+ ip_nat_pptp_hook_expectfn(ct, exp);
+ }
+}
+
+static int destroy_sibling_or_exp(const struct ip_conntrack_tuple *t)
+{
+ struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack_expect *exp;
+
+ DEBUGP("trying to timeout ct or exp for tuple ");
+ DUMP_TUPLE(t);
+
+ h = ip_conntrack_find_get(t, NULL);
+ if (h) {
+ struct ip_conntrack *sibling = tuplehash_to_ctrack(h);
+ DEBUGP("setting timeout of conntrack %p to 0\n", sibling);
+ sibling->proto.gre.timeout = 0;
+ sibling->proto.gre.stream_timeout = 0;
+ if (del_timer(&sibling->timeout))
+ sibling->timeout.function((unsigned long)sibling);
+ ip_conntrack_put(sibling);
+ return 1;
+ } else {
+ exp = ip_conntrack_expect_find(t);
+ if (exp) {
+ DEBUGP("unexpect_related of expect %p\n", exp);
+ ip_conntrack_unexpect_related(exp);
+ ip_conntrack_expect_put(exp);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+
+/* timeout GRE data connections */
+static void pptp_destroy_siblings(struct ip_conntrack *ct)
+{
+ struct ip_conntrack_tuple t;
+
+ /* Since ct->sibling_list has literally rusted away in 2.6.11,
+ * we now need another way to find out about our sibling
+ * conntrack and expects... -HW */
+
+ /* try original (pns->pac) tuple */
+ memcpy(&t, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(t));
+ t.dst.protonum = IPPROTO_GRE;
+ t.src.u.gre.key = htons(ct->help.ct_pptp_info.pns_call_id);
+ t.dst.u.gre.key = htons(ct->help.ct_pptp_info.pac_call_id);
+
+ if (!destroy_sibling_or_exp(&t))
+ DEBUGP("failed to timeout original pns->pac ct/exp\n");
+
+ /* try reply (pac->pns) tuple */
+ memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t));
+ t.dst.protonum = IPPROTO_GRE;
+ t.src.u.gre.key = htons(ct->help.ct_pptp_info.pac_call_id);
+ t.dst.u.gre.key = htons(ct->help.ct_pptp_info.pns_call_id);
+
+ if (!destroy_sibling_or_exp(&t))
+ DEBUGP("failed to timeout reply pac->pns ct/exp\n");
+}
+
+/* expect GRE connections (PNS->PAC and PAC->PNS direction) */
+static inline int
+exp_gre(struct ip_conntrack *master,
+ u_int32_t seq,
+ __be16 callid,
+ __be16 peer_callid)
+{
+ struct ip_conntrack_tuple inv_tuple;
+ struct ip_conntrack_tuple exp_tuples[] = {
+ /* tuple in original direction, PNS->PAC */
+ { .src = { .ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip,
+ .u = { .gre = { .key = peer_callid } }
+ },
+ .dst = { .ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip,
+ .u = { .gre = { .key = callid } },
+ .protonum = IPPROTO_GRE
+ },
+ },
+ /* tuple in reply direction, PAC->PNS */
+ { .src = { .ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
+ .u = { .gre = { .key = callid } }
+ },
+ .dst = { .ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
+ .u = { .gre = { .key = peer_callid } },
+ .protonum = IPPROTO_GRE
+ },
+ }
+ };
+ struct ip_conntrack_expect *exp_orig, *exp_reply;
+ int ret = 1;
+
+ exp_orig = ip_conntrack_expect_alloc(master);
+ if (exp_orig == NULL)
+ goto out;
+
+ exp_reply = ip_conntrack_expect_alloc(master);
+ if (exp_reply == NULL)
+ goto out_put_orig;
+
+ memcpy(&exp_orig->tuple, &exp_tuples[0], sizeof(exp_orig->tuple));
+
+ exp_orig->mask.src.ip = 0xffffffff;
+ exp_orig->mask.src.u.all = 0;
+ exp_orig->mask.dst.u.all = 0;
+ exp_orig->mask.dst.u.gre.key = htons(0xffff);
+ exp_orig->mask.dst.ip = 0xffffffff;
+ exp_orig->mask.dst.protonum = 0xff;
+
+ exp_orig->master = master;
+ exp_orig->expectfn = pptp_expectfn;
+ exp_orig->flags = 0;
+
+ exp_orig->dir = IP_CT_DIR_ORIGINAL;
+
+ /* both expectations are identical apart from tuple */
+ memcpy(exp_reply, exp_orig, sizeof(*exp_reply));
+ memcpy(&exp_reply->tuple, &exp_tuples[1], sizeof(exp_reply->tuple));
+
+ exp_reply->dir = !exp_orig->dir;
+
+ if (ip_nat_pptp_hook_exp_gre)
+ ret = ip_nat_pptp_hook_exp_gre(exp_orig, exp_reply);
+ else {
+
+ DEBUGP("calling expect_related PNS->PAC");
+ DUMP_TUPLE(&exp_orig->tuple);
+
+ if (ip_conntrack_expect_related(exp_orig) != 0) {
+ DEBUGP("cannot expect_related()\n");
+ goto out_put_both;
+ }
+
+ DEBUGP("calling expect_related PAC->PNS");
+ DUMP_TUPLE(&exp_reply->tuple);
+
+ if (ip_conntrack_expect_related(exp_reply) != 0) {
+ DEBUGP("cannot expect_related()\n");
+ goto out_unexpect_orig;
+ }
+
+ /* Add GRE keymap entries */
+ if (ip_ct_gre_keymap_add(master, &exp_reply->tuple, 0) != 0) {
+ DEBUGP("cannot keymap_add() exp\n");
+ goto out_unexpect_both;
+ }
+
+ invert_tuplepr(&inv_tuple, &exp_reply->tuple);
+ if (ip_ct_gre_keymap_add(master, &inv_tuple, 1) != 0) {
+ ip_ct_gre_keymap_destroy(master);
+ DEBUGP("cannot keymap_add() exp_inv\n");
+ goto out_unexpect_both;
+ }
+ ret = 0;
+ }
+
+out_put_both:
+ ip_conntrack_expect_put(exp_reply);
+out_put_orig:
+ ip_conntrack_expect_put(exp_orig);
+out:
+ return ret;
+
+out_unexpect_both:
+ ip_conntrack_unexpect_related(exp_reply);
+out_unexpect_orig:
+ ip_conntrack_unexpect_related(exp_orig);
+ goto out_put_both;
+}
+
+/*
+ * Track a PPTP control message seen in the reply direction of the control
+ * connection (PAC -> PNS, i.e. server to client).
+ *
+ * Updates the session/call state kept in ct->help.ct_pptp_info and, once a
+ * call has been confirmed, registers the GRE expectations via exp_gre().
+ *
+ * @pskb:        packet being inspected
+ * @tcph:        TCP header of the packet (its sequence number is used)
+ * @nexthdr_off: offset of the PPTP control header inside the skb
+ * @datalen:     payload bytes available from @nexthdr_off onwards
+ * @ct:          conntrack entry of the control connection
+ * @ctinfo:      conntrack state of this packet
+ *
+ * Returns the verdict of ip_nat_pptp_hook_inbound() when that hook is set,
+ * NF_ACCEPT otherwise (including every error path).
+ */
+static inline int
+pptp_inbound_pkt(struct sk_buff **pskb,
+		 struct tcphdr *tcph,
+		 unsigned int nexthdr_off,
+		 unsigned int datalen,
+		 struct ip_conntrack *ct,
+		 enum ip_conntrack_info ctinfo)
+{
+	struct PptpControlHeader _ctlh, *ctlh;
+	unsigned int reqlen;
+	union pptp_ctrl_union _pptpReq, *pptpReq;
+	struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info;
+	u_int16_t msg;
+	__be16 *cid, *pcid;
+	__be16 pac_cid;
+	u_int32_t seq;
+
+	ctlh = skb_header_pointer(*pskb, nexthdr_off, sizeof(_ctlh), &_ctlh);
+	if (!ctlh) {
+		DEBUGP("error during skb_header_pointer\n");
+		return NF_ACCEPT;
+	}
+	nexthdr_off += sizeof(_ctlh);
+	datalen -= sizeof(_ctlh);
+
+	/* never fetch more than the largest control message we know about */
+	reqlen = datalen;
+	if (reqlen > sizeof(*pptpReq))
+		reqlen = sizeof(*pptpReq);
+	pptpReq = skb_header_pointer(*pskb, nexthdr_off, reqlen, &_pptpReq);
+	if (!pptpReq) {
+		DEBUGP("error during skb_header_pointer\n");
+		return NF_ACCEPT;
+	}
+
+	msg = ntohs(ctlh->messageType);
+	/* msg comes straight from the wire: bound it before indexing the
+	 * name table, exactly as the default: branch below does */
+	DEBUGP("inbound control message %s\n", (msg <= PPTP_MSG_MAX)
+			? pptp_msg_name[msg]:pptp_msg_name[0]);
+
+	switch (msg) {
+	case PPTP_START_SESSION_REPLY:
+		if (reqlen < sizeof(_pptpReq.srep)) {
+			DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+			break;
+		}
+
+		/* server confirms new control session */
+		if (info->sstate < PPTP_SESSION_REQUESTED) {
+			DEBUGP("%s without START_SESS_REQUEST\n",
+				pptp_msg_name[msg]);
+			break;
+		}
+		if (pptpReq->srep.resultCode == PPTP_START_OK)
+			info->sstate = PPTP_SESSION_CONFIRMED;
+		else
+			info->sstate = PPTP_SESSION_ERROR;
+		break;
+
+	case PPTP_STOP_SESSION_REPLY:
+		if (reqlen < sizeof(_pptpReq.strep)) {
+			DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+			break;
+		}
+
+		/* server confirms end of control session */
+		if (info->sstate > PPTP_SESSION_STOPREQ) {
+			DEBUGP("%s without STOP_SESS_REQUEST\n",
+				pptp_msg_name[msg]);
+			break;
+		}
+		if (pptpReq->strep.resultCode == PPTP_STOP_OK)
+			info->sstate = PPTP_SESSION_NONE;
+		else
+			info->sstate = PPTP_SESSION_ERROR;
+		break;
+
+	case PPTP_OUT_CALL_REPLY:
+		if (reqlen < sizeof(_pptpReq.ocack)) {
+			DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+			break;
+		}
+
+		/* server accepted call, we now expect GRE frames */
+		if (info->sstate != PPTP_SESSION_CONFIRMED) {
+			DEBUGP("%s but no session\n", pptp_msg_name[msg]);
+			break;
+		}
+		if (info->cstate != PPTP_CALL_OUT_REQ &&
+		    info->cstate != PPTP_CALL_OUT_CONF) {
+			DEBUGP("%s without OUTCALL_REQ\n", pptp_msg_name[msg]);
+			break;
+		}
+		if (pptpReq->ocack.resultCode != PPTP_OUTCALL_CONNECT) {
+			info->cstate = PPTP_CALL_NONE;
+			break;
+		}
+
+		cid = &pptpReq->ocack.callID;
+		pcid = &pptpReq->ocack.peersCallID;
+
+		info->pac_call_id = ntohs(*cid);
+
+		if (htons(info->pns_call_id) != *pcid) {
+			DEBUGP("%s for unknown callid %u\n",
+				pptp_msg_name[msg], ntohs(*pcid));
+			break;
+		}
+
+		DEBUGP("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg],
+			ntohs(*cid), ntohs(*pcid));
+
+		info->cstate = PPTP_CALL_OUT_CONF;
+
+		seq = ntohl(tcph->seq) + sizeof(struct pptp_pkt_hdr)
+				       + sizeof(struct PptpControlHeader)
+				       + ((void *)pcid - (void *)pptpReq);
+
+		if (exp_gre(ct, seq, *cid, *pcid) != 0)
+			printk("ip_conntrack_pptp: error during exp_gre\n");
+		break;
+
+	case PPTP_IN_CALL_REQUEST:
+		/* NOTE(review): both the length check and the call ID below
+		 * use the icack layout although this is an IN_CALL_REQUEST;
+		 * confirm against the message definitions. */
+		if (reqlen < sizeof(_pptpReq.icack)) {
+			DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+			break;
+		}
+
+		/* server tells us about incoming call request */
+		if (info->sstate != PPTP_SESSION_CONFIRMED) {
+			DEBUGP("%s but no session\n", pptp_msg_name[msg]);
+			break;
+		}
+		pcid = &pptpReq->icack.peersCallID;
+		DEBUGP("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(*pcid));
+		info->cstate = PPTP_CALL_IN_REQ;
+		info->pac_call_id = ntohs(*pcid);
+		break;
+
+	case PPTP_IN_CALL_CONNECT:
+		if (reqlen < sizeof(_pptpReq.iccon)) {
+			DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+			break;
+		}
+
+		/* server tells us about incoming call established */
+		if (info->sstate != PPTP_SESSION_CONFIRMED) {
+			DEBUGP("%s but no session\n", pptp_msg_name[msg]);
+			break;
+		}
+		/* PPTP_CALL_* values describe the call state, so they must
+		 * be compared against cstate, not the session state */
+		if (info->cstate != PPTP_CALL_IN_REP
+		    && info->cstate != PPTP_CALL_IN_CONF) {
+			DEBUGP("%s but never sent IN_CALL_REPLY\n",
+				pptp_msg_name[msg]);
+			break;
+		}
+
+		pcid = &pptpReq->iccon.peersCallID;
+		/* pac_call_id is stored in host byte order (see the ntohs()
+		 * where it is assigned) while exp_gre() is handed wire-order
+		 * IDs in the OUT_CALL_REPLY path, so convert it back */
+		pac_cid = htons(info->pac_call_id);
+		cid = &pac_cid;
+
+		if (info->pns_call_id != ntohs(*pcid)) {
+			DEBUGP("%s for unknown CallID %u\n",
+				pptp_msg_name[msg], ntohs(*pcid));
+			break;
+		}
+
+		DEBUGP("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(*pcid));
+		info->cstate = PPTP_CALL_IN_CONF;
+
+		/* we expect a GRE connection from PAC to PNS */
+		seq = ntohl(tcph->seq) + sizeof(struct pptp_pkt_hdr)
+				       + sizeof(struct PptpControlHeader)
+				       + ((void *)pcid - (void *)pptpReq);
+
+		if (exp_gre(ct, seq, *cid, *pcid) != 0)
+			printk("ip_conntrack_pptp: error during exp_gre\n");
+
+		break;
+
+	case PPTP_CALL_DISCONNECT_NOTIFY:
+		if (reqlen < sizeof(_pptpReq.disc)) {
+			DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+			break;
+		}
+
+		/* server confirms disconnect */
+		cid = &pptpReq->disc.callID;
+		DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(*cid));
+		info->cstate = PPTP_CALL_NONE;
+
+		/* untrack this call id, unexpect GRE packets */
+		pptp_destroy_siblings(ct);
+		break;
+
+	case PPTP_WAN_ERROR_NOTIFY:
+		break;
+
+	case PPTP_ECHO_REQUEST:
+	case PPTP_ECHO_REPLY:
+		/* I don't have to explain these ;) */
+		break;
+	default:
+		DEBUGP("invalid %s (TY=%d)\n", (msg <= PPTP_MSG_MAX)
+			? pptp_msg_name[msg]:pptp_msg_name[0], msg);
+		break;
+	}
+
+
+	/* the NAT helper, if loaded, gets the final say on the verdict */
+	if (ip_nat_pptp_hook_inbound)
+		return ip_nat_pptp_hook_inbound(pskb, ct, ctinfo, ctlh,
+						pptpReq);
+
+	return NF_ACCEPT;
+
+}
+
+/*
+ * Track a PPTP control message seen in the original direction of the
+ * control connection (PNS -> PAC, i.e. client to server).
+ *
+ * Updates the session/call state and the call IDs kept in
+ * ct->help.ct_pptp_info.
+ *
+ * @pskb:        packet being inspected
+ * @tcph:        TCP header of the packet (unused here)
+ * @nexthdr_off: offset of the PPTP control header inside the skb
+ * @datalen:     payload bytes available from @nexthdr_off onwards
+ * @ct:          conntrack entry of the control connection
+ * @ctinfo:      conntrack state of this packet
+ *
+ * Returns the verdict of ip_nat_pptp_hook_outbound() when that hook is set,
+ * NF_ACCEPT otherwise (including every error path).
+ */
+static inline int
+pptp_outbound_pkt(struct sk_buff **pskb,
+		  struct tcphdr *tcph,
+		  unsigned int nexthdr_off,
+		  unsigned int datalen,
+		  struct ip_conntrack *ct,
+		  enum ip_conntrack_info ctinfo)
+{
+	struct PptpControlHeader _ctlh, *ctlh;
+	unsigned int reqlen;
+	union pptp_ctrl_union _pptpReq, *pptpReq;
+	struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info;
+	u_int16_t msg;
+	__be16 *cid, *pcid;
+
+	ctlh = skb_header_pointer(*pskb, nexthdr_off, sizeof(_ctlh), &_ctlh);
+	if (!ctlh)
+		return NF_ACCEPT;
+	nexthdr_off += sizeof(_ctlh);
+	datalen -= sizeof(_ctlh);
+
+	/* never fetch more than the largest control message we know about */
+	reqlen = datalen;
+	if (reqlen > sizeof(*pptpReq))
+		reqlen = sizeof(*pptpReq);
+	pptpReq = skb_header_pointer(*pskb, nexthdr_off, reqlen, &_pptpReq);
+	if (!pptpReq)
+		return NF_ACCEPT;
+
+	msg = ntohs(ctlh->messageType);
+	/* msg comes straight from the wire: bound it before indexing the
+	 * name table, exactly as the default: branch below does */
+	DEBUGP("outbound control message %s\n", (msg <= PPTP_MSG_MAX)?
+		pptp_msg_name[msg]:pptp_msg_name[0]);
+
+	switch (msg) {
+	case PPTP_START_SESSION_REQUEST:
+		/* client requests for new control session */
+		if (info->sstate != PPTP_SESSION_NONE) {
+			DEBUGP("%s but we already have one",
+				pptp_msg_name[msg]);
+		}
+		info->sstate = PPTP_SESSION_REQUESTED;
+		break;
+	case PPTP_STOP_SESSION_REQUEST:
+		/* client requests end of control session */
+		info->sstate = PPTP_SESSION_STOPREQ;
+		break;
+
+	case PPTP_OUT_CALL_REQUEST:
+		if (reqlen < sizeof(_pptpReq.ocreq)) {
+			DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+			/* the request is truncated: reading ocreq.callID
+			 * below would go past the bytes we validated */
+			break;
+		}
+
+		/* client initiating connection to server */
+		if (info->sstate != PPTP_SESSION_CONFIRMED) {
+			DEBUGP("%s but no session\n",
+				pptp_msg_name[msg]);
+			break;
+		}
+		info->cstate = PPTP_CALL_OUT_REQ;
+		/* track PNS call id */
+		cid = &pptpReq->ocreq.callID;
+		DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(*cid));
+		info->pns_call_id = ntohs(*cid);
+		break;
+	case PPTP_IN_CALL_REPLY:
+		if (reqlen < sizeof(_pptpReq.icack)) {
+			DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
+			break;
+		}
+
+		/* client answers incoming call */
+		if (info->cstate != PPTP_CALL_IN_REQ
+		    && info->cstate != PPTP_CALL_IN_REP) {
+			DEBUGP("%s without incall_req\n",
+				pptp_msg_name[msg]);
+			break;
+		}
+		if (pptpReq->icack.resultCode != PPTP_INCALL_ACCEPT) {
+			info->cstate = PPTP_CALL_NONE;
+			break;
+		}
+		pcid = &pptpReq->icack.peersCallID;
+		if (info->pac_call_id != ntohs(*pcid)) {
+			DEBUGP("%s for unknown call %u\n",
+				pptp_msg_name[msg], ntohs(*pcid));
+			break;
+		}
+		DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(*pcid));
+		/* part two of the three-way handshake */
+		info->cstate = PPTP_CALL_IN_REP;
+		info->pns_call_id = ntohs(pptpReq->icack.callID);
+		break;
+
+	case PPTP_CALL_CLEAR_REQUEST:
+		/* client requests hangup of call */
+		if (info->sstate != PPTP_SESSION_CONFIRMED) {
+			DEBUGP("CLEAR_CALL but no session\n");
+			break;
+		}
+		/* FUTURE: iterate over all calls and check if
+		 * call ID is valid.  We don't do this without newnat,
+		 * because we only know about last call */
+		info->cstate = PPTP_CALL_CLEAR_REQ;
+		break;
+	case PPTP_SET_LINK_INFO:
+		break;
+	case PPTP_ECHO_REQUEST:
+	case PPTP_ECHO_REPLY:
+		/* I don't have to explain these ;) */
+		break;
+	default:
+		DEBUGP("invalid %s (TY=%d)\n", (msg <= PPTP_MSG_MAX)?
+			pptp_msg_name[msg]:pptp_msg_name[0], msg);
+		/* unknown: no need to create GRE masq table entry */
+		break;
+	}
+
+	/* the NAT helper, if loaded, gets the final say on the verdict */
+	if (ip_nat_pptp_hook_outbound)
+		return ip_nat_pptp_hook_outbound(pskb, ct, ctinfo, ctlh,
+						 pptpReq);
+
+	return NF_ACCEPT;
+}
+
+
+/*
+ * Conntrack helper entry point for the PPTP control connection: track
+ * caller id inside control connection, call expect_related.
+ *
+ * Locates the TCP and PPTP headers in *pskb, ignores anything that is not a
+ * PPTP control packet, and hands the control message to
+ * pptp_outbound_pkt() (original direction) or pptp_inbound_pkt() (reply
+ * direction) while holding ip_pptp_lock.
+ *
+ * Returns NF_ACCEPT for every packet it does not dispatch, otherwise the
+ * value returned by the direction-specific handler.
+ */
+static int
+conntrack_pptp_help(struct sk_buff **pskb,
+ struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
+
+{
+ struct pptp_pkt_hdr _pptph, *pptph;
+ struct tcphdr _tcph, *tcph;
+ /* length of everything after the IP header */
+ u_int32_t tcplen = (*pskb)->len - (*pskb)->nh.iph->ihl * 4;
+ u_int32_t datalen;
+ int dir = CTINFO2DIR(ctinfo);
+ struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info;
+ unsigned int nexthdr_off;
+
+ /* previous states, only reported by the DEBUGP below */
+ int oldsstate, oldcstate;
+ int ret;
+
+ /* don't do any tracking before tcp handshake complete */
+ if (ctinfo != IP_CT_ESTABLISHED
+ && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
+ DEBUGP("ctinfo = %u, skipping\n", ctinfo);
+ return NF_ACCEPT;
+ }
+
+ nexthdr_off = (*pskb)->nh.iph->ihl*4;
+ tcph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_tcph), &_tcph);
+ BUG_ON(!tcph);
+ /* skip the TCP header including options */
+ nexthdr_off += tcph->doff * 4;
+ datalen = tcplen - tcph->doff * 4;
+
+ /* NOTE(review): this state change happens before ip_pptp_lock is taken */
+ if (tcph->fin || tcph->rst) {
+ DEBUGP("RST/FIN received, timeouting GRE\n");
+ /* can't do this after real newnat */
+ info->cstate = PPTP_CALL_NONE;
+
+ /* untrack this call id, unexpect GRE packets */
+ pptp_destroy_siblings(ct);
+ }
+
+ pptph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_pptph), &_pptph);
+ if (!pptph) {
+ DEBUGP("no full PPTP header, can't track\n");
+ return NF_ACCEPT;
+ }
+ nexthdr_off += sizeof(_pptph);
+ datalen -= sizeof(_pptph);
+
+ /* if it's not a control message we can't do anything with it */
+ if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL ||
+ ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) {
+ DEBUGP("not a control packet\n");
+ return NF_ACCEPT;
+ }
+
+ oldsstate = info->sstate;
+ oldcstate = info->cstate;
+
+ spin_lock_bh(&ip_pptp_lock);
+
+ /* FIXME: We just blindly assume that the control connection is always
+ * established from PNS->PAC. However, RFC makes no guarantee */
+ if (dir == IP_CT_DIR_ORIGINAL)
+ /* client -> server (PNS -> PAC) */
+ ret = pptp_outbound_pkt(pskb, tcph, nexthdr_off, datalen, ct,
+ ctinfo);
+ else
+ /* server -> client (PAC -> PNS) */
+ ret = pptp_inbound_pkt(pskb, tcph, nexthdr_off, datalen, ct,
+ ctinfo);
+ DEBUGP("sstate: %d->%d, cstate: %d->%d\n",
+ oldsstate, info->sstate, oldcstate, info->cstate);
+ spin_unlock_bh(&ip_pptp_lock);
+
+ return ret;
+}
+
+/*
+ * control protocol helper
+ *
+ * The tuple/mask pair selects TCP connections by the PPTP_CONTROL_PORT
+ * value placed in the tuple's src port field; both IP addresses are left
+ * as wildcards (mask 0).
+ */
+static struct ip_conntrack_helper pptp = {
+ .list = { NULL, NULL },
+ .name = "pptp",
+ .me = THIS_MODULE,
+ /* exp_gre() registers one expectation per direction */
+ .max_expected = 2,
+ .timeout = 5 * 60,
+ .tuple = { .src = { .ip = 0,
+ .u = { .tcp = { .port =
+ __constant_htons(PPTP_CONTROL_PORT) } }
+ },
+ .dst = { .ip = 0,
+ .u = { .all = 0 },
+ .protonum = IPPROTO_TCP
+ }
+ },
+ .mask = { .src = { .ip = 0,
+ .u = { .tcp = { .port = __constant_htons(0xffff) } }
+ },
+ .dst = { .ip = 0,
+ .u = { .all = 0 },
+ .protonum = 0xff
+ }
+ },
+ .help = conntrack_pptp_help
+};
+
+extern void __exit ip_ct_proto_gre_fini(void);
+extern int __init ip_ct_proto_gre_init(void);
+
+/*
+ * Module load: bring up GRE protocol tracking first, then register the
+ * PPTP control-connection helper.  If the helper cannot be registered the
+ * GRE tracker is torn down again and the error is returned.
+ */
+static int __init init(void)
+{
+	int err = ip_ct_proto_gre_init();
+
+	if (err < 0)
+		return err;
+
+	DEBUGP(" registering helper\n");
+	err = ip_conntrack_helper_register(&pptp);
+	if (err == 0) {
+		printk("ip_conntrack_pptp version %s loaded\n",
+		       IP_CT_PPTP_VERSION);
+		return 0;
+	}
+
+	printk(KERN_ERR "Unable to register conntrack application "
+			"helper for pptp: %d\n", err);
+	ip_ct_proto_gre_fini();
+	return err;
+}
+
+/* Module unload: undo init() in reverse order (helper first, then GRE). */
+static void __exit fini(void)
+{
+ ip_conntrack_helper_unregister(&pptp);
+ ip_ct_proto_gre_fini();
+ printk("ip_conntrack_pptp version %s unloaded\n", IP_CT_PPTP_VERSION);
+}
+
+module_init(init);
+module_exit(fini);
+
+EXPORT_SYMBOL(ip_nat_pptp_hook_outbound);
+EXPORT_SYMBOL(ip_nat_pptp_hook_inbound);
+EXPORT_SYMBOL(ip_nat_pptp_hook_exp_gre);
+EXPORT_SYMBOL(ip_nat_pptp_hook_expectfn);
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index 4a28f297d50..15457415a4f 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -34,12 +34,12 @@
#include <linux/moduleparam.h>
#define MAX_PORTS 8
-static int ports[MAX_PORTS];
+static short ports[MAX_PORTS];
static int ports_c;
static int max_dcc_channels = 8;
static unsigned int dcc_timeout = 300;
/* This is slow, but it's simple. --RR */
-static char irc_buffer[65536];
+static char *irc_buffer;
static DEFINE_SPINLOCK(irc_buffer_lock);
unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
@@ -52,7 +52,7 @@ EXPORT_SYMBOL_GPL(ip_nat_irc_hook);
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
MODULE_LICENSE("GPL");
-module_param_array(ports, int, &ports_c, 0400);
+module_param_array(ports, short, &ports_c, 0400);
MODULE_PARM_DESC(ports, "port numbers of IRC servers");
module_param(max_dcc_channels, int, 0400);
MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session");
@@ -221,6 +221,7 @@ static int help(struct sk_buff **pskb,
{ { 0, { 0 } },
{ 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
exp->expectfn = NULL;
+ exp->flags = 0;
if (ip_nat_irc_hook)
ret = ip_nat_irc_hook(pskb, ctinfo,
addr_beg_p - ib_ptr,
@@ -239,7 +240,7 @@ static int help(struct sk_buff **pskb,
}
static struct ip_conntrack_helper irc_helpers[MAX_PORTS];
-static char irc_names[MAX_PORTS][10];
+static char irc_names[MAX_PORTS][sizeof("irc-65535")];
static void fini(void);
@@ -257,6 +258,10 @@ static int __init init(void)
printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n");
return -EBUSY;
}
+
+ irc_buffer = kmalloc(65536, GFP_KERNEL);
+ if (!irc_buffer)
+ return -ENOMEM;
/* If no port given, default to standard irc port */
if (ports_c == 0)
@@ -304,6 +309,7 @@ static void fini(void)
ports[i]);
ip_conntrack_helper_unregister(&irc_helpers[i]);
}
+ kfree(irc_buffer);
}
module_init(init);
diff --git a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c
new file mode 100644
index 00000000000..186646eb249
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c
@@ -0,0 +1,142 @@
+/*
+ * NetBIOS name service broadcast connection tracking helper
+ *
+ * (c) 2005 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+/*
+ * This helper tracks locally originating NetBIOS name service
+ * requests by issuing permanent expectations (valid until
+ * timing out) matching all reply connections from the
+ * destination network. The only NetBIOS specific thing is
+ * actually the port number.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+
+#define NMBD_PORT 137
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper");
+MODULE_LICENSE("GPL");
+
+/* Seconds; copied into helper.timeout at load time and used to refresh the
+ * master conntrack in help().  The parameter type must match the variable's
+ * unsigned type. */
+static unsigned int timeout = 3;
+module_param(timeout, uint, 0600);
+MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
+
+/*
+ * Helper callback for NetBIOS name service packets.
+ *
+ * Only acts on locally generated packets, in the original direction, that
+ * are routed as broadcasts.  For those it looks up the netmask of the
+ * interface address whose broadcast address equals the packet's
+ * destination, and registers a permanent expectation for replies coming
+ * from any source in that network, port NMBD_PORT.  It then refreshes the
+ * master conntrack with the module's timeout.
+ *
+ * Always returns NF_ACCEPT.
+ */
+static int help(struct sk_buff **pskb,
+		struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
+{
+	struct ip_conntrack_expect *exp;
+	struct iphdr *iph = (*pskb)->nh.iph;
+	struct rtable *rt = (struct rtable *)(*pskb)->dst;
+	struct in_device *in_dev;
+	u_int32_t mask = 0;
+
+	/* we're only interested in locally generated packets */
+	if ((*pskb)->sk == NULL)
+		goto out;
+	if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
+		goto out;
+	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+		goto out;
+
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(rt->u.dst.dev);
+	if (in_dev != NULL) {
+		for_primary_ifa(in_dev) {
+			if (ifa->ifa_broadcast == iph->daddr) {
+				mask = ifa->ifa_mask;
+				break;
+			}
+		} endfor_ifa(in_dev);
+	}
+	rcu_read_unlock();
+
+	/* no matching interface address: nothing to expect */
+	if (mask == 0)
+		goto out;
+
+	exp = ip_conntrack_expect_alloc(ct);
+	if (exp == NULL)
+		goto out;
+
+	exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+	/* host -> network order, matching the helper tuple's
+	 * __constant_htons(NMBD_PORT) */
+	exp->tuple.src.u.udp.port = htons(NMBD_PORT);
+
+	/* any source inside the broadcast's network may answer */
+	exp->mask.src.ip = mask;
+	exp->mask.src.u.udp.port = 0xFFFF;
+	exp->mask.dst.ip = 0xFFFFFFFF;
+	exp->mask.dst.u.udp.port = 0xFFFF;
+	exp->mask.dst.protonum = 0xFF;
+
+	exp->expectfn = NULL;
+	exp->flags = IP_CT_EXPECT_PERMANENT;
+
+	ip_conntrack_expect_related(exp);
+	ip_conntrack_expect_put(exp);
+
+	ip_ct_refresh(ct, *pskb, timeout * HZ);
+out:
+	return NF_ACCEPT;
+}
+
+/*
+ * Helper registration: selects UDP connections by the NMBD_PORT value
+ * placed in the tuple's src port field.  .timeout is filled in by init()
+ * from the module parameter.
+ */
+static struct ip_conntrack_helper helper = {
+ .name = "netbios-ns",
+ .tuple = {
+ .src = {
+ .u = {
+ .udp = {
+ .port = __constant_htons(NMBD_PORT),
+ }
+ }
+ },
+ .dst = {
+ .protonum = IPPROTO_UDP,
+ },
+ },
+ .mask = {
+ .src = {
+ .u = {
+ .udp = {
+ .port = 0xFFFF,
+ }
+ }
+ },
+ .dst = {
+ .protonum = 0xFF,
+ },
+ },
+ .max_expected = 1,
+ .me = THIS_MODULE,
+ .help = help,
+};
+
+/* Module load: apply the timeout parameter, then register the helper.
+ * Returns whatever ip_conntrack_helper_register() returns. */
+static int __init init(void)
+{
+ helper.timeout = timeout;
+ return ip_conntrack_helper_register(&helper);
+}
+
+/* Module unload: unregister the helper registered by init(). */
+static void __exit fini(void)
+{
+ ip_conntrack_helper_unregister(&helper);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
new file mode 100644
index 00000000000..166e6069f12
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -0,0 +1,1622 @@
+/* Connection tracking via netlink socket. Allows for user space
+ * protocol helpers and general trouble making from userspace.
+ *
+ * (C) 2001 by Jay Schulist <jschlst@samba.org>
+ * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2003 by Patrick Mchardy <kaber@trash.net>
+ * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * I've reworked this stuff to use attributes instead of conntrack
+ * structures. 5.44 am. I need more tea. --pablo 05/07/11.
+ *
+ * Initial connection tracking via netlink development funded and
+ * generally made possible by Network Robots, Inc. (www.networkrobots.com)
+ *
+ * Further development of this code funded by Astaro AG (http://www.astaro.com)
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License, incorporated herein by reference.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/rtnetlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+MODULE_LICENSE("GPL");
+
+static char __initdata version[] = "0.90";
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+
+/*
+ * Append the protocol part of @tuple to @skb: the protocol number as
+ * CTA_PROTO_NUM (one byte), followed by whatever the protocol's
+ * tuple_to_nfattr() callback adds, if it has one.
+ *
+ * Returns 0 on success (or the callback's return value), -1 if the
+ * attribute could not be added.
+ */
+static inline int
+ctnetlink_dump_tuples_proto(struct sk_buff *skb,
+			    const struct ip_conntrack_tuple *tuple)
+{
+	struct ip_conntrack_protocol *proto;
+	int ret = 0;
+
+	NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum);
+
+	proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
+	if (proto) {
+		if (proto->tuple_to_nfattr)
+			ret = proto->tuple_to_nfattr(skb, tuple);
+		/* drop the reference taken by ..._find_get() above */
+		ip_conntrack_proto_put(proto);
+	}
+
+	return ret;
+
+nfattr_failure:
+	return -1;
+}
+
+/*
+ * Append @tuple to @skb as two nested attributes: CTA_TUPLE_IP holding the
+ * IPv4 source/destination addresses and CTA_TUPLE_PROTO holding the
+ * protocol-specific part.
+ *
+ * Returns 0 on success, a negative value if an attribute could not be
+ * added or the protocol part failed to dump.
+ */
+static inline int
+ctnetlink_dump_tuples(struct sk_buff *skb,
+		      const struct ip_conntrack_tuple *tuple)
+{
+	struct nfattr *nest_parms;
+	int ret;
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_IP);
+	NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.ip);
+	NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), &tuple->dst.ip);
+	NFA_NEST_END(skb, nest_parms);
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO);
+	/* propagate a failure instead of reporting a truncated tuple as ok */
+	ret = ctnetlink_dump_tuples_proto(skb, tuple);
+	NFA_NEST_END(skb, nest_parms);
+
+	return ret;
+
+nfattr_failure:
+	return -1;
+}
+
+/* Append ct->status to @skb as a 32-bit CTA_STATUS attribute in network
+ * byte order.  Returns 0 on success, -1 via the nfattr_failure label. */
+static inline int
+ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ u_int32_t status = htonl((u_int32_t) ct->status);
+ NFA_PUT(skb, CTA_STATUS, sizeof(status), &status);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+/*
+ * Append the time left until @ct expires, in whole seconds and network
+ * byte order, as CTA_TIMEOUT.  An already expired timer is reported as 0.
+ * Returns 0 on success, -1 if the attribute could not be added.
+ */
+static inline int
+ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	long remaining = ct->timeout.expires - jiffies;
+	u_int32_t timeout = 0;
+
+	if (remaining >= 0)
+		timeout = htonl(remaining / HZ);
+
+	NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+/*
+ * Append the protocol-private state of @ct to @skb inside a nested
+ * CTA_PROTOINFO attribute, using the protocol's to_nfattr() callback.
+ *
+ * Returns 0 when the protocol has nothing to dump, the callback's return
+ * value otherwise, or -1 if the nest could not be opened.  The protocol
+ * reference obtained here is released on every path.
+ */
+static inline int
+ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	struct ip_conntrack_protocol *proto;
+	struct nfattr *nest_proto;
+	int ret;
+
+	proto = ip_conntrack_proto_find_get(
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
+	if (!proto)
+		return 0;
+	if (!proto->to_nfattr) {
+		ip_conntrack_proto_put(proto);
+		return 0;
+	}
+
+	nest_proto = NFA_NEST(skb, CTA_PROTOINFO);
+
+	ret = proto->to_nfattr(skb, nest_proto, ct);
+
+	ip_conntrack_proto_put(proto);
+
+	NFA_NEST_END(skb, nest_proto);
+
+	return ret;
+
+nfattr_failure:
+	/* only reachable while the reference is still held */
+	ip_conntrack_proto_put(proto);
+	return -1;
+}
+
+/*
+ * Append helper information for @ct inside a nested CTA_HELP attribute:
+ * the helper's name as a NUL-terminated string (CTA_HELP_NAME), followed
+ * by whatever the helper's optional to_nfattr() callback adds.
+ *
+ * Returns 0 on success or when @ct has no helper, -1 if an attribute could
+ * not be added.
+ */
+static inline int
+ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	struct nfattr *nest_helper;
+
+	if (!ct->helper)
+		return 0;
+
+	nest_helper = NFA_NEST(skb, CTA_HELP);
+	/* copy the string the name pointer refers to, not the bytes of the
+	 * pointer itself (and whatever follows it in the helper struct) */
+	NFA_PUT(skb, CTA_HELP_NAME, strlen(ct->helper->name) + 1,
+		ct->helper->name);
+
+	if (ct->helper->to_nfattr)
+		ct->helper->to_nfattr(skb, ct);
+
+	NFA_NEST_END(skb, nest_helper);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+/*
+ * Append the packet and byte counters of @ct for direction @dir as 32-bit
+ * network-order values, nested in CTA_COUNTERS_ORIG or CTA_COUNTERS_REPLY.
+ * Returns 0 on success, -1 if an attribute could not be added.
+ */
+static inline int
+ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct,
+			enum ip_conntrack_dir dir)
+{
+	enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+	struct nfattr *nest_count = NFA_NEST(skb, type);
+	/* must be exactly as wide as the attribute: copying the first four
+	 * bytes of a 64-bit temporary yields zero on big-endian machines */
+	u_int32_t tmp;
+
+	tmp = htonl(ct->counters[dir].packets);
+	NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp);
+
+	tmp = htonl(ct->counters[dir].bytes);
+	NFA_PUT(skb, CTA_COUNTERS32_BYTES, sizeof(u_int32_t), &tmp);
+
+	NFA_NEST_END(skb, nest_count);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+#else
+#define ctnetlink_dump_counters(a, b, c) (0)
+#endif
+
+#ifdef CONFIG_IP_NF_CONNTRACK_MARK
+/* Append ct->mark to @skb as a 32-bit network-order CTA_MARK attribute.
+ * Returns 0 on success, -1 via the nfattr_failure label. */
+static inline int
+ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ u_int32_t mark = htonl(ct->mark);
+
+ NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+#else
+/* without conntrack marks there is nothing to dump; always succeeds */
+#define ctnetlink_dump_mark(a, b) (0)
+#endif
+
+/* Append ct->id to @skb as a 32-bit network-order CTA_ID attribute.
+ * Returns 0 on success, -1 via the nfattr_failure label. */
+static inline int
+ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ u_int32_t id = htonl(ct->id);
+ NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+/* Append the current reference count of @ct (ct_general.use) to @skb as a
+ * 32-bit network-order CTA_USE attribute.
+ * Returns 0 on success, -1 via the nfattr_failure label. */
+static inline int
+ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ unsigned int use = htonl(atomic_read(&ct->ct_general.use));
+
+ NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple)
+
+/*
+ * Build one complete ctnetlink message describing @ct at the tail of @skb.
+ *
+ * @pid, @seq: netlink destination pid and sequence number for the header
+ * @event:     IPCTNL_MSG_* type; the ctnetlink subsystem id is OR-ed in here
+ * @nowait:    when set together with a non-zero @pid the message is flagged
+ *             NLM_F_MULTI
+ *
+ * Returns skb->len on success.  On failure everything added by this call
+ * is trimmed off @skb again and -1 is returned.
+ */
+static int
+ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+ int event, int nowait,
+ const struct ip_conntrack *ct)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ struct nfattr *nest_parms;
+ unsigned char *b;
+
+ /* remember where this message starts so it can be sized or undone */
+ b = skb->tail;
+
+ event |= NFNL_SUBSYS_CTNETLINK << 8;
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
+ nfmsg = NLMSG_DATA(nlh);
+
+ nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
+ nfmsg->nfgen_family = AF_INET;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
+ if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+ goto nfattr_failure;
+ NFA_NEST_END(skb, nest_parms);
+
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
+ if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
+ goto nfattr_failure;
+ NFA_NEST_END(skb, nest_parms);
+
+ if (ctnetlink_dump_status(skb, ct) < 0 ||
+ ctnetlink_dump_timeout(skb, ct) < 0 ||
+ ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
+ ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+ ctnetlink_dump_protoinfo(skb, ct) < 0 ||
+ ctnetlink_dump_helpinfo(skb, ct) < 0 ||
+ ctnetlink_dump_mark(skb, ct) < 0 ||
+ ctnetlink_dump_id(skb, ct) < 0 ||
+ ctnetlink_dump_use(skb, ct) < 0)
+ goto nfattr_failure;
+
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+ /* nlmsg_failure is not referenced directly in this function;
+ * presumably NLMSG_PUT jumps to it -- confirm against the macro */
+nlmsg_failure:
+nfattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+/*
+ * Notifier callback: turn conntrack events into a ctnetlink message and
+ * send it to the matching nfnetlink group.
+ *
+ * @events is a bitmask of IPCT_* flags and @ptr the affected conntrack.
+ * Destroy events map to IPCTNL_MSG_CT_DELETE, new/related and update
+ * events to IPCTNL_MSG_CT_NEW (for new/related, with
+ * NLM_F_CREATE|NLM_F_EXCL and every attribute dumped).  Events outside
+ * those sets are ignored.
+ *
+ * Always returns NOTIFY_DONE; on any failure the message is dropped.
+ */
+static int ctnetlink_conntrack_event(struct notifier_block *this,
+ unsigned long events, void *ptr)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ struct nfattr *nest_parms;
+ struct ip_conntrack *ct = (struct ip_conntrack *)ptr;
+ struct sk_buff *skb;
+ unsigned int type;
+ unsigned char *b;
+ unsigned int flags = 0, group;
+
+ /* ignore our fake conntrack entry */
+ if (ct == &ip_conntrack_untracked)
+ return NOTIFY_DONE;
+
+ if (events & IPCT_DESTROY) {
+ type = IPCTNL_MSG_CT_DELETE;
+ group = NFNLGRP_CONNTRACK_DESTROY;
+ goto alloc_skb;
+ }
+ if (events & (IPCT_NEW | IPCT_RELATED)) {
+ type = IPCTNL_MSG_CT_NEW;
+ flags = NLM_F_CREATE|NLM_F_EXCL;
+ /* dump everything */
+ events = ~0UL;
+ group = NFNLGRP_CONNTRACK_NEW;
+ goto alloc_skb;
+ }
+ if (events & (IPCT_STATUS |
+ IPCT_PROTOINFO |
+ IPCT_HELPER |
+ IPCT_HELPINFO |
+ IPCT_NATINFO)) {
+ type = IPCTNL_MSG_CT_NEW;
+ group = NFNLGRP_CONNTRACK_UPDATE;
+ goto alloc_skb;
+ }
+
+ return NOTIFY_DONE;
+
+alloc_skb:
+ /* FIXME: Check if there are any listeners before, don't hurt performance */
+
+ /* GFP_ATOMIC allocation; a failed allocation just drops the event */
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+ if (!skb)
+ return NOTIFY_DONE;
+
+ b = skb->tail;
+
+ type |= NFNL_SUBSYS_CTNETLINK << 8;
+ nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
+ nfmsg = NLMSG_DATA(nlh);
+
+ nlh->nlmsg_flags = flags;
+ nfmsg->nfgen_family = AF_INET;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ /* both tuples are always included so the receiver can identify ct */
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
+ if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+ goto nfattr_failure;
+ NFA_NEST_END(skb, nest_parms);
+
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
+ if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
+ goto nfattr_failure;
+ NFA_NEST_END(skb, nest_parms);
+
+ /* NAT stuff is now a status flag */
+ if ((events & IPCT_STATUS || events & IPCT_NATINFO)
+ && ctnetlink_dump_status(skb, ct) < 0)
+ goto nfattr_failure;
+ if (events & IPCT_REFRESH
+ && ctnetlink_dump_timeout(skb, ct) < 0)
+ goto nfattr_failure;
+ if (events & IPCT_PROTOINFO
+ && ctnetlink_dump_protoinfo(skb, ct) < 0)
+ goto nfattr_failure;
+ if (events & IPCT_HELPINFO
+ && ctnetlink_dump_helpinfo(skb, ct) < 0)
+ goto nfattr_failure;
+
+ /* counters are attached regardless of which event fired */
+ if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
+ ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
+ goto nfattr_failure;
+
+ nlh->nlmsg_len = skb->tail - b;
+ nfnetlink_send(skb, 0, group, 0);
+ return NOTIFY_DONE;
+
+nlmsg_failure:
+nfattr_failure:
+ kfree_skb(skb);
+ return NOTIFY_DONE;
+}
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+
+/* Netlink dump completion callback: nothing to release, always returns 0. */
+static int ctnetlink_done(struct netlink_callback *cb)
+{
+ DEBUGP("entered %s\n", __FUNCTION__);
+ return 0;
+}
+
+/*
+ * Netlink dump callback: write as many conntrack entries as fit into @skb.
+ *
+ * Progress is kept in @cb so the dump can resume on the next call:
+ * cb->args[0] is the hash bucket being walked and cb->args[1] (accessed
+ * through @id) the id of the last entry dumped from that bucket.  Each
+ * bucket is walked back to front under ip_conntrack_lock (read side);
+ * reply-direction hash entries and entries whose id is not above *id are
+ * skipped.
+ *
+ * Returns skb->len.
+ */
+static int
+ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ip_conntrack *ct = NULL;
+ struct ip_conntrack_tuple_hash *h;
+ struct list_head *i;
+ u_int32_t *id = (u_int32_t *) &cb->args[1];
+
+ DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__,
+ cb->args[0], *id);
+
+ read_lock_bh(&ip_conntrack_lock);
+ /* moving on to the next bucket resets the per-bucket id cursor */
+ for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
+ list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
+ h = (struct ip_conntrack_tuple_hash *) i;
+ if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ continue;
+ ct = tuplehash_to_ctrack(h);
+ if (ct->id <= *id)
+ continue;
+ /* stop when the entry cannot be added; *id still names the last one dumped */
+ if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_CT_NEW,
+ 1, ct) < 0)
+ goto out;
+ *id = ct->id;
+ }
+ }
+out:
+ read_unlock_bh(&ip_conntrack_lock);
+
+ DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
+
+ return skb->len;
+}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+/*
+ * Netlink dump callback that additionally zeroes the counters of every
+ * entry it dumps.  Works like ctnetlink_dump_table() -- cb->args[0] is the
+ * current hash bucket, cb->args[1] the id of the last entry dumped from
+ * it -- but takes ip_conntrack_lock for writing because it modifies the
+ * entries.
+ *
+ * Returns skb->len.
+ */
+static int
+ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct ip_conntrack *ct = NULL;
+	struct ip_conntrack_tuple_hash *h;
+	struct list_head *i;
+	u_int32_t *id = (u_int32_t *) &cb->args[1];
+
+	/* cb->args[0] is printed with %lu here, as in the exit message */
+	DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__,
+			cb->args[0], *id);
+
+	write_lock_bh(&ip_conntrack_lock);
+	/* moving on to the next bucket resets the per-bucket id cursor */
+	for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
+		list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
+			h = (struct ip_conntrack_tuple_hash *) i;
+			if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+				continue;
+			ct = tuplehash_to_ctrack(h);
+			if (ct->id <= *id)
+				continue;
+			if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
+						cb->nlh->nlmsg_seq,
+						IPCTNL_MSG_CT_NEW,
+						1, ct) < 0)
+				goto out;
+			*id = ct->id;
+
+			/* reset only after the entry made it into the skb */
+			memset(&ct->counters, 0, sizeof(ct->counters));
+		}
+	}
+out:
+	write_unlock_bh(&ip_conntrack_lock);
+
+	DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
+
+	return skb->len;
+}
+#endif
+
+static const int cta_min_ip[CTA_IP_MAX] = {
+ [CTA_IP_V4_SRC-1] = sizeof(u_int32_t),
+ [CTA_IP_V4_DST-1] = sizeof(u_int32_t),
+};
+
+/*
+ * Parse the nested CTA_TUPLE_IP attribute @attr into the address fields of
+ * @tuple.  Both CTA_IP_V4_SRC and CTA_IP_V4_DST must be present and at
+ * least as large as cta_min_ip requires.
+ *
+ * Returns 0 on success, -EINVAL for a missing or undersized attribute and
+ * -1 if the nested attribute cannot be parsed.
+ */
+static inline int
+ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
+{
+ struct nfattr *tb[CTA_IP_MAX];
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+
+ if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0)
+ goto nfattr_failure;
+
+ if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
+ return -EINVAL;
+
+ /* tb[] is indexed by attribute type minus one */
+ if (!tb[CTA_IP_V4_SRC-1])
+ return -EINVAL;
+ tuple->src.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]);
+
+ if (!tb[CTA_IP_V4_DST-1])
+ return -EINVAL;
+ tuple->dst.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]);
+
+ DEBUGP("leaving\n");
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+/* Minimum payload size, in bytes, accepted for each CTA_PROTO_* attribute.
+ * Indexed by attribute type minus one; consumed by nfattr_bad_size() in
+ * ctnetlink_parse_tuple_proto(). */
+static const int cta_min_proto[CTA_PROTO_MAX] = {
+ [CTA_PROTO_NUM-1] = sizeof(u_int16_t),
+ [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
+ [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t),
+ [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t),
+ [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t),
+ [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t),
+};
+
+/* Fill the layer-4 part of @tuple from the nested attribute @attr.
+ *
+ * CTA_PROTO_NUM is mandatory and selects the conntrack protocol whose
+ * nfattr_to_tuple() hook, when present, parses the remaining attributes.
+ *
+ * Returns the hook's result (0 when the protocol has no hook), -EINVAL for
+ * an undersized or missing protocol number, and -1 when
+ * nfattr_parse_nested() reports a failure. */
+static inline int
+ctnetlink_parse_tuple_proto(struct nfattr *attr,
+			    struct ip_conntrack_tuple *tuple)
+{
+	struct nfattr *tb[CTA_PROTO_MAX];
+	struct ip_conntrack_protocol *proto;
+	int ret = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0)
+		goto nfattr_failure;
+
+	if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
+		return -EINVAL;
+
+	if (!tb[CTA_PROTO_NUM-1])
+		return -EINVAL;
+	tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
+
+	proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
+	if (!proto)
+		return ret;
+
+	if (likely(proto->nfattr_to_tuple))
+		ret = proto->nfattr_to_tuple(tb, tuple);
+
+	/* Always drop the reference taken by the lookup, including when the
+	 * protocol provides no nfattr_to_tuple() hook. */
+	ip_conntrack_proto_put(proto);
+
+	return ret;
+
+nfattr_failure:
+	return -1;
+}
+
+/* Build @tuple from the nested tuple attribute cda[type-1].
+ *
+ * @tuple is zeroed first. Both the CTA_TUPLE_IP and CTA_TUPLE_PROTO
+ * sub-attributes are required. The direction is IP_CT_DIR_REPLY only for
+ * CTA_TUPLE_REPLY and IP_CT_DIR_ORIGINAL for every other @type.
+ *
+ * Returns 0 on success, -EINVAL for a missing sub-attribute, the error of
+ * the IP/proto sub-parser, or -1 when nfattr_parse_nested() fails.
+ * The caller must have checked that cda[type-1] is present; it is used
+ * here without a NULL test. */
+static inline int
+ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple,
+ enum ctattr_tuple type)
+{
+ struct nfattr *tb[CTA_TUPLE_MAX];
+ int err;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ memset(tuple, 0, sizeof(*tuple));
+
+ if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0)
+ goto nfattr_failure;
+
+ if (!tb[CTA_TUPLE_IP-1])
+ return -EINVAL;
+
+ err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple);
+ if (err < 0)
+ return err;
+
+ if (!tb[CTA_TUPLE_PROTO-1])
+ return -EINVAL;
+
+ err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple);
+ if (err < 0)
+ return err;
+
+ /* orig and expect tuples get DIR_ORIGINAL */
+ if (type == CTA_TUPLE_REPLY)
+ tuple->dst.dir = IP_CT_DIR_REPLY;
+ else
+ tuple->dst.dir = IP_CT_DIR_ORIGINAL;
+
+ DUMP_TUPLE(tuple);
+
+ DEBUGP("leaving\n");
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+/* Minimum payload size, in bytes, accepted for each CTA_PROTONAT_*
+ * attribute. Indexed by attribute type minus one; consumed by
+ * nfattr_bad_size() in ctnetlink_parse_nat_proto(). */
+static const int cta_min_protonat[CTA_PROTONAT_MAX] = {
+ [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t),
+ [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t),
+};
+
+/* Parse the nested per-protocol NAT attribute @attr into @range.
+ *
+ * The NAT protocol is chosen from the protocol number of @ct's original
+ * direction tuple. When its nfattr_to_range() hook returns a positive
+ * value, IP_NAT_RANGE_PROTO_SPECIFIED is set in @range->flags.
+ *
+ * Returns -1 when parsing or size validation fails, 0 otherwise. That
+ * includes the cases where no NAT protocol or no hook is available.
+ * NOTE(review): a negative result from nfattr_to_range() is not
+ * propagated; only "> 0" is tested. */
+static int ctnetlink_parse_nat_proto(struct nfattr *attr,
+ const struct ip_conntrack *ct,
+ struct ip_nat_range *range)
+{
+ struct nfattr *tb[CTA_PROTONAT_MAX];
+ struct ip_nat_protocol *npt;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0)
+ goto nfattr_failure;
+
+ if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat))
+ goto nfattr_failure;
+
+ npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
+ if (!npt)
+ return 0;
+
+ /* nothing to parse with: release the protocol reference and leave */
+ if (!npt->nfattr_to_range) {
+ ip_nat_proto_put(npt);
+ return 0;
+ }
+
+ /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */
+ if (npt->nfattr_to_range(tb, range) > 0)
+ range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
+
+ ip_nat_proto_put(npt);
+
+ DEBUGP("leaving\n");
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+/* Parse the nested CTA_NAT attribute of @cda into @range.
+ *
+ * @range is zeroed first. CTA_NAT_MINIP is optional; when CTA_NAT_MAXIP is
+ * absent max_ip takes the value of min_ip. IP_NAT_RANGE_MAP_IPS is set when
+ * min_ip ends up non-zero. An optional CTA_NAT_PROTO attribute is handed to
+ * ctnetlink_parse_nat_proto().
+ *
+ * Returns 0 on success, -1 when nfattr_parse_nested() fails, or the error
+ * of the per-protocol parser.
+ * NOTE(review): unlike the tuple parsers, no nfattr_bad_size() check is
+ * applied to the MINIP/MAXIP payloads before they are read as u_int32_t. */
+static inline int
+ctnetlink_parse_nat(struct nfattr *cda[],
+ const struct ip_conntrack *ct, struct ip_nat_range *range)
+{
+ struct nfattr *tb[CTA_NAT_MAX];
+ int err;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ memset(range, 0, sizeof(*range));
+
+ if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0)
+ goto nfattr_failure;
+
+ if (tb[CTA_NAT_MINIP-1])
+ range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]);
+
+ if (!tb[CTA_NAT_MAXIP-1])
+ range->max_ip = range->min_ip;
+ else
+ range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]);
+
+ if (range->min_ip)
+ range->flags |= IP_NAT_RANGE_MAP_IPS;
+
+ if (!tb[CTA_NAT_PROTO-1])
+ return 0;
+
+ err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range);
+ if (err < 0)
+ return err;
+
+ DEBUGP("leaving\n");
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+#endif
+
+/* Extract the helper name from the nested helper attribute @attr.
+ *
+ * On success *@helper_name points directly into the attribute payload (no
+ * copy is made) and 0 is returned. Returns -EINVAL when CTA_HELP_NAME is
+ * absent and -1 when nfattr_parse_nested() fails.
+ * NOTE(review): nothing here verifies that the payload is NUL-terminated,
+ * although callers pass it to string functions. Confirm against the
+ * attribute validation done by the caller chain. */
+static inline int
+ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
+{
+ struct nfattr *tb[CTA_HELP_MAX];
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0)
+ goto nfattr_failure;
+
+ if (!tb[CTA_HELP_NAME-1])
+ return -EINVAL;
+
+ *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+/* Handle a conntrack delete request.
+ *
+ * Without CTA_TUPLE_ORIG and CTA_TUPLE_REPLY the whole table is flushed.
+ * Otherwise the entry matching the given tuple is looked up; an optional
+ * CTA_ID must match the entry's id. If the entry's timer can be stopped,
+ * its timeout handler is invoked directly to get rid of the entry.
+ *
+ * Returns 0 on success, -ENOENT when no entry (or no entry with that id)
+ * exists, or the tuple parser's error. */
+static int
+ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
+			struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack_tuple tuple;
+	struct ip_conntrack *ct;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (cda[CTA_TUPLE_ORIG-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
+	else if (cda[CTA_TUPLE_REPLY-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
+	else {
+		/* Flush the whole table */
+		ip_conntrack_flush();
+		return 0;
+	}
+
+	if (err < 0)
+		return err;
+
+	h = ip_conntrack_find_get(&tuple, NULL);
+	if (!h) {
+		DEBUGP("tuple not found in conntrack hash\n");
+		return -ENOENT;
+	}
+
+	ct = tuplehash_to_ctrack(h);
+
+	if (cda[CTA_ID-1]) {
+		u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1]));
+		if (ct->id != id) {
+			ip_conntrack_put(ct);
+			return -ENOENT;
+		}
+	}
+
+	/* Run the timeout handler while we still hold the reference taken by
+	 * ip_conntrack_find_get(), and only drop that reference afterwards,
+	 * so ct is never touched after our own put. */
+	if (del_timer(&ct->timeout))
+		ct->timeout.function((unsigned long)ct);
+
+	ip_conntrack_put(ct);
+	DEBUGP("leaving\n");
+
+	return 0;
+}
+
+/* Handle a conntrack get request (also used for IPCTNL_MSG_CT_GET_CTRZERO).
+ *
+ * With NLM_F_DUMP a netlink dump is started (the counter-zeroing variant
+ * needs CONFIG_IP_NF_CT_ACCT) and the request is pulled off @skb.
+ * Otherwise the single entry matching CTA_TUPLE_ORIG or CTA_TUPLE_REPLY is
+ * looked up and sent back to the requester as an IPCTNL_MSG_CT_NEW message.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -EAFNOSUPPORT  dump requested for a family other than AF_INET
+ *   -EINVAL        dump could not be started / no tuple attribute given
+ *   -ENOENT        no matching conntrack
+ *   -ENOMEM        reply skb could not be allocated or filled
+ *   other          error reported by netlink_unicast() */
+static int
+ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
+			struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack_tuple tuple;
+	struct ip_conntrack *ct;
+	struct sk_buff *skb2;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct nfgenmsg *msg = NLMSG_DATA(nlh);
+		u32 rlen;
+
+		if (msg->nfgen_family != AF_INET)
+			return -EAFNOSUPPORT;
+
+		if (NFNL_MSG_TYPE(nlh->nlmsg_type) ==
+					IPCTNL_MSG_CT_GET_CTRZERO) {
+#ifdef CONFIG_IP_NF_CT_ACCT
+			if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+						ctnetlink_dump_table_w,
+						ctnetlink_done)) != 0)
+				return -EINVAL;
+#else
+			return -ENOTSUPP;
+#endif
+		} else {
+			if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+							ctnetlink_dump_table,
+							ctnetlink_done)) != 0)
+				return -EINVAL;
+		}
+
+		/* consume this request, never more than the skb holds */
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		skb_pull(skb, rlen);
+		return 0;
+	}
+
+	if (cda[CTA_TUPLE_ORIG-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
+	else if (cda[CTA_TUPLE_REPLY-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
+	else
+		return -EINVAL;
+
+	if (err < 0)
+		return err;
+
+	h = ip_conntrack_find_get(&tuple, NULL);
+	if (!h) {
+		DEBUGP("tuple not found in conntrack hash");
+		return -ENOENT;
+	}
+	DEBUGP("tuple found\n");
+	ct = tuplehash_to_ctrack(h);
+
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+	if (!skb2) {
+		ip_conntrack_put(ct);
+		return -ENOMEM;
+	}
+	NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
+
+	err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq,
+				  IPCTNL_MSG_CT_NEW, 1, ct);
+	ip_conntrack_put(ct);
+	if (err <= 0) {
+		/* the message could not be built; skb2 is still ours */
+		kfree_skb(skb2);
+		return -ENOMEM;
+	}
+
+	/* netlink_unicast() takes ownership of skb2 and releases it on
+	 * failure as well, so it must not be freed again here. */
+	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	if (err < 0)
+		return err;
+
+	DEBUGP("leaving\n");
+	return 0;
+}
+
+/* Apply the status bits carried in CTA_STATUS to @ct and, if CTA_NAT is
+ * present, set up NAT for it.
+ *
+ * The caller must guarantee that cda[CTA_STATUS-1] is present: it is
+ * dereferenced unconditionally in the initializer below.
+ *
+ * Returns 0 on success; -EINVAL when the request tries to change
+ * EXPECTED/CONFIRMED/DYING, tries to clear SEEN_REPLY or ASSURED, passes
+ * CTA_NAT without NAT support or without a NAT "done" flag, or when the NAT
+ * range cannot be parsed; -EEXIST when NAT is already initialized for the
+ * selected hook. */
+static inline int
+ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+ unsigned long d;
+ unsigned status = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]));
+ /* d holds the bits that differ between current and requested status */
+ d = ct->status ^ status;
+
+ if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
+ /* unchangeable */
+ return -EINVAL;
+
+ if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+ /* SEEN_REPLY bit can only be set */
+ return -EINVAL;
+
+
+ if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+ /* ASSURED bit can only be set */
+ return -EINVAL;
+
+ if (cda[CTA_NAT-1]) {
+#ifndef CONFIG_IP_NF_NAT_NEEDED
+ return -EINVAL;
+#else
+ unsigned int hooknum;
+ struct ip_nat_range range;
+
+ if (ctnetlink_parse_nat(cda, ct, &range) < 0)
+ return -EINVAL;
+
+ DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n",
+ NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
+ htons(range.min.all), htons(range.max.all));
+
+ /* This is tricky but it works. ip_nat_setup_info needs the
+ * hook number as parameter, so let's do the correct
+ * conversion and run away */
+ if (status & IPS_SRC_NAT_DONE)
+ hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
+ else if (status & IPS_DST_NAT_DONE)
+ hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */
+ else
+ return -EINVAL; /* Missing NAT flags */
+
+ DEBUGP("NAT status: %lu\n",
+ status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+
+ if (ip_nat_initialized(ct, hooknum))
+ return -EEXIST;
+ ip_nat_setup_info(ct, &range, hooknum);
+
+ DEBUGP("NAT status after setup_info: %lu\n",
+ ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+#endif
+ }
+
+ /* Be careful here, modifying NAT bits can screw up things,
+ * so don't let users modify them directly if they don't pass
+ * ip_nat_range. */
+ ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK);
+ return 0;
+}
+
+
+/* Replace the helper of @ct with the one named in CTA_HELP.
+ *
+ * An empty name detaches the current helper and removes its expectations;
+ * switching to another helper clears the old helper's private data.
+ * Conntracks that have a master are left alone.
+ *
+ * Returns 0 on success, -EINVAL for a conntrack with a master or an unknown
+ * non-empty helper name, or the error of ctnetlink_parse_help(). */
+static inline int
+ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+	struct ip_conntrack_helper *new_helper;
+	char *name;
+	int rc;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	/* don't change helper of sibling connections */
+	if (ct->master)
+		return -EINVAL;
+
+	rc = ctnetlink_parse_help(cda[CTA_HELP-1], &name);
+	if (rc < 0)
+		return rc;
+
+	/* a failed lookup is only acceptable for the empty name */
+	new_helper = __ip_conntrack_helper_find_byname(name);
+	if (new_helper == NULL && strcmp(name, "") != 0)
+		return -EINVAL;
+
+	if (ct->helper != NULL) {
+		if (new_helper != NULL) {
+			/* need to zero data of old helper */
+			memset(&ct->help, 0, sizeof(ct->help));
+		} else {
+			/* we had a helper before ... */
+			ip_ct_remove_expectations(ct);
+		}
+	}
+
+	ct->helper = new_helper;
+
+	return 0;
+}
+
+/* Re-arm the timer of @ct to expire CTA_TIMEOUT seconds from now.
+ *
+ * The caller must guarantee that cda[CTA_TIMEOUT-1] is present.
+ * Returns 0 on success, -ETIME when the timer could not be stopped. */
+static inline int
+ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+	u_int32_t secs = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
+
+	if (del_timer(&ct->timeout)) {
+		ct->timeout.expires = jiffies + secs * HZ;
+		add_timer(&ct->timeout);
+		return 0;
+	}
+
+	return -ETIME;
+}
+
+/* Hand the nested CTA_PROTOINFO attribute to the from_nfattr() hook of the
+ * protocol used by @ct's original direction tuple.
+ *
+ * Returns the hook's result (0 when the protocol has no such hook),
+ * -EINVAL when no protocol is found, and -ENOMEM when
+ * nfattr_parse_nested() fails.
+ * NOTE(review): -ENOMEM for a parse failure looks unusual next to the
+ * other parsers in this file; confirm whether -EINVAL was intended. */
+static inline int
+ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+ struct nfattr *tb[CTA_PROTOINFO_MAX], *attr = cda[CTA_PROTOINFO-1];
+ struct ip_conntrack_protocol *proto;
+ u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+ int err = 0;
+
+ if (nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr) < 0)
+ goto nfattr_failure;
+
+ proto = ip_conntrack_proto_find_get(npt);
+ if (!proto)
+ return -EINVAL;
+
+ if (proto->from_nfattr)
+ err = proto->from_nfattr(tb, ct);
+ /* reference is dropped whether or not the hook exists */
+ ip_conntrack_proto_put(proto);
+
+ return err;
+
+nfattr_failure:
+ return -ENOMEM;
+}
+
+/* Update an existing conntrack from the attributes in @cda.
+ *
+ * Helper, timeout, status and protocol info are applied in that order, each
+ * only when its attribute is present. Processing stops at the first step
+ * that fails, and that step's error is returned; earlier steps are not
+ * rolled back. Returns 0 when every requested step succeeded. */
+static int
+ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+	int err;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	err = cda[CTA_HELP-1] ? ctnetlink_change_helper(ct, cda) : 0;
+	if (err < 0)
+		return err;
+
+	err = cda[CTA_TIMEOUT-1] ? ctnetlink_change_timeout(ct, cda) : 0;
+	if (err < 0)
+		return err;
+
+	err = cda[CTA_STATUS-1] ? ctnetlink_change_status(ct, cda) : 0;
+	if (err < 0)
+		return err;
+
+	err = cda[CTA_PROTOINFO-1] ? ctnetlink_change_protoinfo(ct, cda) : 0;
+	if (err < 0)
+		return err;
+
+	DEBUGP("all done\n");
+	return 0;
+}
+
+/* Allocate a conntrack for @otuple/@rtuple, configure it from @cda and
+ * insert it into the hash.
+ *
+ * CTA_TIMEOUT is mandatory and gives the lifetime in seconds. CTA_STATUS
+ * and CTA_PROTOINFO are optional. The new entry is marked IPS_CONFIRMED.
+ *
+ * Returns 0 on success, -ENOMEM when allocation fails, -EINVAL when
+ * CTA_TIMEOUT is missing, or the error of the status/protoinfo update.
+ * The half-built conntrack is freed on every error path. */
+static int
+ctnetlink_create_conntrack(struct nfattr *cda[],
+			   struct ip_conntrack_tuple *otuple,
+			   struct ip_conntrack_tuple *rtuple)
+{
+	struct ip_conntrack *ct;
+	int err = -EINVAL;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	ct = ip_conntrack_alloc(otuple, rtuple);
+	if (ct == NULL || IS_ERR(ct))
+		return -ENOMEM;
+
+	if (!cda[CTA_TIMEOUT-1])
+		goto err;
+	ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
+
+	ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
+	ct->status |= IPS_CONFIRMED;
+
+	/* ctnetlink_change_status() dereferences CTA_STATUS unconditionally,
+	 * so only call it when the attribute was actually supplied. */
+	if (cda[CTA_STATUS-1]) {
+		err = ctnetlink_change_status(ct, cda);
+		if (err < 0)
+			goto err;
+	}
+
+	if (cda[CTA_PROTOINFO-1]) {
+		err = ctnetlink_change_protoinfo(ct, cda);
+		if (err < 0)
+			goto err;	/* do not leak ct */
+	}
+
+	ct->helper = ip_conntrack_helper_find_get(rtuple);
+
+	add_timer(&ct->timeout);
+	ip_conntrack_hash_insert(ct);
+
+	if (ct->helper)
+		ip_conntrack_helper_put(ct->helper);
+
+	DEBUGP("conntrack with id %u inserted\n", ct->id);
+	return 0;
+
+err:
+	ip_conntrack_free(ct);
+	return err;
+}
+
+/* Handle a conntrack new request: create a conntrack or update an existing
+ * one.
+ *
+ * The entry is looked up by CTA_TUPLE_ORIG if given, else by
+ * CTA_TUPLE_REPLY. If none exists it is created when NLM_F_CREATE is set;
+ * creation needs both tuples. If one exists it is updated unless
+ * NLM_F_EXCL is set; CTA_NAT is only accepted for new entries.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -ENOENT  no such entry and NLM_F_CREATE not set
+ *   -EINVAL  creation requested without both tuples, or CTA_NAT passed for
+ *            an existing entry
+ *   -EEXIST  entry exists and NLM_F_EXCL set
+ *   other    error from parsing, creation or update */
+static int
+ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
+			struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_tuple otuple, rtuple;
+	struct ip_conntrack_tuple_hash *h = NULL;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (cda[CTA_TUPLE_ORIG-1]) {
+		err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG);
+		if (err < 0)
+			return err;
+	}
+
+	if (cda[CTA_TUPLE_REPLY-1]) {
+		err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY);
+		if (err < 0)
+			return err;
+	}
+
+	write_lock_bh(&ip_conntrack_lock);
+	if (cda[CTA_TUPLE_ORIG-1])
+		h = __ip_conntrack_find(&otuple, NULL);
+	else if (cda[CTA_TUPLE_REPLY-1])
+		h = __ip_conntrack_find(&rtuple, NULL);
+
+	if (h == NULL) {
+		write_unlock_bh(&ip_conntrack_lock);
+		DEBUGP("no such conntrack, create new\n");
+		if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+			return -ENOENT;
+		/* A tuple whose attribute is absent was never parsed and is
+		 * uninitialized stack memory; refuse to build a conntrack
+		 * from it. */
+		if (!cda[CTA_TUPLE_ORIG-1] || !cda[CTA_TUPLE_REPLY-1])
+			return -EINVAL;
+		return ctnetlink_create_conntrack(cda, &otuple, &rtuple);
+	}
+	/* implicit 'else' */
+
+	/* we only allow nat config for new conntracks */
+	if (cda[CTA_NAT-1]) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* We manipulate the conntrack inside the global conntrack table lock,
+	 * so there's no need to increase the refcount */
+	DEBUGP("conntrack found\n");
+	err = -EEXIST;
+	if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+		err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda);
+
+out_unlock:
+	write_unlock_bh(&ip_conntrack_lock);
+	return err;
+}
+
+/***********************************************************************
+ * EXPECT
+ ***********************************************************************/
+
+/* Append @tuple to @skb wrapped in a nested attribute of the given @type.
+ *
+ * Returns 0 on success and -1 on failure.
+ * NOTE(review): the nfattr_failure label is presumably also a jump target
+ * of the NFA_NEST macro when the skb runs out of room. Confirm against
+ * the macro definition. */
+static inline int
+ctnetlink_exp_dump_tuple(struct sk_buff *skb,
+ const struct ip_conntrack_tuple *tuple,
+ enum ctattr_expect type)
+{
+ struct nfattr *nest_parms = NFA_NEST(skb, type);
+
+ if (ctnetlink_dump_tuples(skb, tuple) < 0)
+ goto nfattr_failure;
+
+ NFA_NEST_END(skb, nest_parms);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+/* Append the attributes describing expectation @exp to @skb: its tuple,
+ * its mask, the original-direction tuple of its master conntrack, the
+ * remaining timeout in seconds and its id (both in network byte order).
+ *
+ * Returns 0 on success and -1 on failure.
+ * NOTE(review): NFA_PUT presumably jumps to nfattr_failure when the
+ * attribute does not fit. Confirm against the macro definition. */
+static inline int
+ctnetlink_exp_dump_expect(struct sk_buff *skb,
+ const struct ip_conntrack_expect *exp)
+{
+ struct ip_conntrack *master = exp->master;
+ u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ);
+ u_int32_t id = htonl(exp->id);
+
+ if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
+ goto nfattr_failure;
+ if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0)
+ goto nfattr_failure;
+ if (ctnetlink_exp_dump_tuple(skb,
+ &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ CTA_EXPECT_MASTER) < 0)
+ goto nfattr_failure;
+
+ NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout);
+ NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+/* Build one complete expectation message for @exp at the tail of @skb.
+ *
+ * @event is tagged with the expectation subsystem id. NLM_F_MULTI is set
+ * when both @nowait and @pid are non-zero.
+ *
+ * Returns skb->len on success. On failure @skb is trimmed back to where
+ * this message started and -1 is returned.
+ * NOTE(review): the nlmsg_failure label is presumably the jump target of
+ * the NLMSG_PUT macro. Confirm against the macro definition. */
+static int
+ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+ int event,
+ int nowait,
+ const struct ip_conntrack_expect *exp)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned char *b;
+
+ /* remember the start of this message for length/rollback */
+ b = skb->tail;
+
+ event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
+ nfmsg = NLMSG_DATA(nlh);
+
+ nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
+ nfmsg->nfgen_family = AF_INET;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (ctnetlink_exp_dump_expect(skb, exp) < 0)
+ goto nfattr_failure;
+
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+nfattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+/* Notifier callback for expectation events.
+ *
+ * Only IPEXP_NEW is handled: a message of type IPCTNL_MSG_EXP_NEW
+ * describing the expectation in @ptr is built and sent to the
+ * NFNLGRP_CONNTRACK_EXP_NEW group. All failures are silent; the function
+ * always returns NOTIFY_DONE. */
+static int ctnetlink_expect_event(struct notifier_block *this,
+				  unsigned long events, void *ptr)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr;
+	struct sk_buff *skb;
+	unsigned int type;
+	unsigned char *b;
+	int flags = 0;
+
+	if (events & IPEXP_NEW) {
+		type = IPCTNL_MSG_EXP_NEW;
+		flags = NLM_F_CREATE|NLM_F_EXCL;
+	} else
+		return NOTIFY_DONE;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+	if (!skb)
+		return NOTIFY_DONE;
+
+	b = skb->tail;
+
+	/* Expectation messages belong to the expectation subsystem, the
+	 * same one ctnetlink_exp_fill_info() tags its messages with. */
+	type |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
+	nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
+	nfmsg = NLMSG_DATA(nlh);
+
+	nlh->nlmsg_flags = flags;
+	nfmsg->nfgen_family = AF_INET;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = 0;
+
+	if (ctnetlink_exp_dump_expect(skb, exp) < 0)
+		goto nfattr_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0);
+	return NOTIFY_DONE;
+
+nlmsg_failure:
+nfattr_failure:
+	kfree_skb(skb);
+	return NOTIFY_DONE;
+}
+#endif
+
+/* Netlink dump callback for the expectation list.
+ *
+ * cb->args[0] stores the id of the last expectation already dumped, so a
+ * resumed dump skips every entry whose id is not greater than it. The list
+ * is walked backwards under the conntrack read lock until an entry no
+ * longer fits into @skb.
+ *
+ * Returns skb->len.
+ * NOTE(review): the DEBUGP format strings use %llu for a u_int32_t value;
+ * only relevant when debugging output is compiled in. */
+static int
+ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ip_conntrack_expect *exp = NULL;
+ struct list_head *i;
+ u_int32_t *id = (u_int32_t *) &cb->args[0];
+
+ DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id);
+
+ read_lock_bh(&ip_conntrack_lock);
+ list_for_each_prev(i, &ip_conntrack_expect_list) {
+ /* the cast relies on the list head being the first member of
+ * struct ip_conntrack_expect (TODO confirm) */
+ exp = (struct ip_conntrack_expect *) i;
+ if (exp->id <= *id)
+ continue;
+ if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_EXP_NEW,
+ 1, exp) < 0)
+ goto out;
+ *id = exp->id;
+ }
+out:
+ read_unlock_bh(&ip_conntrack_lock);
+
+ DEBUGP("leaving, last id=%llu\n", *id);
+
+ return skb->len;
+}
+
+/* Handle an expectation get request.
+ *
+ * With NLM_F_DUMP a netlink dump of the expectation list is started and
+ * the request is pulled off @skb. Otherwise the expectation matching the
+ * tuple in CTA_EXPECT_MASTER is looked up and sent back to the requester
+ * as an IPCTNL_MSG_EXP_NEW message.
+ *
+ * Returns 0 for a started dump, the result of netlink_unicast() for a
+ * single reply, or an error: -EAFNOSUPPORT, -EINVAL, -ENOENT, -ENOMEM, the
+ * tuple parser's error, or the non-positive result of a failed
+ * ctnetlink_exp_fill_info(). */
+static int
+ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
+		     struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_tuple tuple;
+	struct ip_conntrack_expect *exp;
+	struct sk_buff *skb2;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct nfgenmsg *msg = NLMSG_DATA(nlh);
+		u32 rlen;
+
+		if (msg->nfgen_family != AF_INET)
+			return -EAFNOSUPPORT;
+
+		if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+						ctnetlink_exp_dump_table,
+						ctnetlink_done)) != 0)
+			return -EINVAL;
+		/* consume this request, never more than the skb holds */
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		skb_pull(skb, rlen);
+		return 0;
+	}
+
+	if (cda[CTA_EXPECT_MASTER-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER);
+	else
+		return -EINVAL;
+
+	if (err < 0)
+		return err;
+
+	exp = ip_conntrack_expect_find(&tuple);
+	if (!exp)
+		return -ENOENT;
+
+	err = -ENOMEM;
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb2)
+		goto out;
+	NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
+
+	err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid,
+				      nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
+				      1, exp);
+	if (err <= 0)
+		goto free;
+
+	ip_conntrack_expect_put(exp);
+
+	/* netlink_unicast() takes ownership of skb2 and releases it on
+	 * failure as well, so its result is returned without freeing skb2
+	 * a second time. */
+	return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+
+free:
+	kfree_skb(skb2);
+out:
+	ip_conntrack_expect_put(exp);
+	return err;
+}
+
+/* Handle an expectation delete request. Three modes, selected by the
+ * attributes present:
+ *   CTA_EXPECT_TUPLE      delete the single expectation matching the tuple
+ *                         (and CTA_EXPECT_ID, if given)
+ *   CTA_EXPECT_HELP_NAME  delete every expectation whose master uses the
+ *                         named helper
+ *   neither               flush all expectations
+ *
+ * Returns 0 on success, -ENOENT when the single expectation is not found
+ * or its id differs, -EINVAL for an unknown helper name, or the tuple
+ * parser's error. */
+static int
+ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
+		     struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_expect *exp, *tmp;
+	struct ip_conntrack_tuple tuple;
+	struct ip_conntrack_helper *h;
+	int err;
+
+	if (cda[CTA_EXPECT_TUPLE-1]) {
+		/* delete a single expect by tuple */
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
+		if (err < 0)
+			return err;
+
+		/* bump usage count to 2 */
+		exp = ip_conntrack_expect_find(&tuple);
+		if (!exp)
+			return -ENOENT;
+
+		if (cda[CTA_EXPECT_ID-1]) {
+			u_int32_t id =
+				*(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
+			if (exp->id != ntohl(id)) {
+				ip_conntrack_expect_put(exp);
+				return -ENOENT;
+			}
+		}
+
+		/* after list removal, usage count == 1 */
+		ip_conntrack_unexpect_related(exp);
+		/* have to put what we 'get' above.
+		 * after this line usage count == 0 */
+		ip_conntrack_expect_put(exp);
+	} else if (cda[CTA_EXPECT_HELP_NAME-1]) {
+		char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]);
+
+		/* delete all expectations for this helper */
+		write_lock_bh(&ip_conntrack_lock);
+		h = __ip_conntrack_helper_find_byname(name);
+		if (!h) {
+			write_unlock_bh(&ip_conntrack_lock);
+			return -EINVAL;
+		}
+		list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
+					 list) {
+			if (exp->master->helper == h
+			    && del_timer(&exp->timeout)) {
+				ip_ct_unlink_expect(exp);
+				ip_conntrack_expect_put(exp);
+			}
+		}
+		/* must mirror write_lock_bh() above, otherwise bottom
+		 * halves stay disabled after we return */
+		write_unlock_bh(&ip_conntrack_lock);
+	} else {
+		/* This basically means we have to flush everything*/
+		write_lock_bh(&ip_conntrack_lock);
+		list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
+					 list) {
+			if (del_timer(&exp->timeout)) {
+				ip_ct_unlink_expect(exp);
+				ip_conntrack_expect_put(exp);
+			}
+		}
+		write_unlock_bh(&ip_conntrack_lock);
+	}
+
+	return 0;
+}
+/* Placeholder: modifying an existing expectation is not implemented.
+ * Always returns -EOPNOTSUPP; both arguments are ignored. */
+static int
+ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[])
+{
+ return -EOPNOTSUPP;
+}
+
+/* Create a new expectation from CTA_EXPECT_TUPLE, CTA_EXPECT_MASK and
+ * CTA_EXPECT_MASTER and register it with its master conntrack.
+ *
+ * The master conntrack is looked up by the master tuple and must have a
+ * helper attached.
+ *
+ * Returns the result of ip_conntrack_expect_related() on success paths,
+ * -ENOENT when the master is not found, -EINVAL when it has no helper,
+ * -ENOMEM when the expectation cannot be allocated, or a tuple parser
+ * error. */
+static int
+ctnetlink_create_expect(struct nfattr *cda[])
+{
+ struct ip_conntrack_tuple tuple, mask, master_tuple;
+ struct ip_conntrack_tuple_hash *h = NULL;
+ struct ip_conntrack_expect *exp;
+ struct ip_conntrack *ct;
+ int err = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ /* caller guarantees that those three CTA_EXPECT_* exist */
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
+ if (err < 0)
+ return err;
+ err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK);
+ if (err < 0)
+ return err;
+ err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER);
+ if (err < 0)
+ return err;
+
+ /* Look for master conntrack of this expectation */
+ h = ip_conntrack_find_get(&master_tuple, NULL);
+ if (!h)
+ return -ENOENT;
+ ct = tuplehash_to_ctrack(h);
+
+ if (!ct->helper) {
+ /* such conntrack hasn't got any helper, abort */
+ err = -EINVAL;
+ goto out;
+ }
+
+ exp = ip_conntrack_expect_alloc(ct);
+ if (!exp) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ exp->expectfn = NULL;
+ exp->flags = 0;
+ exp->master = ct;
+ memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple));
+ memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple));
+
+ /* drop our allocation reference whatever the registration result */
+ err = ip_conntrack_expect_related(exp);
+ ip_conntrack_expect_put(exp);
+
+out:
+ /* release the master reference taken by ip_conntrack_find_get() */
+ ip_conntrack_put(tuplehash_to_ctrack(h));
+ return err;
+}
+
+/* Handle an expectation new request: create an expectation or change an
+ * existing one.
+ *
+ * CTA_EXPECT_TUPLE, CTA_EXPECT_MASK and CTA_EXPECT_MASTER are all
+ * mandatory. If no expectation matches the tuple, one is created when
+ * NLM_F_CREATE is set. If one exists, it is changed unless NLM_F_EXCL is
+ * set.
+ *
+ * Returns 0 on success or a negative errno: -EINVAL for a missing
+ * attribute, -ENOENT when nothing matches and NLM_F_CREATE is unset,
+ * -EEXIST when NLM_F_EXCL is set on an existing entry, or whatever the
+ * create/change helper returns. */
+static int
+ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+ struct ip_conntrack_tuple tuple;
+ struct ip_conntrack_expect *exp;
+ int err = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (!cda[CTA_EXPECT_TUPLE-1]
+ || !cda[CTA_EXPECT_MASK-1]
+ || !cda[CTA_EXPECT_MASTER-1])
+ return -EINVAL;
+
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
+ if (err < 0)
+ return err;
+
+ write_lock_bh(&ip_conntrack_lock);
+ exp = __ip_conntrack_expect_find(&tuple);
+
+ if (!exp) {
+ /* creation happens outside the lock */
+ write_unlock_bh(&ip_conntrack_lock);
+ err = -ENOENT;
+ if (nlh->nlmsg_flags & NLM_F_CREATE)
+ err = ctnetlink_create_expect(cda);
+ return err;
+ }
+
+ err = -EEXIST;
+ if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+ err = ctnetlink_change_expect(exp, cda);
+ write_unlock_bh(&ip_conntrack_lock);
+
+ DEBUGP("leaving\n");
+
+ return err;
+}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+/* Notifier block that routes conntrack events to
+ * ctnetlink_conntrack_event(); registered in ctnetlink_init(). */
+static struct notifier_block ctnl_notifier = {
+ .notifier_call = ctnetlink_conntrack_event,
+};
+
+/* Notifier block that routes expectation events to
+ * ctnetlink_expect_event(); registered in ctnetlink_init(). */
+static struct notifier_block ctnl_notifier_exp = {
+ .notifier_call = ctnetlink_expect_event,
+};
+#endif
+
+/* Message handlers of the conntrack subsystem, indexed by message type.
+ * Every message requires CAP_NET_ADMIN. Plain GET and GET_CTRZERO share
+ * ctnetlink_get_conntrack(), which tells them apart by message type. */
+static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
+ [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack,
+ .attr_count = CTA_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack,
+ .attr_count = CTA_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack,
+ .attr_count = CTA_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
+ .attr_count = CTA_MAX,
+ .cap_required = CAP_NET_ADMIN },
+};
+
+/* Message handlers of the expectation subsystem, indexed by message type.
+ * Every message requires CAP_NET_ADMIN. */
+static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
+ [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect,
+ .attr_count = CTA_EXPECT_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect,
+ .attr_count = CTA_EXPECT_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
+ .attr_count = CTA_EXPECT_MAX,
+ .cap_required = CAP_NET_ADMIN },
+};
+
+/* nfnetlink registration record for the conntrack message family; ties
+ * NFNL_SUBSYS_CTNETLINK to the ctnl_cb handler table. */
+static struct nfnetlink_subsystem ctnl_subsys = {
+ .name = "conntrack",
+ .subsys_id = NFNL_SUBSYS_CTNETLINK,
+ .cb_count = IPCTNL_MSG_MAX,
+ .cb = ctnl_cb,
+};
+
+/* nfnetlink registration record for the expectation message family; ties
+ * NFNL_SUBSYS_CTNETLINK_EXP to the ctnl_exp_cb handler table. */
+static struct nfnetlink_subsystem ctnl_exp_subsys = {
+ .name = "conntrack_expect",
+ .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP,
+ .cb_count = IPCTNL_MSG_EXP_MAX,
+ .cb = ctnl_exp_cb,
+};
+
+/* Module init: register both nfnetlink subsystems and, when conntrack
+ * events are configured, both event notifiers.
+ *
+ * Returns 0 on success or the error of the registration that failed.
+ * Everything registered before the failure is unregistered again. */
+static int __init ctnetlink_init(void)
+{
+ int ret;
+
+ printk("ctnetlink v%s: registering with nfnetlink.\n", version);
+ ret = nfnetlink_subsys_register(&ctnl_subsys);
+ if (ret < 0) {
+ printk("ctnetlink_init: cannot register with nfnetlink.\n");
+ goto err_out;
+ }
+
+ ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
+ if (ret < 0) {
+ printk("ctnetlink_init: cannot register exp with nfnetlink.\n");
+ goto err_unreg_subsys;
+ }
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+ ret = ip_conntrack_register_notifier(&ctnl_notifier);
+ if (ret < 0) {
+ printk("ctnetlink_init: cannot register notifier.\n");
+ goto err_unreg_exp_subsys;
+ }
+
+ ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp);
+ if (ret < 0) {
+ printk("ctnetlink_init: cannot expect register notifier.\n");
+ goto err_unreg_notifier;
+ }
+#endif
+
+ return 0;
+
+ /* error unwinding: labels fall through, undoing the registrations in
+ * reverse order */
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+err_unreg_notifier:
+ ip_conntrack_unregister_notifier(&ctnl_notifier);
+err_unreg_exp_subsys:
+ nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+#endif
+err_unreg_subsys:
+ nfnetlink_subsys_unregister(&ctnl_subsys);
+err_out:
+ return ret;
+}
+
+/* Module exit: undo ctnetlink_init() in reverse order. Notifiers are
+ * removed first (when conntrack events are configured), then both
+ * nfnetlink subsystems. */
+static void __exit ctnetlink_exit(void)
+{
+	printk("ctnetlink: unregistering from nfnetlink.\n");
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+	/* ctnl_notifier_exp was registered with
+	 * ip_conntrack_expect_register_notifier(), so it has to be removed
+	 * through the matching expectation API. */
+	ip_conntrack_expect_unregister_notifier(&ctnl_notifier_exp);
+	ip_conntrack_unregister_notifier(&ctnl_notifier);
+#endif
+
+	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+	nfnetlink_subsys_unregister(&ctnl_subsys);
+	return;
+}
+
+module_init(ctnetlink_init);
+module_exit(ctnetlink_exit);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
new file mode 100644
index 00000000000..744abb9d377
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -0,0 +1,328 @@
+/*
+ * ip_conntrack_proto_gre.c - Version 3.0
+ *
+ * Connection tracking protocol helper module for GRE.
+ *
+ * GRE is a generic encapsulation protocol, which is generally not very
+ * suited for NAT, as it has no protocol-specific part as port numbers.
+ *
+ * It has an optional key field, which may help us distinguishing two
+ * connections between the same two hosts.
+ *
+ * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
+ *
+ * PPTP is built on top of a modified version of GRE, and has a mandatory
+ * field called "CallID", which serves us for the same purpose as the key
+ * field in plain GRE.
+ *
+ * Documentation about PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/list.h>
+
+static DEFINE_RWLOCK(ip_ct_gre_lock);
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
+
+#include <linux/netfilter_ipv4/listhelp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+
+#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
+#include <linux/netfilter_ipv4/ip_conntrack_pptp.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("netfilter connection tracking protocol helper for GRE");
+
+/* shamelessly stolen from ip_conntrack_proto_udp.c */
+#define GRE_TIMEOUT (30*HZ)
+#define GRE_STREAM_TIMEOUT (180*HZ)
+
+#if 0
+#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args)
+#define DUMP_TUPLE_GRE(x) printk("%u.%u.%u.%u:0x%x -> %u.%u.%u.%u:0x%x\n", \
+ NIPQUAD((x)->src.ip), ntohs((x)->src.u.gre.key), \
+ NIPQUAD((x)->dst.ip), ntohs((x)->dst.u.gre.key))
+#else
+#define DEBUGP(x, args...)
+#define DUMP_TUPLE_GRE(x)
+#endif
+
+/* GRE KEYMAP HANDLING FUNCTIONS */
+static LIST_HEAD(gre_keymap_list);
+
+/* Match predicate for keymap lookups: non-zero when @km's tuple has the
+ * same source address, destination address, protocol number and
+ * destination key as @t. The source key is deliberately not compared. */
+static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km,
+				const struct ip_conntrack_tuple *t)
+{
+	const struct ip_conntrack_tuple *mine = &km->tuple;
+
+	if (mine->src.ip != t->src.ip)
+		return 0;
+	if (mine->dst.ip != t->dst.ip)
+		return 0;
+	if (mine->dst.protonum != t->dst.protonum)
+		return 0;
+
+	return mine->dst.u.all == t->dst.u.all;
+}
+
+/* Look up the source key for a given tuple.
+ *
+ * Searches gre_keymap_list under the read lock for an entry matching @t
+ * (see gre_key_cmpfn()) and returns that entry's source GRE key, or 0 when
+ * no entry matches. */
+static u_int32_t gre_keymap_lookup(struct ip_conntrack_tuple *t)
+{
+ struct ip_ct_gre_keymap *km;
+ u_int32_t key = 0;
+
+ read_lock_bh(&ip_ct_gre_lock);
+ km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn,
+ struct ip_ct_gre_keymap *, t);
+ if (km)
+ key = km->tuple.src.u.gre.key;
+ read_unlock_bh(&ip_ct_gre_lock);
+
+ DEBUGP("lookup src key 0x%x up key for ", key);
+ DUMP_TUPLE_GRE(t);
+
+ return key;
+}
+
+/* Add a single keymap entry and associate it with the master conntrack
+ * @ct.
+ *
+ * @t is copied into the new entry; @reply selects whether the entry is
+ * stored as the reply-direction or the original-direction keymap of @ct.
+ * Only conntracks whose helper is named "pptp" are accepted.
+ *
+ * Returns 0 on success or when the slot already holds the entry matching
+ * @t (a retransmission), -1 for a non-pptp conntrack, -EEXIST when the slot
+ * is occupied by a different entry, and -ENOMEM when allocation fails.
+ * NOTE(review): the retransmission check walks gre_keymap_list without
+ * taking ip_ct_gre_lock. Confirm that callers serialize this. */
+int
+ip_ct_gre_keymap_add(struct ip_conntrack *ct,
+ struct ip_conntrack_tuple *t, int reply)
+{
+ struct ip_ct_gre_keymap **exist_km, *km, *old;
+
+ if (!ct->helper || strcmp(ct->helper->name, "pptp")) {
+ DEBUGP("refusing to add GRE keymap to non-pptp session\n");
+ return -1;
+ }
+
+ if (!reply)
+ exist_km = &ct->help.ct_pptp_info.keymap_orig;
+ else
+ exist_km = &ct->help.ct_pptp_info.keymap_reply;
+
+ if (*exist_km) {
+ /* check whether it's a retransmission */
+ old = LIST_FIND(&gre_keymap_list, gre_key_cmpfn,
+ struct ip_ct_gre_keymap *, t);
+ if (old == *exist_km) {
+ DEBUGP("retransmission\n");
+ return 0;
+ }
+
+ DEBUGP("trying to override keymap_%s for ct %p\n",
+ reply? "reply":"orig", ct);
+ return -EEXIST;
+ }
+
+ km = kmalloc(sizeof(*km), GFP_ATOMIC);
+ if (!km)
+ return -ENOMEM;
+
+ memcpy(&km->tuple, t, sizeof(*t));
+ *exist_km = km;
+
+ DEBUGP("adding new entry %p: ", km);
+ DUMP_TUPLE_GRE(&km->tuple);
+
+ write_lock_bh(&ip_ct_gre_lock);
+ list_append(&gre_keymap_list, km);
+ write_unlock_bh(&ip_ct_gre_lock);
+
+ return 0;
+}
+
+/* Destroy the keymap entries associated with the master conntrack @ct.
+ *
+ * Does nothing unless @ct's helper is named "pptp". Otherwise both the
+ * original and the reply keymap entry, where present, are unlinked from
+ * gre_keymap_list under the write lock, freed, and their pointers in @ct
+ * reset to NULL. */
+void ip_ct_gre_keymap_destroy(struct ip_conntrack *ct)
+{
+ DEBUGP("entering for ct %p\n", ct);
+
+ if (!ct->helper || strcmp(ct->helper->name, "pptp")) {
+ DEBUGP("refusing to destroy GRE keymap to non-pptp session\n");
+ return;
+ }
+
+ write_lock_bh(&ip_ct_gre_lock);
+ if (ct->help.ct_pptp_info.keymap_orig) {
+ DEBUGP("removing %p from list\n",
+ ct->help.ct_pptp_info.keymap_orig);
+ list_del(&ct->help.ct_pptp_info.keymap_orig->list);
+ kfree(ct->help.ct_pptp_info.keymap_orig);
+ ct->help.ct_pptp_info.keymap_orig = NULL;
+ }
+ if (ct->help.ct_pptp_info.keymap_reply) {
+ DEBUGP("removing %p from list\n",
+ ct->help.ct_pptp_info.keymap_reply);
+ list_del(&ct->help.ct_pptp_info.keymap_reply->list);
+ kfree(ct->help.ct_pptp_info.keymap_reply);
+ ct->help.ct_pptp_info.keymap_reply = NULL;
+ }
+ write_unlock_bh(&ip_ct_gre_lock);
+}
+
+
+/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */
+
+/* Invert the GRE part of a tuple: @tuple receives @orig's source key as
+ * its destination key and @orig's destination key as its source key.
+ * Always returns 1. */
+static int gre_invert_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_tuple *orig)
+{
+ tuple->dst.u.gre.key = orig->src.u.gre.key;
+ tuple->src.u.gre.key = orig->dst.u.gre.key;
+
+ return 1;
+}
+
+/* Fill the GRE part of @tuple from the GRE header found at @dataoff in
+ * @skb.
+ *
+ * When the header cannot be read or is not the PPTP GRE version, both keys
+ * are zeroed and 1 is returned. For PPTP GRE the destination key is the
+ * header's call_id and the source key comes from the keymap lookup (0 when
+ * no keymap entry exists).
+ *
+ * Returns 1 when the tuple should be used and 0 when the header claims the
+ * PPTP version but carries a different protocol.
+ * NOTE(review): when the 8-byte PPTP header read fails, 1 is returned
+ * without touching the keys in @tuple. */
+static int gre_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct ip_conntrack_tuple *tuple)
+{
+ struct gre_hdr_pptp _pgrehdr, *pgrehdr;
+ u_int32_t srckey;
+ struct gre_hdr _grehdr, *grehdr;
+
+ /* first only delinearize old RFC1701 GRE header */
+ grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr);
+ if (!grehdr || grehdr->version != GRE_VERSION_PPTP) {
+ /* try to behave like "ip_conntrack_proto_generic" */
+ tuple->src.u.all = 0;
+ tuple->dst.u.all = 0;
+ return 1;
+ }
+
+ /* PPTP header is variable length, only need up to the call_id field */
+ pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr);
+ if (!pgrehdr)
+ return 1;
+
+ if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) {
+ DEBUGP("GRE_VERSION_PPTP but unknown proto\n");
+ return 0;
+ }
+
+ /* the destination key must be set before the lookup: the keymap is
+ * matched on it (see gre_key_cmpfn()) */
+ tuple->dst.u.gre.key = pgrehdr->call_id;
+ srckey = gre_keymap_lookup(tuple);
+ tuple->src.u.gre.key = srckey;
+
+ return 1;
+}
+
+/* Print the GRE part of @tuple to seq file @s: source and destination
+ * key in hex, each passed through ntohs(). Returns the seq_printf()
+ * result. */
+static int gre_print_tuple(struct seq_file *s,
+ const struct ip_conntrack_tuple *tuple)
+{
+ return seq_printf(s, "srckey=0x%x dstkey=0x%x ",
+ ntohs(tuple->src.u.gre.key),
+ ntohs(tuple->dst.u.gre.key));
+}
+
+/* Print the GRE private data of @ct to seq file @s: the plain and the
+ * stream timeout, both divided by HZ. Returns the seq_printf() result. */
+static int gre_print_conntrack(struct seq_file *s,
+ const struct ip_conntrack *ct)
+{
+ return seq_printf(s, "timeout=%u, stream_timeout=%u ",
+ (ct->proto.gre.timeout / HZ),
+ (ct->proto.gre.stream_timeout / HZ));
+}
+
+/* Per-packet handler: returns the verdict for the packet and may modify
+ * the conntrack.
+ *
+ * Once a reply has been seen, the entry is refreshed with the longer
+ * stream timeout, marked ASSURED and a status event is cached. Before
+ * that the short timeout is used. Always returns NF_ACCEPT. */
+static int gre_packet(struct ip_conntrack *ct,
+ const struct sk_buff *skb,
+ enum ip_conntrack_info conntrackinfo)
+{
+ /* If we've seen traffic both ways, this is a GRE connection.
+ * Extend timeout. */
+ if (ct->status & IPS_SEEN_REPLY) {
+ ip_ct_refresh_acct(ct, conntrackinfo, skb,
+ ct->proto.gre.stream_timeout);
+ /* Also, more likely to be important, and not a probe. */
+ set_bit(IPS_ASSURED_BIT, &ct->status);
+ ip_conntrack_event_cache(IPCT_STATUS, skb);
+ } else
+ ip_ct_refresh_acct(ct, conntrackinfo, skb,
+ ct->proto.gre.timeout);
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol is found.
+ *
+ * Initializes the per-conntrack GRE timeouts to GRE_TIMEOUT and
+ * GRE_STREAM_TIMEOUT. @skb is not used. Always returns 1. */
+static int gre_new(struct ip_conntrack *ct,
+ const struct sk_buff *skb)
+{
+ DEBUGP(": ");
+ DUMP_TUPLE_GRE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+
+ /* initialize to sane value. Ideally a conntrack helper
+ * (e.g. in case of pptp) is increasing them */
+ ct->proto.gre.stream_timeout = GRE_STREAM_TIMEOUT;
+ ct->proto.gre.timeout = GRE_TIMEOUT;
+
+ return 1;
+}
+
+/* Destroy hook, called when a conntrack entry has already been removed
+ * from the hashes and is about to be deleted from memory. If the entry
+ * has a master, the master's GRE keymap entries are destroyed. */
+static void gre_destroy(struct ip_conntrack *ct)
+{
+	struct ip_conntrack *parent = ct->master;
+
+	DEBUGP(" entering\n");
+
+	if (parent) {
+		ip_ct_gre_keymap_destroy(parent);
+		return;
+	}
+
+	DEBUGP("no master !?!\n");
+}
+
+/* Protocol helper struct: the conntrack protocol descriptor for
+ * IPPROTO_GRE, wiring up the gre_* callbacks defined in this file. When
+ * ctnetlink is configured (built in or as a module), the generic
+ * port-based netlink tuple converters are used. */
+static struct ip_conntrack_protocol gre = {
+ .proto = IPPROTO_GRE,
+ .name = "gre",
+ .pkt_to_tuple = gre_pkt_to_tuple,
+ .invert_tuple = gre_invert_tuple,
+ .print_tuple = gre_print_tuple,
+ .print_conntrack = gre_print_conntrack,
+ .packet = gre_packet,
+ .new = gre_new,
+ .destroy = gre_destroy,
+ .me = THIS_MODULE,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
+ .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
+#endif
+};
+
+/* ip_conntrack_proto_gre initialization: registers the GRE protocol
+ * descriptor and returns the result of
+ * ip_conntrack_protocol_register(). */
+int __init ip_ct_proto_gre_init(void)
+{
+ return ip_conntrack_protocol_register(&gre);
+}
+
+/* ip_conntrack_proto_gre teardown: frees every remaining keymap entry
+ * under the write lock, then unregisters the GRE protocol descriptor. */
+void __exit ip_ct_proto_gre_fini(void)
+{
+ struct list_head *pos, *n;
+
+ /* delete all keymap entries */
+ write_lock_bh(&ip_ct_gre_lock);
+ list_for_each_safe(pos, n, &gre_keymap_list) {
+ DEBUGP("deleting keymap %p at module unload time\n", pos);
+ list_del(pos);
+ /* freeing the list node frees the whole entry only if the list
+ * head is the first member of struct ip_ct_gre_keymap
+ * (TODO confirm) */
+ kfree(pos);
+ }
+ write_unlock_bh(&ip_ct_gre_lock);
+
+ ip_conntrack_protocol_unregister(&gre);
+}
+
+EXPORT_SYMBOL(ip_ct_gre_keymap_add);
+EXPORT_SYMBOL(ip_ct_gre_keymap_destroy);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 602c74db325..98f0015dd25 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -102,22 +102,24 @@ static int icmp_packet(struct ip_conntrack *ct,
ct->timeout.function((unsigned long)ct);
} else {
atomic_inc(&ct->proto.icmp.count);
+ ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
}
return NF_ACCEPT;
}
+static u_int8_t valid_new[] = {
+ [ICMP_ECHO] = 1,
+ [ICMP_TIMESTAMP] = 1,
+ [ICMP_INFO_REQUEST] = 1,
+ [ICMP_ADDRESS] = 1
+};
+
/* Called when a new connection for this protocol found. */
static int icmp_new(struct ip_conntrack *conntrack,
const struct sk_buff *skb)
{
- static u_int8_t valid_new[]
- = { [ICMP_ECHO] = 1,
- [ICMP_TIMESTAMP] = 1,
- [ICMP_INFO_REQUEST] = 1,
- [ICMP_ADDRESS] = 1 };
-
if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
|| !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
/* Can't create a new ICMP `conn' with this. */
@@ -158,11 +160,12 @@ icmp_error_message(struct sk_buff *skb,
return NF_ACCEPT;
}
- innerproto = ip_ct_find_proto(inside->ip.protocol);
+ innerproto = ip_conntrack_proto_find_get(inside->ip.protocol);
dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4;
/* Are they talking about one of our connections? */
if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) {
DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol);
+ ip_conntrack_proto_put(innerproto);
return NF_ACCEPT;
}
@@ -170,8 +173,10 @@ icmp_error_message(struct sk_buff *skb,
been preserved inside the ICMP. */
if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
DEBUGP("icmp_error_track: Can't invert tuple\n");
+ ip_conntrack_proto_put(innerproto);
return NF_ACCEPT;
}
+ ip_conntrack_proto_put(innerproto);
*ctinfo = IP_CT_RELATED;
@@ -212,7 +217,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
if (icmph == NULL) {
if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_icmp: short packet ");
return -NF_ACCEPT;
}
@@ -226,13 +231,13 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
if (!(u16)csum_fold(skb->csum))
break;
if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_icmp: bad HW ICMP checksum ");
return -NF_ACCEPT;
case CHECKSUM_NONE:
if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_icmp: bad ICMP checksum ");
return -NF_ACCEPT;
}
@@ -249,7 +254,7 @@ checksum_skipped:
*/
if (icmph->type > NR_ICMP_TYPES) {
if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_icmp: invalid ICMP type ");
return -NF_ACCEPT;
}
@@ -265,6 +270,46 @@ checksum_skipped:
return icmp_error_message(skb, ctinfo, hooknum);
}
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+static int icmp_tuple_to_nfattr(struct sk_buff *skb,
+ const struct ip_conntrack_tuple *t)
+{
+ NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t),
+ &t->src.u.icmp.id);
+ NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t),
+ &t->dst.u.icmp.type);
+ NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t),
+ &t->dst.u.icmp.code);
+
+ if (t->dst.u.icmp.type >= sizeof(valid_new)
+ || !valid_new[t->dst.u.icmp.type])
+ return -EINVAL;
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static int icmp_nfattr_to_tuple(struct nfattr *tb[],
+				struct ip_conntrack_tuple *tuple)
+{
+	/* All three attributes are read below; fail if any is missing. */
+	if (!tb[CTA_PROTO_ICMP_TYPE-1] || !tb[CTA_PROTO_ICMP_CODE-1]
+	    || !tb[CTA_PROTO_ICMP_ID-1])
+		return -1;
+	tuple->dst.u.icmp.type =
+		*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]);
+	tuple->dst.u.icmp.code =
+		*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]);
+	/* The id is put as sizeof(u_int16_t) by icmp_tuple_to_nfattr(). */
+	tuple->src.u.icmp.id =
+		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
+	return 0;
+}
+#endif
+
struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
{
.proto = IPPROTO_ICMP,
@@ -276,4 +321,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
.packet = icmp_packet,
.new = icmp_new,
.error = icmp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .tuple_to_nfattr = icmp_tuple_to_nfattr,
+ .nfattr_to_tuple = icmp_nfattr_to_tuple,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index 31d75390bf1..59a4a0111dd 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -404,6 +404,8 @@ static int sctp_packet(struct ip_conntrack *conntrack,
}
conntrack->proto.sctp.state = newconntrack;
+ if (oldsctpstate != newconntrack)
+ ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
write_unlock_bh(&sctp_lock);
}
@@ -414,6 +416,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
&& newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
DEBUGP("Setting assured bit\n");
set_bit(IPS_ASSURED_BIT, &conntrack->status);
+ ip_conntrack_event_cache(IPCT_STATUS, skb);
}
return NF_ACCEPT;
@@ -503,7 +506,12 @@ static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = {
.packet = sctp_packet,
.new = sctp_new,
.destroy = NULL,
- .me = THIS_MODULE
+ .me = THIS_MODULE,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
+ .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
+#endif
};
#ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 809dfed766d..d6701cafbcc 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -336,6 +336,50 @@ static int tcp_print_conntrack(struct seq_file *s,
return seq_printf(s, "%s ", tcp_conntrack_names[state]);
}
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa,
+			 const struct ip_conntrack *ct)
+{
+	struct nfattr *nest_parms;
+	/* NFA_NEST does an NFA_PUT and may goto nfattr_failure: lock first. */
+	read_lock_bh(&tcp_lock);
+	nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP);
+	NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t),
+		&ct->proto.tcp.state);
+	read_unlock_bh(&tcp_lock);
+
+	NFA_NEST_END(skb, nest_parms);
+	return 0;
+
+nfattr_failure:
+	read_unlock_bh(&tcp_lock);
+	return -1;
+}
+
+static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct)
+{
+	struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1];
+	struct nfattr *tb[CTA_PROTOINFO_TCP_MAX];
+
+	/* No TCP protoinfo attribute supplied: nothing to parse or update. */
+	if (!attr)
+		return 0;
+	if (nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr) < 0)
+		goto nfattr_failure;
+	if (!tb[CTA_PROTOINFO_TCP_STATE-1])
+		return -EINVAL;
+	write_lock_bh(&tcp_lock);
+	ct->proto.tcp.state =
+		*(u_int8_t *)NFA_DATA(tb[CTA_PROTOINFO_TCP_STATE-1]);
+	write_unlock_bh(&tcp_lock);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+#endif
+
static unsigned int get_conntrack_index(const struct tcphdr *tcph)
{
if (tcph->rst) return TCP_RST_SET;
@@ -699,7 +743,7 @@ static int tcp_in_window(struct ip_ct_tcp *state,
res = 1;
} else {
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: %s ",
before(seq, sender->td_maxend + 1) ?
after(end, sender->td_end - receiver->td_maxwin - 1) ?
@@ -798,7 +842,7 @@ static int tcp_error(struct sk_buff *skb,
sizeof(_tcph), &_tcph);
if (th == NULL) {
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: short packet ");
return -NF_ACCEPT;
}
@@ -806,7 +850,7 @@ static int tcp_error(struct sk_buff *skb,
/* Not whole TCP header or malformed packet */
if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: truncated/malformed packet ");
return -NF_ACCEPT;
}
@@ -823,7 +867,7 @@ static int tcp_error(struct sk_buff *skb,
skb->ip_summed == CHECKSUM_HW ? skb->csum
: skb_checksum(skb, iph->ihl*4, tcplen, 0))) {
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: bad TCP checksum ");
return -NF_ACCEPT;
}
@@ -832,7 +876,7 @@ static int tcp_error(struct sk_buff *skb,
tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
if (!tcp_valid_flags[tcpflags]) {
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: invalid TCP flag combination ");
return -NF_ACCEPT;
}
@@ -880,8 +924,9 @@ static int tcp_packet(struct ip_conntrack *conntrack,
*/
write_unlock_bh(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
- "ip_ct_tcp: killing out of sync session ");
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ NULL, "ip_ct_tcp: "
+ "killing out of sync session ");
if (del_timer(&conntrack->timeout))
conntrack->timeout.function((unsigned long)
conntrack);
@@ -895,7 +940,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
write_unlock_bh(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: invalid packet ignored ");
return NF_ACCEPT;
case TCP_CONNTRACK_MAX:
@@ -905,7 +950,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
old_state);
write_unlock_bh(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: invalid state ");
return -NF_ACCEPT;
case TCP_CONNTRACK_SYN_SENT:
@@ -926,7 +971,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
write_unlock_bh(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL,
- "ip_ct_tcp: invalid SYN");
+ NULL, "ip_ct_tcp: invalid SYN");
return -NF_ACCEPT;
}
case TCP_CONNTRACK_CLOSE:
@@ -973,6 +1018,10 @@ static int tcp_packet(struct ip_conntrack *conntrack,
? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
write_unlock_bh(&tcp_lock);
+ ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+ if (new_state != old_state)
+ ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
+
if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
/* If only reply is a RST, we can consider ourselves not to
have an established connection: this is a fairly common
@@ -991,7 +1040,8 @@ static int tcp_packet(struct ip_conntrack *conntrack,
/* Set ASSURED if we see see valid ack in ESTABLISHED
after SYN_RECV or a valid answer for a picked up
connection. */
- set_bit(IPS_ASSURED_BIT, &conntrack->status);
+ set_bit(IPS_ASSURED_BIT, &conntrack->status);
+ ip_conntrack_event_cache(IPCT_STATUS, skb);
}
ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
@@ -1096,4 +1146,11 @@ struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
.packet = tcp_packet,
.new = tcp_new,
.error = tcp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .to_nfattr = tcp_to_nfattr,
+ .from_nfattr = nfattr_to_tcp,
+ .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
+ .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 8c1eaba098d..f2dcac7c766 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -73,7 +73,8 @@ static int udp_packet(struct ip_conntrack *conntrack,
ip_ct_refresh_acct(conntrack, ctinfo, skb,
ip_ct_udp_timeout_stream);
/* Also, more likely to be important, and not a probe */
- set_bit(IPS_ASSURED_BIT, &conntrack->status);
+ if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status))
+ ip_conntrack_event_cache(IPCT_STATUS, skb);
} else
ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
@@ -97,7 +98,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr);
if (hdr == NULL) {
if (LOG_INVALID(IPPROTO_UDP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_udp: short packet ");
return -NF_ACCEPT;
}
@@ -105,7 +106,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
/* Truncated/malformed packets */
if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
if (LOG_INVALID(IPPROTO_UDP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_udp: truncated/malformed packet ");
return -NF_ACCEPT;
}
@@ -125,7 +126,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
skb->ip_summed == CHECKSUM_HW ? skb->csum
: skb_checksum(skb, iph->ihl*4, udplen, 0))) {
if (LOG_INVALID(IPPROTO_UDP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_udp: bad UDP checksum ");
return -NF_ACCEPT;
}
@@ -144,4 +145,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_udp =
.packet = udp_packet,
.new = udp_new,
.error = udp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
+ .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 61798c46e91..dd476b191f4 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -5,7 +5,7 @@
*/
/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -147,8 +147,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (DIRECTION(hash))
return 0;
- proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.dst.protonum);
+ proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
IP_NF_ASSERT(proto);
if (seq_printf(s, "%-8s %u %ld ",
@@ -185,7 +184,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
return -ENOSPC;
#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
- if (seq_printf(s, "mark=%lu ", conntrack->mark))
+ if (seq_printf(s, "mark=%u ", conntrack->mark))
return -ENOSPC;
#endif
@@ -283,7 +282,7 @@ static int exp_seq_show(struct seq_file *s, void *v)
seq_printf(s, "proto=%u ", expect->tuple.dst.protonum);
print_tuple(s, &expect->tuple,
- ip_ct_find_proto(expect->tuple.dst.protonum));
+ __ip_conntrack_proto_find(expect->tuple.dst.protonum));
return seq_putc(s, '\n');
}
@@ -889,6 +888,7 @@ static int init_or_cleanup(int init)
return ret;
cleanup:
+ synchronize_net();
#ifdef CONFIG_SYSCTL
unregister_sysctl_table(ip_ct_sysctl_header);
cleanup_localinops:
@@ -971,6 +971,14 @@ void need_ip_conntrack(void)
{
}
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+EXPORT_SYMBOL_GPL(ip_conntrack_chain);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain);
+EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier);
+EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier);
+EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init);
+EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache);
+#endif
EXPORT_SYMBOL(ip_conntrack_protocol_register);
EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
EXPORT_SYMBOL(ip_ct_get_tuple);
@@ -981,13 +989,17 @@ EXPORT_SYMBOL(need_ip_conntrack);
EXPORT_SYMBOL(ip_conntrack_helper_register);
EXPORT_SYMBOL(ip_conntrack_helper_unregister);
EXPORT_SYMBOL(ip_ct_iterate_cleanup);
-EXPORT_SYMBOL(ip_ct_refresh_acct);
-EXPORT_SYMBOL(ip_ct_protos);
-EXPORT_SYMBOL(ip_ct_find_proto);
+EXPORT_SYMBOL(__ip_ct_refresh_acct);
+
EXPORT_SYMBOL(ip_conntrack_expect_alloc);
EXPORT_SYMBOL(ip_conntrack_expect_put);
+EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_find);
EXPORT_SYMBOL(ip_conntrack_expect_related);
EXPORT_SYMBOL(ip_conntrack_unexpect_related);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_list);
+EXPORT_SYMBOL_GPL(ip_ct_unlink_expect);
+
EXPORT_SYMBOL(ip_conntrack_tuple_taken);
EXPORT_SYMBOL(ip_ct_gather_frags);
EXPORT_SYMBOL(ip_conntrack_htable_size);
@@ -995,7 +1007,28 @@ EXPORT_SYMBOL(ip_conntrack_lock);
EXPORT_SYMBOL(ip_conntrack_hash);
EXPORT_SYMBOL(ip_conntrack_untracked);
EXPORT_SYMBOL_GPL(ip_conntrack_find_get);
-EXPORT_SYMBOL_GPL(ip_conntrack_put);
#ifdef CONFIG_IP_NF_NAT_NEEDED
EXPORT_SYMBOL(ip_conntrack_tcp_update);
#endif
+
+EXPORT_SYMBOL_GPL(ip_conntrack_flush);
+EXPORT_SYMBOL_GPL(__ip_conntrack_find);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_alloc);
+EXPORT_SYMBOL_GPL(ip_conntrack_free);
+EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert);
+
+EXPORT_SYMBOL_GPL(ip_ct_remove_expectations);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get);
+EXPORT_SYMBOL_GPL(ip_conntrack_helper_put);
+EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
+EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
+EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
+EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple);
+#endif
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
index f8ff170f390..a78736b8525 100644
--- a/net/ipv4/netfilter/ip_conntrack_tftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_tftp.c
@@ -26,9 +26,9 @@ MODULE_DESCRIPTION("tftp connection tracking helper");
MODULE_LICENSE("GPL");
#define MAX_PORTS 8
-static int ports[MAX_PORTS];
+static unsigned short ports[MAX_PORTS];
static int ports_c;
-module_param_array(ports, int, &ports_c, 0400);
+module_param_array(ports, ushort, &ports_c, 0400);
MODULE_PARM_DESC(ports, "port numbers of tftp servers");
#if 0
@@ -75,6 +75,7 @@ static int tftp_help(struct sk_buff **pskb,
exp->mask.dst.u.udp.port = 0xffff;
exp->mask.dst.protonum = 0xff;
exp->expectfn = NULL;
+ exp->flags = 0;
DEBUGP("expect: ");
DUMP_TUPLE(&exp->tuple);
@@ -99,7 +100,7 @@ static int tftp_help(struct sk_buff **pskb,
}
static struct ip_conntrack_helper tftp[MAX_PORTS];
-static char tftp_names[MAX_PORTS][10];
+static char tftp_names[MAX_PORTS][sizeof("tftp-65535")];
static void fini(void)
{
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 739b6dde1c8..c5e3abd2467 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -47,8 +47,41 @@ DEFINE_RWLOCK(ip_nat_lock);
static unsigned int ip_nat_htable_size;
static struct list_head *bysource;
+
+#define MAX_IP_NAT_PROTO 256
struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
+static inline struct ip_nat_protocol *
+__ip_nat_proto_find(u_int8_t protonum)
+{
+ return ip_nat_protos[protonum];
+}
+
+struct ip_nat_protocol *
+ip_nat_proto_find_get(u_int8_t protonum)
+{
+ struct ip_nat_protocol *p;
+
+ /* we need to disable preemption to make sure 'p' doesn't get
+ * removed until we've grabbed the reference */
+ preempt_disable();
+ p = __ip_nat_proto_find(protonum);
+ if (p) {
+ if (!try_module_get(p->me))
+ p = &ip_nat_unknown_protocol;
+ }
+ preempt_enable();
+
+ return p;
+}
+EXPORT_SYMBOL_GPL(ip_nat_proto_find_get);
+
+void
+ip_nat_proto_put(struct ip_nat_protocol *p)
+{
+ module_put(p->me);
+}
+EXPORT_SYMBOL_GPL(ip_nat_proto_put);
/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
@@ -80,6 +113,7 @@ ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
oldcheck^0xFFFF));
}
+EXPORT_SYMBOL(ip_nat_cheat_check);
/* Is this tuple already taken? (not by us) */
int
@@ -96,6 +130,7 @@ ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
invert_tuplepr(&reply, tuple);
return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
}
+EXPORT_SYMBOL(ip_nat_used_tuple);
/* If we source map this tuple so reply looks like reply_tuple, will
* that meet the constraints of range. */
@@ -103,7 +138,8 @@ static int
in_range(const struct ip_conntrack_tuple *tuple,
const struct ip_nat_range *range)
{
- struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);
+ struct ip_nat_protocol *proto =
+ __ip_nat_proto_find(tuple->dst.protonum);
/* If we are supposed to map IPs, then we must be in the
range specified, otherwise let this drag us onto a new src IP. */
@@ -216,8 +252,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
struct ip_conntrack *conntrack,
enum ip_nat_manip_type maniptype)
{
- struct ip_nat_protocol *proto
- = ip_nat_find_proto(orig_tuple->dst.protonum);
+ struct ip_nat_protocol *proto;
/* 1) If this srcip/proto/src-proto-part is currently mapped,
and that same mapping gives a unique tuple within the given
@@ -242,14 +277,20 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
/* 3) The per-protocol part of the manip is made to map into
the range to make a unique tuple. */
+ proto = ip_nat_proto_find_get(orig_tuple->dst.protonum);
+
/* Only bother mapping if it's not already in range and unique */
if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
|| proto->in_range(tuple, maniptype, &range->min, &range->max))
- && !ip_nat_used_tuple(tuple, conntrack))
+ && !ip_nat_used_tuple(tuple, conntrack)) {
+ ip_nat_proto_put(proto);
return;
+ }
/* Last change: get protocol to try to obtain unique tuple. */
proto->unique_tuple(tuple, range, maniptype, conntrack);
+
+ ip_nat_proto_put(proto);
}
unsigned int
@@ -310,6 +351,7 @@ ip_nat_setup_info(struct ip_conntrack *conntrack,
return NF_ACCEPT;
}
+EXPORT_SYMBOL(ip_nat_setup_info);
/* Returns true if succeeded. */
static int
@@ -320,17 +362,20 @@ manip_pkt(u_int16_t proto,
enum ip_nat_manip_type maniptype)
{
struct iphdr *iph;
+ struct ip_nat_protocol *p;
- (*pskb)->nfcache |= NFC_ALTERED;
- if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
+ if (!skb_make_writable(pskb, iphdroff + sizeof(*iph)))
return 0;
iph = (void *)(*pskb)->data + iphdroff;
/* Manipulate protcol part. */
- if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff,
- target, maniptype))
+ p = ip_nat_proto_find_get(proto);
+ if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) {
+ ip_nat_proto_put(p);
return 0;
+ }
+ ip_nat_proto_put(p);
iph = (void *)(*pskb)->data + iphdroff;
@@ -347,10 +392,10 @@ manip_pkt(u_int16_t proto,
}
/* Do packet manipulations according to ip_nat_setup_info. */
-unsigned int nat_packet(struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int hooknum,
- struct sk_buff **pskb)
+unsigned int ip_nat_packet(struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int hooknum,
+ struct sk_buff **pskb)
{
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
unsigned long statusbit;
@@ -377,12 +422,13 @@ unsigned int nat_packet(struct ip_conntrack *ct,
}
return NF_ACCEPT;
}
+EXPORT_SYMBOL_GPL(ip_nat_packet);
/* Dir is direction ICMP is coming from (opposite to packet it contains) */
-int icmp_reply_translation(struct sk_buff **pskb,
- struct ip_conntrack *ct,
- enum ip_nat_manip_type manip,
- enum ip_conntrack_dir dir)
+int ip_nat_icmp_reply_translation(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_nat_manip_type manip,
+ enum ip_conntrack_dir dir)
{
struct {
struct icmphdr icmp;
@@ -391,7 +437,7 @@ int icmp_reply_translation(struct sk_buff **pskb,
struct ip_conntrack_tuple inner, target;
int hdrlen = (*pskb)->nh.iph->ihl * 4;
- if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside)))
+ if (!skb_make_writable(pskb, hdrlen + sizeof(*inside)))
return 0;
inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
@@ -426,7 +472,8 @@ int icmp_reply_translation(struct sk_buff **pskb,
if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
sizeof(struct icmphdr) + inside->ip.ihl*4,
- &inner, ip_ct_find_proto(inside->ip.protocol)))
+ &inner,
+ __ip_conntrack_proto_find(inside->ip.protocol)))
return 0;
/* Change inner back to look like incoming packet. We do the
@@ -468,6 +515,7 @@ int icmp_reply_translation(struct sk_buff **pskb,
return 1;
}
+EXPORT_SYMBOL_GPL(ip_nat_icmp_reply_translation);
/* Protocol registration. */
int ip_nat_protocol_register(struct ip_nat_protocol *proto)
@@ -484,6 +532,7 @@ int ip_nat_protocol_register(struct ip_nat_protocol *proto)
write_unlock_bh(&ip_nat_lock);
return ret;
}
+EXPORT_SYMBOL(ip_nat_protocol_register);
/* Noone stores the protocol anywhere; simply delete it. */
void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
@@ -495,8 +544,54 @@ void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
/* Someone could be still looking at the proto in a bh. */
synchronize_net();
}
+EXPORT_SYMBOL(ip_nat_protocol_unregister);
+
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+int
+ip_nat_port_range_to_nfattr(struct sk_buff *skb,
+ const struct ip_nat_range *range)
+{
+ NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(u_int16_t),
+ &range->min.tcp.port);
+ NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(u_int16_t),
+ &range->max.tcp.port);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+int
+ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range)
+{
+ int ret = 0;
+
+ /* we have to return whether we actually parsed something or not */
+
+ if (tb[CTA_PROTONAT_PORT_MIN-1]) {
+ ret = 1;
+ range->min.tcp.port =
+ *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]);
+ }
+
+ if (!tb[CTA_PROTONAT_PORT_MAX-1]) {
+ if (ret)
+ range->max.tcp.port = range->min.tcp.port;
+ } else {
+ ret = 1;
+ range->max.tcp.port =
+ *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ip_nat_port_nfattr_to_range);
+EXPORT_SYMBOL_GPL(ip_nat_port_range_to_nfattr);
+#endif
-int __init ip_nat_init(void)
+static int __init ip_nat_init(void)
{
size_t i;
@@ -538,10 +633,14 @@ static int clean_nat(struct ip_conntrack *i, void *data)
return 0;
}
-/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
-void ip_nat_cleanup(void)
+static void __exit ip_nat_cleanup(void)
{
ip_ct_iterate_cleanup(&clean_nat, NULL);
ip_conntrack_destroyed = NULL;
vfree(bysource);
}
+
+MODULE_LICENSE("GPL");
+
+module_init(ip_nat_init);
+module_exit(ip_nat_cleanup);
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index 158f34f32c0..5d506e0564d 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -168,7 +168,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
struct tcphdr *tcph;
int datalen;
- if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ if (!skb_make_writable(pskb, (*pskb)->len))
return 0;
if (rep_len > match_len
@@ -199,6 +199,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
}
return 1;
}
+EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
/* Generic function for mangling variable-length address changes inside
* NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
@@ -228,7 +229,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb,
match_offset + match_len)
return 0;
- if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ if (!skb_make_writable(pskb, (*pskb)->len))
return 0;
if (rep_len > match_len
@@ -256,6 +257,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb,
return 1;
}
+EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
/* Adjust one found SACK option including checksum correction */
static void
@@ -315,7 +317,7 @@ ip_nat_sack_adjust(struct sk_buff **pskb,
optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr);
optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4;
- if (!skb_ip_make_writable(pskb, optend))
+ if (!skb_make_writable(pskb, optend))
return 0;
dir = CTINFO2DIR(ctinfo);
@@ -363,7 +365,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb,
this_way = &ct->nat.info.seq[dir];
other_way = &ct->nat.info.seq[!dir];
- if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
+ if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
return 0;
tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
@@ -399,6 +401,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb,
return 1;
}
+EXPORT_SYMBOL(ip_nat_seq_adjust);
/* Setup NAT on this expected conntrack so it follows master. */
/* If we fail to get a free NAT slot, we'll get dropped on confirm */
@@ -425,3 +428,4 @@ void ip_nat_follow_master(struct ip_conntrack *ct,
/* hook doesn't matter, but it has to do destination manip */
ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
}
+EXPORT_SYMBOL(ip_nat_follow_master);
diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c
new file mode 100644
index 00000000000..3cdd0684d30
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c
@@ -0,0 +1,401 @@
+/*
+ * ip_nat_pptp.c - Version 3.0
+ *
+ * NAT support for PPTP (Point to Point Tunneling Protocol).
+ * PPTP is a protocol for creating virtual private networks.
+ * It is a specification defined by Microsoft and some vendors
+ * working with Microsoft. PPTP is built on top of a modified
+ * version of the Internet Generic Routing Encapsulation Protocol.
+ * GRE is defined in RFC 1701 and RFC 1702. Documentation of
+ * PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * TODO: - NAT to a unique tuple, not to TCP source port
+ * (needs netfilter tuple reservation)
+ *
+ * Changes:
+ * 2002-02-10 - Version 1.3
+ * - Use ip_nat_mangle_tcp_packet() because of cloned skb's
+ * in local connections (Philip Craig <philipc@snapgear.com>)
+ * - add checks for magicCookie and pptp version
+ * - make argument list of pptp_{out,in}bound_packet() shorter
+ * - move to C99 style initializers
+ * - print version number at module loadtime
+ * 2003-09-22 - Version 1.5
+ * - use SNATed tcp sourceport as callid, since we get called before
+ * TCP header is mangled (Philip Craig <philipc@snapgear.com>)
+ * 2004-10-22 - Version 2.0
+ * - kernel 2.6.x version
+ * 2005-06-10 - Version 3.0
+ * - kernel >= 2.6.11 version,
+ * funded by Oxcoda NetBox Blue (http://www.netboxblue.com/)
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_nat_pptp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
+#include <linux/netfilter_ipv4/ip_conntrack_pptp.h>
+
+#define IP_NAT_PPTP_VERSION "3.0"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP");
+
+
+#if 0
+extern const char *pptp_msg_name[];
+#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, \
+ __FUNCTION__, ## args)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/*
+ * Called for a new GRE conntrack that matched a PPTP expectation.
+ *
+ * Builds the GRE tuple of the opposite direction from the master (control)
+ * connection, removes the expectation registered for it if one is found,
+ * and finally lets the new conntrack's NAT setup follow its master.
+ */
+static void pptp_nat_expected(struct ip_conntrack *ct,
+			      struct ip_conntrack_expect *exp)
+{
+	struct ip_conntrack *master = ct->master;
+	struct ip_ct_pptp_master *ct_pptp_info = &master->help.ct_pptp_info;
+	struct ip_nat_pptp *nat_pptp_info = &master->nat.help.nat_pptp_info;
+	const struct ip_conntrack_tuple *peer;
+	struct ip_conntrack_expect *other_exp;
+	struct ip_conntrack_tuple t;
+
+	/* And here goes the grand finale of corrosion... */
+
+	if (exp->dir == IP_CT_DIR_ORIGINAL) {
+		DEBUGP("we are PNS->PAC\n");
+		/* therefore, build tuple for PAC->PNS */
+		peer = &master->tuplehash[IP_CT_DIR_REPLY].tuple;
+		t.src.u.gre.key = htons(ct_pptp_info->pac_call_id);
+		t.dst.u.gre.key = htons(ct_pptp_info->pns_call_id);
+	} else {
+		DEBUGP("we are PAC->PNS\n");
+		/* build tuple for PNS->PAC */
+		peer = &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+		t.src.u.gre.key = htons(nat_pptp_info->pns_call_id);
+		t.dst.u.gre.key = htons(nat_pptp_info->pac_call_id);
+	}
+
+	/* addresses and protocol are filled in the same way for both cases */
+	t.src.ip = peer->src.ip;
+	t.dst.ip = peer->dst.ip;
+	t.dst.protonum = IPPROTO_GRE;
+
+	DEBUGP("trying to unexpect other dir: ");
+	DUMP_TUPLE(&t);
+	other_exp = ip_conntrack_expect_find(&t);
+	if (!other_exp) {
+		DEBUGP("not found!\n");
+	} else {
+		ip_conntrack_unexpect_related(other_exp);
+		ip_conntrack_expect_put(other_exp);
+		DEBUGP("success\n");
+	}
+
+	ip_nat_follow_master(ct, exp);
+}
+
+/* outbound packets == from PNS to PAC */
+/*
+ * Rewrite the call ID carried in a PPTP control message.
+ *
+ * Only OUT_CALL_REQUEST, IN_CALL_REPLY and CALL_CLEAR_REQUEST are altered;
+ * every other message type is accepted untouched.
+ *
+ * Returns NF_ACCEPT when nothing had to be changed or the packet was
+ * mangled successfully, NF_DROP when ip_nat_mangle_tcp_packet() failed.
+ */
+static int
+pptp_outbound_pkt(struct sk_buff **pskb,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq)
+
+{
+ struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info;
+ struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info;
+
+ u_int16_t msg, *cid = NULL, new_callid;
+
+ /* default replacement: the PNS call ID currently stored in ct info */
+ new_callid = htons(ct_pptp_info->pns_call_id);
+
+ switch (msg = ntohs(ctlh->messageType)) {
+ case PPTP_OUT_CALL_REQUEST:
+ cid = &pptpReq->ocreq.callID;
+ /* FIXME: ideally we would want to reserve a call ID
+ * here. current netfilter NAT core is not able to do
+ * this :( For now we use TCP source port. This breaks
+ * multiple calls within one control session */
+
+ /* save original call ID in nat_info */
+ nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id;
+
+ /* don't use tcph->source since we are at a DSTmanip
+ * hook (e.g. PREROUTING) and pkt is not mangled yet */
+ new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
+
+ /* save new call ID in ct info */
+ ct_pptp_info->pns_call_id = ntohs(new_callid);
+ break;
+ case PPTP_IN_CALL_REPLY:
+ /* NOTE(review): the reply's call ID is read through the icreq
+ * member -- confirm it has the same layout as the reply struct */
+ cid = &pptpReq->icreq.callID;
+ break;
+ case PPTP_CALL_CLEAR_REQUEST:
+ cid = &pptpReq->clrreq.callID;
+ break;
+ default:
+ DEBUGP("unknown outbound packet 0x%04x:%s\n", msg,
+ (msg <= PPTP_MSG_MAX)?
+ pptp_msg_name[msg]:pptp_msg_name[0]);
+ /* fall through */
+
+ case PPTP_SET_LINK_INFO:
+ /* only need to NAT in case PAC is behind NAT box */
+ case PPTP_START_SESSION_REQUEST:
+ case PPTP_START_SESSION_REPLY:
+ case PPTP_STOP_SESSION_REQUEST:
+ case PPTP_STOP_SESSION_REPLY:
+ case PPTP_ECHO_REQUEST:
+ case PPTP_ECHO_REPLY:
+ /* no need to alter packet */
+ return NF_ACCEPT;
+ }
+
+ /* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass
+ * down to here */
+
+ IP_NF_ASSERT(cid);
+
+ DEBUGP("altering call id from 0x%04x to 0x%04x\n",
+ ntohs(*cid), ntohs(new_callid));
+
+ /* mangle packet */
+ /* The match offset is the distance of the call ID field from the point
+ * sizeof(struct pptp_pkt_hdr) bytes before ctlh; the two bytes found
+ * there are replaced by new_callid, so the packet length is unchanged. */
+ if (ip_nat_mangle_tcp_packet(pskb, ct, ctinfo,
+ (void *)cid - ((void *)ctlh - sizeof(struct pptp_pkt_hdr)),
+ sizeof(new_callid),
+ (char *)&new_callid,
+ sizeof(new_callid)) == 0)
+ return NF_DROP;
+
+ return NF_ACCEPT;
+}
+
+/*
+ * Adjust the two GRE expectations of a PPTP call for NAT and register them,
+ * together with the GRE keymap entries of the master conntrack.
+ *
+ * expect_orig is the expectation for the PNS->PAC direction, expect_reply
+ * the one for PAC->PNS.
+ *
+ * Returns 0 on success and 1 on failure; on failure everything registered
+ * by this function so far is unregistered again.
+ */
+static int
+pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
+ struct ip_conntrack_expect *expect_reply)
+{
+ struct ip_ct_pptp_master *ct_pptp_info =
+ &expect_orig->master->help.ct_pptp_info;
+ struct ip_nat_pptp *nat_pptp_info =
+ &expect_orig->master->nat.help.nat_pptp_info;
+
+ struct ip_conntrack *ct = expect_orig->master;
+
+ struct ip_conntrack_tuple inv_t;
+ struct ip_conntrack_tuple *orig_t, *reply_t;
+
+ /* save original PAC call ID in nat_info */
+ nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id;
+
+ /* alter expectation */
+ orig_t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ reply_t = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+ /* alter expectation for PNS->PAC direction */
+ /* NOTE(review): the inv_t built in this section is not used before
+ * the second invert_tuplepr() call below rebuilds it -- confirm
+ * whether a keymap entry for this direction was intended */
+ invert_tuplepr(&inv_t, &expect_orig->tuple);
+ expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id);
+ expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id);
+ expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id);
+ inv_t.src.ip = reply_t->src.ip;
+ inv_t.dst.ip = reply_t->dst.ip;
+ inv_t.src.u.gre.key = htons(nat_pptp_info->pac_call_id);
+ inv_t.dst.u.gre.key = htons(ct_pptp_info->pns_call_id);
+
+ /* ip_conntrack_expect_related() returning zero is treated as success */
+ if (!ip_conntrack_expect_related(expect_orig)) {
+ DEBUGP("successfully registered expect\n");
+ } else {
+ DEBUGP("can't expect_related(expect_orig)\n");
+ return 1;
+ }
+
+ /* alter expectation for PAC->PNS direction */
+ invert_tuplepr(&inv_t, &expect_reply->tuple);
+ expect_reply->saved_proto.gre.key = htons(nat_pptp_info->pns_call_id);
+ expect_reply->tuple.src.u.gre.key = htons(nat_pptp_info->pac_call_id);
+ expect_reply->tuple.dst.u.gre.key = htons(ct_pptp_info->pns_call_id);
+ inv_t.src.ip = orig_t->src.ip;
+ inv_t.dst.ip = orig_t->dst.ip;
+ inv_t.src.u.gre.key = htons(nat_pptp_info->pns_call_id);
+ inv_t.dst.u.gre.key = htons(ct_pptp_info->pac_call_id);
+
+ if (!ip_conntrack_expect_related(expect_reply)) {
+ DEBUGP("successfully registered expect\n");
+ } else {
+ DEBUGP("can't expect_related(expect_reply)\n");
+ /* undo the first registration */
+ ip_conntrack_unexpect_related(expect_orig);
+ return 1;
+ }
+
+ /* keymap entry built from the (altered) PAC->PNS expectation tuple */
+ if (ip_ct_gre_keymap_add(ct, &expect_reply->tuple, 0) < 0) {
+ DEBUGP("can't register original keymap\n");
+ ip_conntrack_unexpect_related(expect_orig);
+ ip_conntrack_unexpect_related(expect_reply);
+ return 1;
+ }
+
+ /* second keymap entry, built from the hand-edited inverse tuple */
+ if (ip_ct_gre_keymap_add(ct, &inv_t, 1) < 0) {
+ DEBUGP("can't register reply keymap\n");
+ ip_conntrack_unexpect_related(expect_orig);
+ ip_conntrack_unexpect_related(expect_reply);
+ ip_ct_gre_keymap_destroy(ct);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* inbound packets == from PAC to PNS */
+/*
+ * Rewrite the peer's call ID carried in a PPTP control message so that it
+ * holds the PNS call ID saved in the NAT info.
+ *
+ * Only OUT_CALL_REPLY, IN_CALL_CONNECT, WAN_ERROR_NOTIFY,
+ * CALL_DISCONNECT_NOTIFY and SET_LINK_INFO are altered; every other
+ * message type (including IN_CALL_REQUEST) is accepted untouched.
+ *
+ * Returns NF_ACCEPT when nothing had to be changed or the packet was
+ * mangled successfully, NF_DROP when ip_nat_mangle_tcp_packet() failed.
+ */
+static int
+pptp_inbound_pkt(struct sk_buff **pskb,
+		 struct ip_conntrack *ct,
+		 enum ip_conntrack_info ctinfo,
+		 struct PptpControlHeader *ctlh,
+		 union pptp_ctrl_union *pptpReq)
+{
+	struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info;
+	u_int16_t msg, new_cid = 0, new_pcid, *pcid = NULL, *cid = NULL;
+
+	new_pcid = htons(nat_pptp_info->pns_call_id);
+
+	switch (msg = ntohs(ctlh->messageType)) {
+	case PPTP_OUT_CALL_REPLY:
+		pcid = &pptpReq->ocack.peersCallID;
+		cid = &pptpReq->ocack.callID;
+		break;
+	case PPTP_IN_CALL_CONNECT:
+		pcid = &pptpReq->iccon.peersCallID;
+		break;
+	case PPTP_IN_CALL_REQUEST:
+		/* only need to nat in case PAC is behind NAT box.
+		 * No peer call ID is selected for this message, so leave
+		 * here: falling through to the mangling code with a NULL
+		 * pcid would compute a bogus match offset. */
+		return NF_ACCEPT;
+	case PPTP_WAN_ERROR_NOTIFY:
+		pcid = &pptpReq->wanerr.peersCallID;
+		break;
+	case PPTP_CALL_DISCONNECT_NOTIFY:
+		pcid = &pptpReq->disc.callID;
+		break;
+	case PPTP_SET_LINK_INFO:
+		pcid = &pptpReq->setlink.peersCallID;
+		break;
+
+	default:
+		DEBUGP("unknown inbound packet %s\n", (msg <= PPTP_MSG_MAX)?
+			pptp_msg_name[msg]:pptp_msg_name[0]);
+		/* fall through */
+
+	case PPTP_START_SESSION_REQUEST:
+	case PPTP_START_SESSION_REPLY:
+	case PPTP_STOP_SESSION_REQUEST:
+	case PPTP_STOP_SESSION_REPLY:
+	case PPTP_ECHO_REQUEST:
+	case PPTP_ECHO_REPLY:
+		/* no need to alter packet */
+		return NF_ACCEPT;
+	}
+
+	/* only OUT_CALL_REPLY, IN_CALL_CONNECT, WAN_ERROR_NOTIFY,
+	 * CALL_DISCONNECT_NOTIFY, SET_LINK_INFO pass down here */
+
+	/* mangle packet */
+	IP_NF_ASSERT(pcid);
+	DEBUGP("altering peer call id from 0x%04x to 0x%04x\n",
+		ntohs(*pcid), ntohs(new_pcid));
+
+	/* ip_nat_mangle_tcp_packet() reports failure with 0, the same
+	 * convention pptp_outbound_pkt() checks for */
+	if (ip_nat_mangle_tcp_packet(pskb, ct, ctinfo,
+		(void *)pcid - ((void *)ctlh - sizeof(struct pptp_pkt_hdr)),
+				     sizeof(new_pcid), (char *)&new_pcid,
+				     sizeof(new_pcid)) == 0)
+		return NF_DROP;
+
+	/* new_cid is never set to a non-zero value at present, so the call ID
+	 * itself is left alone; the code is kept for when that changes */
+	if (new_cid) {
+		IP_NF_ASSERT(cid);
+		DEBUGP("altering call id from 0x%04x to 0x%04x\n",
+			ntohs(*cid), ntohs(new_cid));
+		if (ip_nat_mangle_tcp_packet(pskb, ct, ctinfo,
+			(void *)cid - ((void *)ctlh - sizeof(struct pptp_pkt_hdr)),
+					     sizeof(new_cid),
+					     (char *)&new_cid,
+					     sizeof(new_cid)) == 0)
+			return NF_DROP;
+	}
+
+	/* great, at least we don't need to resize packets */
+	return NF_ACCEPT;
+}
+
+
+extern int __init ip_nat_proto_gre_init(void);
+extern void __exit ip_nat_proto_gre_fini(void);
+
+/*
+ * Module init: register the GRE NAT protocol, then install the four PPTP
+ * NAT hook function pointers.
+ *
+ * Returns the (negative) result of ip_nat_proto_gre_init() if that fails,
+ * 0 otherwise.
+ */
+static int __init init(void)
+{
+ int ret;
+
+ DEBUGP("%s: registering NAT helper\n", __FILE__);
+
+ ret = ip_nat_proto_gre_init();
+ if (ret < 0)
+ return ret;
+
+ /* each hook pointer must still be unset; anything else is a bug */
+ BUG_ON(ip_nat_pptp_hook_outbound);
+ ip_nat_pptp_hook_outbound = &pptp_outbound_pkt;
+
+ BUG_ON(ip_nat_pptp_hook_inbound);
+ ip_nat_pptp_hook_inbound = &pptp_inbound_pkt;
+
+ BUG_ON(ip_nat_pptp_hook_exp_gre);
+ ip_nat_pptp_hook_exp_gre = &pptp_exp_gre;
+
+ BUG_ON(ip_nat_pptp_hook_expectfn);
+ ip_nat_pptp_hook_expectfn = &pptp_nat_expected;
+
+ printk("ip_nat_pptp version %s loaded\n", IP_NAT_PPTP_VERSION);
+ return 0;
+}
+
+/*
+ * Module exit: clear the four PPTP NAT hook pointers (in the reverse of
+ * the order init() set them), unregister the GRE NAT protocol and wait in
+ * synchronize_net() before returning.
+ */
+static void __exit fini(void)
+{
+ DEBUGP("cleanup_module\n" );
+
+ ip_nat_pptp_hook_expectfn = NULL;
+ ip_nat_pptp_hook_exp_gre = NULL;
+ ip_nat_pptp_hook_inbound = NULL;
+ ip_nat_pptp_hook_outbound = NULL;
+
+ ip_nat_proto_gre_fini();
+ /* Make sure no one calls it, meanwhile */
+ synchronize_net();
+
+ printk("ip_nat_pptp version %s unloaded\n", IP_NAT_PPTP_VERSION);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c
new file mode 100644
index 00000000000..7c128540167
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_gre.c
@@ -0,0 +1,214 @@
+/*
+ * ip_nat_proto_gre.c - Version 2.0
+ *
+ * NAT protocol helper module for GRE.
+ *
+ * GRE is a generic encapsulation protocol, which is generally not very
+ * suited for NAT, as it has no protocol-specific part as port numbers.
+ *
+ * It has an optional key field, which may help us distinguishing two
+ * connections between the same two hosts.
+ *
+ * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
+ *
+ * PPTP is built on top of a modified version of GRE, and has a mandatory
+ * field called "CallID", which serves us for the same purpose as the key
+ * field in plain GRE.
+ *
+ * Documentation about PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
+
+#if 0
+#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, \
+ __FUNCTION__, ## args)
+#else
+#define DEBUGP(x, args...)
+#endif
+
+/*
+ * Is the GRE key of the tuple inside the range [min, max]?
+ *
+ * The source key is checked for IP_NAT_MANIP_SRC, the destination key
+ * otherwise.  Returns non-zero if the key lies within the range.
+ */
+static int
+gre_in_range(const struct ip_conntrack_tuple *tuple,
+	     enum ip_nat_manip_type maniptype,
+	     const union ip_conntrack_manip_proto *min,
+	     const union ip_conntrack_manip_proto *max)
+{
+	/* the tuple's GRE key is 16 bits wide and kept in network byte
+	 * order, so it has to be converted with ntohs(), not ntohl() */
+	u_int16_t key;
+
+	if (maniptype == IP_NAT_MANIP_SRC)
+		key = tuple->src.u.gre.key;
+	else
+		key = tuple->dst.u.gre.key;
+
+	return ntohs(key) >= ntohs(min->gre.key)
+		&& ntohs(key) <= ntohs(max->gre.key);
+}
+
+/*
+ * Generate a unique tuple by trying GRE keys until one is unused.
+ *
+ * The source key is rewritten for IP_NAT_MANIP_SRC, the destination key
+ * otherwise.  Candidate keys come from range->min/max when
+ * IP_NAT_RANGE_PROTO_SPECIFIED is set, else from 1..0xffff.
+ *
+ * Returns 1 once ip_nat_used_tuple() reports the tuple as free, 0 if every
+ * key of the range is in use.
+ */
+static int
+gre_unique_tuple(struct ip_conntrack_tuple *tuple,
+		 const struct ip_nat_range *range,
+		 enum ip_nat_manip_type maniptype,
+		 const struct ip_conntrack *conntrack)
+{
+	/* rotating start offset, shared between calls */
+	static u_int16_t key;
+	u_int16_t *keyptr;
+	unsigned int min, i, range_size;
+
+	if (maniptype == IP_NAT_MANIP_SRC)
+		keyptr = &tuple->src.u.gre.key;
+	else
+		keyptr = &tuple->dst.u.gre.key;
+
+	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
+		DEBUGP("%p: NATing GRE PPTP\n", conntrack);
+		min = 1;
+		range_size = 0xffff;
+	} else {
+		/* keys are 16 bit, network byte order: ntohs(), not ntohl() */
+		min = ntohs(range->min.gre.key);
+		range_size = ntohs(range->max.gre.key) - min + 1;
+	}
+
+	DEBUGP("min = %u, range_size = %u\n", min, range_size);
+
+	for (i = 0; i < range_size; i++, key++) {
+		/* htonl() here would be truncated by the 16-bit store and
+		 * leave a zero key on little-endian machines */
+		*keyptr = htons(min + key % range_size);
+		if (!ip_nat_used_tuple(tuple, conntrack))
+			return 1;
+	}
+
+	DEBUGP("%p: no NAT mapping\n", conntrack);
+
+	return 0;
+}
+
+/*
+ * Manipulate a GRE packet according to maniptype.
+ *
+ * Only IP_NAT_MANIP_DST changes the packet: the key (plain GRE, version 0)
+ * or the call ID (PPTP GRE) is replaced by the destination key of the
+ * tuple.  Source manipulation is a no-op because the packet carries no
+ * "source key".
+ *
+ * Returns 1 on success, 0 if the header could not be made writable or the
+ * GRE version is unknown.
+ */
+static int
+gre_manip_pkt(struct sk_buff **pskb,
+	      unsigned int iphdroff,
+	      const struct ip_conntrack_tuple *tuple,
+	      enum ip_nat_manip_type maniptype)
+{
+	struct gre_hdr *greh;
+	struct gre_hdr_pptp *pgreh;
+	struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
+	unsigned int hdroff = iphdroff + iph->ihl*4;
+
+	/* pgreh includes two optional 32bit fields which are not required
+	 * to be there.  That's where the magic '8' comes from */
+	if (!skb_make_writable(pskb, hdroff + sizeof(*pgreh)-8))
+		return 0;
+
+	/* re-read through *pskb, which skb_make_writable() was handed */
+	greh = (void *)(*pskb)->data + hdroff;
+	pgreh = (struct gre_hdr_pptp *) greh;
+
+	/* we only have destination manip of a packet, since 'source key'
+	 * is not present in the packet itself */
+	if (maniptype == IP_NAT_MANIP_DST) {
+		/* key manipulation is always dest */
+		switch (greh->version) {
+		case 0:
+			if (!greh->key) {
+				DEBUGP("can't nat GRE w/o key\n");
+				break;
+			}
+			if (greh->csum) {
+				/* FIXME: Never tested this code... */
+				*(gre_csum(greh)) =
+					ip_nat_cheat_check(~*(gre_key(greh)),
+							tuple->dst.u.gre.key,
+							*(gre_csum(greh)));
+			}
+			*(gre_key(greh)) = tuple->dst.u.gre.key;
+			break;
+		case GRE_VERSION_PPTP:
+			DEBUGP("call_id -> 0x%04x\n",
+				ntohs(tuple->dst.u.gre.key));
+			/* The tuple key is already 16 bit and in network
+			 * byte order; the former htons(ntohl(key)) produced
+			 * a zero call ID on little-endian machines */
+			pgreh->call_id = tuple->dst.u.gre.key;
+			break;
+		default:
+			DEBUGP("can't nat unknown GRE version\n");
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/*
+ * Print the GRE keys of a NAT tuple into buffer.
+ *
+ * A key is printed only when the corresponding mask key is non-zero.
+ * Returns the number of characters written.
+ */
+static unsigned int
+gre_print(char *buffer,
+	  const struct ip_conntrack_tuple *match,
+	  const struct ip_conntrack_tuple *mask)
+{
+	unsigned int len = 0;
+
+	/* keys are 16 bit, network byte order: ntohs(), not ntohl() */
+	if (mask->src.u.gre.key)
+		len += sprintf(buffer + len, "srckey=0x%x ",
+				ntohs(match->src.u.gre.key));
+
+	/* print the destination key here, not the source key a second time */
+	if (mask->dst.u.gre.key)
+		len += sprintf(buffer + len, "dstkey=0x%x ",
+				ntohs(match->dst.u.gre.key));
+
+	return len;
+}
+
+/*
+ * Print a range of GRE keys into buffer.
+ *
+ * Nothing is printed for the range min == 0, max == 0xFFFF.
+ * Returns the number of characters written.
+ */
+static unsigned int
+gre_print_range(char *buffer, const struct ip_nat_range *range)
+{
+	if (range->min.gre.key != 0
+	    || range->max.gre.key != 0xFFFF) {
+		/* keys are 16 bit, network byte order: ntohs(), not ntohl() */
+		if (range->min.gre.key == range->max.gre.key)
+			return sprintf(buffer, "key 0x%x ",
+					ntohs(range->min.gre.key));
+		else
+			/* "0x" prefix needs %x; %u printed decimal digits */
+			return sprintf(buffer, "keys 0x%x-0x%x ",
+					ntohs(range->min.gre.key),
+					ntohs(range->max.gre.key));
+	} else
+		return 0;
+}
+
+/* nat helper struct: GRE callbacks handed to the NAT protocol registry */
+static struct ip_nat_protocol gre = {
+	.name		= "GRE",
+	.protonum	= IPPROTO_GRE,
+	/* owner, as set by the other ip_nat_protocol definitions */
+	.me		= THIS_MODULE,
+	.manip_pkt	= gre_manip_pkt,
+	.in_range	= gre_in_range,
+	.unique_tuple	= gre_unique_tuple,
+	.print		= gre_print,
+	.print_range	= gre_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.range_to_nfattr	= ip_nat_port_range_to_nfattr,
+	.nfattr_to_range	= ip_nat_port_nfattr_to_range,
+#endif
+};
+
+/* Register the GRE NAT protocol; returns ip_nat_protocol_register()'s result. */
+int __init ip_nat_proto_gre_init(void)
+{
+ return ip_nat_protocol_register(&gre);
+}
+
+/* Unregister the GRE NAT protocol again. */
+void __exit ip_nat_proto_gre_fini(void)
+{
+ ip_nat_protocol_unregister(&gre);
+}
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
index 6596c9ee165..93871904399 100644
--- a/net/ipv4/netfilter/ip_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -62,7 +62,7 @@ icmp_manip_pkt(struct sk_buff **pskb,
struct icmphdr *hdr;
unsigned int hdroff = iphdroff + iph->ihl*4;
- if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
+ if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
return 0;
hdr = (struct icmphdr *)((*pskb)->data + hdroff);
@@ -106,11 +106,18 @@ icmp_print_range(char *buffer, const struct ip_nat_range *range)
else return 0;
}
-struct ip_nat_protocol ip_nat_protocol_icmp
-= { "ICMP", IPPROTO_ICMP,
- icmp_manip_pkt,
- icmp_in_range,
- icmp_unique_tuple,
- icmp_print,
- icmp_print_range
+struct ip_nat_protocol ip_nat_protocol_icmp = {
+ .name = "ICMP",
+ .protonum = IPPROTO_ICMP,
+ .me = THIS_MODULE,
+ .manip_pkt = icmp_manip_pkt,
+ .in_range = icmp_in_range,
+ .unique_tuple = icmp_unique_tuple,
+ .print = icmp_print,
+ .print_range = icmp_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .range_to_nfattr = ip_nat_port_range_to_nfattr,
+ .nfattr_to_range = ip_nat_port_nfattr_to_range,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
index a98e36d2b3c..1d381bf6857 100644
--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -12,6 +12,7 @@
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/if.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/netfilter_ipv4/ip_nat.h>
#include <linux/netfilter_ipv4/ip_nat_rule.h>
#include <linux/netfilter_ipv4/ip_nat_protocol.h>
@@ -102,7 +103,7 @@ tcp_manip_pkt(struct sk_buff **pskb,
if ((*pskb)->len >= hdroff + sizeof(struct tcphdr))
hdrsize = sizeof(struct tcphdr);
- if (!skb_ip_make_writable(pskb, hdroff + hdrsize))
+ if (!skb_make_writable(pskb, hdroff + hdrsize))
return 0;
iph = (struct iphdr *)((*pskb)->data + iphdroff);
@@ -169,11 +170,18 @@ tcp_print_range(char *buffer, const struct ip_nat_range *range)
else return 0;
}
-struct ip_nat_protocol ip_nat_protocol_tcp
-= { "TCP", IPPROTO_TCP,
- tcp_manip_pkt,
- tcp_in_range,
- tcp_unique_tuple,
- tcp_print,
- tcp_print_range
+struct ip_nat_protocol ip_nat_protocol_tcp = {
+ .name = "TCP",
+ .protonum = IPPROTO_TCP,
+ .me = THIS_MODULE,
+ .manip_pkt = tcp_manip_pkt,
+ .in_range = tcp_in_range,
+ .unique_tuple = tcp_unique_tuple,
+ .print = tcp_print,
+ .print_range = tcp_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .range_to_nfattr = ip_nat_port_range_to_nfattr,
+ .nfattr_to_range = ip_nat_port_nfattr_to_range,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
index 9f66e562566..c4906e1aa24 100644
--- a/net/ipv4/netfilter/ip_nat_proto_udp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -94,7 +94,7 @@ udp_manip_pkt(struct sk_buff **pskb,
u32 oldip, newip;
u16 *portptr, newport;
- if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
+ if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
return 0;
iph = (struct iphdr *)((*pskb)->data + iphdroff);
@@ -156,11 +156,18 @@ udp_print_range(char *buffer, const struct ip_nat_range *range)
else return 0;
}
-struct ip_nat_protocol ip_nat_protocol_udp
-= { "UDP", IPPROTO_UDP,
- udp_manip_pkt,
- udp_in_range,
- udp_unique_tuple,
- udp_print,
- udp_print_range
+struct ip_nat_protocol ip_nat_protocol_udp = {
+ .name = "UDP",
+ .protonum = IPPROTO_UDP,
+ .me = THIS_MODULE,
+ .manip_pkt = udp_manip_pkt,
+ .in_range = udp_in_range,
+ .unique_tuple = udp_unique_tuple,
+ .print = udp_print,
+ .print_range = udp_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .range_to_nfattr = ip_nat_port_range_to_nfattr,
+ .nfattr_to_range = ip_nat_port_nfattr_to_range,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
index f5525bd58d1..99bbef56f84 100644
--- a/net/ipv4/netfilter/ip_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -61,10 +61,11 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range)
}
struct ip_nat_protocol ip_nat_unknown_protocol = {
- "unknown", 0,
- unknown_manip_pkt,
- unknown_in_range,
- unknown_unique_tuple,
- unknown_print,
- unknown_print_range
+ .name = "unknown",
+ .me = THIS_MODULE,
+ .manip_pkt = unknown_manip_pkt,
+ .in_range = unknown_in_range,
+ .unique_tuple = unknown_unique_tuple,
+ .print = unknown_print,
+ .print_range = unknown_print_range
};
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
index 60d70fa41a1..cb66b8bddeb 100644
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -255,6 +255,27 @@ alloc_null_binding(struct ip_conntrack *conntrack,
return ip_nat_setup_info(conntrack, &range, hooknum);
}
+/*
+ * Set up a NULL binding for a conntrack that is already confirmed.
+ *
+ * The range is pinned to the address and protocol part found in the reply
+ * tuple: its destination for a source manip hook, its source otherwise.
+ * Returns the result of ip_nat_setup_info().  The info argument is not
+ * used.
+ */
+unsigned int
+alloc_null_binding_confirmed(struct ip_conntrack *conntrack,
+			     struct ip_nat_info *info,
+			     unsigned int hooknum)
+{
+	const struct ip_conntrack_tuple *reply
+		= &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple;
+	const int src_manip = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC);
+	u_int32_t ip = src_manip ? reply->dst.ip : reply->src.ip;
+	u_int16_t all = src_manip ? reply->dst.u.all : reply->src.u.all;
+	struct ip_nat_range range
+		= { IP_NAT_RANGE_MAP_IPS, ip, ip, { all }, { all } };
+
+	DEBUGP("Allocating NULL binding for confirmed %p (%u.%u.%u.%u)\n",
+	       conntrack, NIPQUAD(ip));
+	return ip_nat_setup_info(conntrack, &range, hooknum);
+}
+
+
int ip_nat_rule_find(struct sk_buff **pskb,
unsigned int hooknum,
const struct net_device *in,
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index 2a48b6e635a..93b2c5111bb 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -1275,7 +1275,7 @@ static int help(struct sk_buff **pskb,
return NF_DROP;
}
- if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ if (!skb_make_writable(pskb, (*pskb)->len))
return NF_DROP;
spin_lock_bh(&snmp_lock);
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index bc59d0d6e89..30cd4e18c12 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -73,8 +73,6 @@ ip_nat_fn(unsigned int hooknum,
IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
& htons(IP_MF|IP_OFFSET)));
- (*pskb)->nfcache |= NFC_UNKNOWN;
-
/* If we had a hardware checksum before, it's now invalid */
if ((*pskb)->ip_summed == CHECKSUM_HW)
if (skb_checksum_help(*pskb, (out == NULL)))
@@ -102,12 +100,16 @@ ip_nat_fn(unsigned int hooknum,
return NF_ACCEPT;
}
+ /* Don't try to NAT if this packet is not conntracked */
+ if (ct == &ip_conntrack_untracked)
+ return NF_ACCEPT;
+
switch (ctinfo) {
case IP_CT_RELATED:
case IP_CT_RELATED+IP_CT_IS_REPLY:
if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
- if (!icmp_reply_translation(pskb, ct, maniptype,
- CTINFO2DIR(ctinfo)))
+ if (!ip_nat_icmp_reply_translation(pskb, ct, maniptype,
+ CTINFO2DIR(ctinfo)))
return NF_DROP;
else
return NF_ACCEPT;
@@ -121,8 +123,12 @@ ip_nat_fn(unsigned int hooknum,
if (!ip_nat_initialized(ct, maniptype)) {
unsigned int ret;
- /* LOCAL_IN hook doesn't have a chain! */
- if (hooknum == NF_IP_LOCAL_IN)
+ if (unlikely(is_confirmed(ct)))
+ /* NAT module was loaded late */
+ ret = alloc_null_binding_confirmed(ct, info,
+ hooknum);
+ else if (hooknum == NF_IP_LOCAL_IN)
+ /* LOCAL_IN hook doesn't have a chain! */
ret = alloc_null_binding(ct, info, hooknum);
else
ret = ip_nat_rule_find(pskb, hooknum,
@@ -146,7 +152,7 @@ ip_nat_fn(unsigned int hooknum,
}
IP_NF_ASSERT(info);
- return nat_packet(ct, ctinfo, hooknum, pskb);
+ return ip_nat_packet(ct, ctinfo, hooknum, pskb);
}
static unsigned int
@@ -319,15 +325,10 @@ static int init_or_cleanup(int init)
printk("ip_nat_init: can't setup rules.\n");
goto cleanup_nothing;
}
- ret = ip_nat_init();
- if (ret < 0) {
- printk("ip_nat_init: can't setup rules.\n");
- goto cleanup_rule_init;
- }
ret = nf_register_hook(&ip_nat_in_ops);
if (ret < 0) {
printk("ip_nat_init: can't register in hook.\n");
- goto cleanup_nat;
+ goto cleanup_rule_init;
}
ret = nf_register_hook(&ip_nat_out_ops);
if (ret < 0) {
@@ -368,8 +369,6 @@ static int init_or_cleanup(int init)
nf_unregister_hook(&ip_nat_out_ops);
cleanup_inops:
nf_unregister_hook(&ip_nat_in_ops);
- cleanup_nat:
- ip_nat_cleanup();
cleanup_rule_init:
ip_nat_rule_cleanup();
cleanup_nothing:
@@ -389,12 +388,4 @@ static void __exit fini(void)
module_init(init);
module_exit(fini);
-EXPORT_SYMBOL(ip_nat_setup_info);
-EXPORT_SYMBOL(ip_nat_protocol_register);
-EXPORT_SYMBOL(ip_nat_protocol_unregister);
-EXPORT_SYMBOL(ip_nat_cheat_check);
-EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
-EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
-EXPORT_SYMBOL(ip_nat_used_tuple);
-EXPORT_SYMBOL(ip_nat_follow_master);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index eda1fba431a..36339eb39e1 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -43,17 +43,10 @@
#define NET_IPQ_QMAX 2088
#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
-struct ipq_rt_info {
- __u8 tos;
- __u32 daddr;
- __u32 saddr;
-};
-
struct ipq_queue_entry {
struct list_head list;
struct nf_info *info;
struct sk_buff *skb;
- struct ipq_rt_info rt_info;
};
typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
@@ -214,6 +207,12 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
break;
case IPQ_COPY_PACKET:
+ if (entry->skb->ip_summed == CHECKSUM_HW &&
+ (*errp = skb_checksum_help(entry->skb,
+ entry->info->outdev == NULL))) {
+ read_unlock_bh(&queue_lock);
+ return NULL;
+ }
if (copy_range == 0 || copy_range > entry->skb->len)
data_len = entry->skb->len;
else
@@ -241,8 +240,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
pmsg->packet_id = (unsigned long )entry;
pmsg->data_len = data_len;
- pmsg->timestamp_sec = entry->skb->stamp.tv_sec;
- pmsg->timestamp_usec = entry->skb->stamp.tv_usec;
+ pmsg->timestamp_sec = entry->skb->tstamp.off_sec;
+ pmsg->timestamp_usec = entry->skb->tstamp.off_usec;
pmsg->mark = entry->skb->nfmark;
pmsg->hook = entry->info->hook;
pmsg->hw_protocol = entry->skb->protocol;
@@ -281,7 +280,8 @@ nlmsg_failure:
}
static int
-ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
+ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
+ unsigned int queuenum, void *data)
{
int status = -EINVAL;
struct sk_buff *nskb;
@@ -299,14 +299,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
entry->info = info;
entry->skb = skb;
- if (entry->info->hook == NF_IP_LOCAL_OUT) {
- struct iphdr *iph = skb->nh.iph;
-
- entry->rt_info.tos = iph->tos;
- entry->rt_info.daddr = iph->daddr;
- entry->rt_info.saddr = iph->saddr;
- }
-
nskb = ipq_build_packet_message(entry, &status);
if (nskb == NULL)
goto err_out_free;
@@ -382,23 +374,11 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
}
skb_put(e->skb, diff);
}
- if (!skb_ip_make_writable(&e->skb, v->data_len))
+ if (!skb_make_writable(&e->skb, v->data_len))
return -ENOMEM;
memcpy(e->skb->data, v->payload, v->data_len);
- e->skb->nfcache |= NFC_ALTERED;
-
- /*
- * Extra routing may needed on local out, as the QUEUE target never
- * returns control to the table.
- */
- if (e->info->hook == NF_IP_LOCAL_OUT) {
- struct iphdr *iph = e->skb->nh.iph;
-
- if (!(iph->tos == e->rt_info.tos
- && iph->daddr == e->rt_info.daddr
- && iph->saddr == e->rt_info.saddr))
- return ip_route_me_harder(&e->skb);
- }
+ e->skb->ip_summed = CHECKSUM_NONE;
+
return 0;
}
@@ -676,6 +656,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length)
}
#endif /* CONFIG_PROC_FS */
+/* Queue handler descriptor: names the handler "ip_queue" and points its
+ * outfn callback at ipq_enqueue_packet(). */
+static struct nf_queue_handler nfqh = {
+ .name = "ip_queue",
+ .outfn = &ipq_enqueue_packet,
+};
+
static int
init_or_cleanup(int init)
{
@@ -686,7 +671,8 @@ init_or_cleanup(int init)
goto cleanup;
netlink_register_notifier(&ipq_nl_notifier);
- ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk);
+ ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk,
+ THIS_MODULE);
if (ipqnl == NULL) {
printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
goto cleanup_netlink_notifier;
@@ -703,7 +689,7 @@ init_or_cleanup(int init)
register_netdevice_notifier(&ipq_dev_notifier);
ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
- status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL);
+ status = nf_register_queue_handler(PF_INET, &nfqh);
if (status < 0) {
printk(KERN_ERR "ip_queue: failed to register queue handler\n");
goto cleanup_sysctl;
@@ -711,7 +697,7 @@ init_or_cleanup(int init)
return status;
cleanup:
- nf_unregister_queue_handler(PF_INET);
+ nf_unregister_queue_handlers(&nfqh);
synchronize_net();
ipq_flush(NF_DROP);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index c88dfcd38c5..75c27e92f6a 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -27,6 +27,7 @@
#include <asm/semaphore.h>
#include <linux/proc_fs.h>
#include <linux/err.h>
+#include <linux/cpumask.h>
#include <linux/netfilter_ipv4/ip_tables.h>
@@ -312,7 +313,6 @@ ipt_do_table(struct sk_buff **pskb,
do {
IP_NF_ASSERT(e);
IP_NF_ASSERT(back);
- (*pskb)->nfcache |= e->nfcache;
if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
struct ipt_entry_target *t;
@@ -341,8 +341,8 @@ ipt_do_table(struct sk_buff **pskb,
back->comefrom);
continue;
}
- if (table_base + v
- != (void *)e + e->next_offset) {
+ if (table_base + v != (void *)e + e->next_offset
+ && !(e->ip.flags & IPT_F_GOTO)) {
/* Save old back ptr in next entry */
struct ipt_entry *next
= (void *)e + e->next_offset;
@@ -922,8 +922,10 @@ translate_table(const char *name,
}
/* And one copy for every other CPU */
- for (i = 1; i < num_possible_cpus(); i++) {
- memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
+ for_each_cpu(i) {
+ if (i == 0)
+ continue;
+ memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
newinfo->entries,
SMP_ALIGN(newinfo->size));
}
@@ -944,7 +946,7 @@ replace_table(struct ipt_table *table,
struct ipt_entry *table_base;
unsigned int i;
- for (i = 0; i < num_possible_cpus(); i++) {
+ for_each_cpu(i) {
table_base =
(void *)newinfo->entries
+ TABLE_OFFSET(newinfo, i);
@@ -991,7 +993,7 @@ get_counters(const struct ipt_table_info *t,
unsigned int cpu;
unsigned int i;
- for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+ for_each_cpu(cpu) {
i = 0;
IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
t->size,
@@ -1129,7 +1131,8 @@ do_replace(void __user *user, unsigned int len)
return -ENOMEM;
newinfo = vmalloc(sizeof(struct ipt_table_info)
- + SMP_ALIGN(tmp.size) * num_possible_cpus());
+ + SMP_ALIGN(tmp.size) *
+ (highest_possible_processor_id()+1));
if (!newinfo)
return -ENOMEM;
@@ -1459,7 +1462,8 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
= { 0, 0, 0, { 0 }, { 0 }, { } };
newinfo = vmalloc(sizeof(struct ipt_table_info)
- + SMP_ALIGN(repl->size) * num_possible_cpus());
+ + SMP_ALIGN(repl->size) *
+ (highest_possible_processor_id()+1));
if (!newinfo)
return -ENOMEM;
diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c
index 9842e6e2318..dab78d8bd49 100644
--- a/net/ipv4/netfilter/ipt_CLASSIFY.c
+++ b/net/ipv4/netfilter/ipt_CLASSIFY.c
@@ -32,10 +32,8 @@ target(struct sk_buff **pskb,
{
const struct ipt_classify_target_info *clinfo = targinfo;
- if((*pskb)->priority != clinfo->priority) {
+ if((*pskb)->priority != clinfo->priority)
(*pskb)->priority = clinfo->priority;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
return IPT_CONTINUE;
}
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 6706d3a1bc4..9bcb398fbc1 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -13,6 +13,7 @@
#include <linux/config.h>
#include <linux/proc_fs.h>
#include <linux/jhash.h>
+#include <linux/bitops.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/tcp.h>
@@ -30,7 +31,7 @@
#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
#include <linux/netfilter_ipv4/ip_conntrack.h>
-#define CLUSTERIP_VERSION "0.7"
+#define CLUSTERIP_VERSION "0.8"
#define DEBUG_CLUSTERIP
@@ -49,13 +50,14 @@ MODULE_DESCRIPTION("iptables target for CLUSTERIP");
struct clusterip_config {
struct list_head list; /* list of all configs */
atomic_t refcount; /* reference count */
+ atomic_t entries; /* number of entries/rules
+ * referencing us */
u_int32_t clusterip; /* the IP address */
u_int8_t clustermac[ETH_ALEN]; /* the MAC address */
struct net_device *dev; /* device */
u_int16_t num_total_nodes; /* total number of nodes */
- u_int16_t num_local_nodes; /* number of local nodes */
- u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; /* node number array */
+ unsigned long local_nodes; /* node number array */
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *pde; /* proc dir entry */
@@ -66,8 +68,7 @@ struct clusterip_config {
static LIST_HEAD(clusterip_configs);
-/* clusterip_lock protects the clusterip_configs list _AND_ the configurable
- * data within all structurses (num_local_nodes, local_nodes[]) */
+/* clusterip_lock protects the clusterip_configs list */
static DEFINE_RWLOCK(clusterip_lock);
#ifdef CONFIG_PROC_FS
@@ -76,23 +77,48 @@ static struct proc_dir_entry *clusterip_procdir;
#endif
static inline void
-clusterip_config_get(struct clusterip_config *c) {
+clusterip_config_get(struct clusterip_config *c)
+{
atomic_inc(&c->refcount);
}
static inline void
-clusterip_config_put(struct clusterip_config *c) {
- if (atomic_dec_and_test(&c->refcount)) {
+clusterip_config_put(struct clusterip_config *c)
+{
+ if (atomic_dec_and_test(&c->refcount))
+ kfree(c);
+}
+
+/* increase the count of entries(rules) using/referencing this config */
+static inline void
+clusterip_config_entry_get(struct clusterip_config *c)
+{
+ atomic_inc(&c->entries);
+}
+
+/* decrease the count of entries using/referencing this config. If last
+ * entry(rule) is removed, remove the config from lists, but don't free it
+ * yet, since proc-files could still be holding references */
+static inline void
+clusterip_config_entry_put(struct clusterip_config *c)
+{
+ if (atomic_dec_and_test(&c->entries)) {
write_lock_bh(&clusterip_lock);
list_del(&c->list);
write_unlock_bh(&clusterip_lock);
+
dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
dev_put(c->dev);
- kfree(c);
+
+ /* In case anyone still accesses the file, the open/close
+ * functions are also incrementing the refcount on their own,
+ * so it's safe to remove the entry even if it's in use. */
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry(c->pde->name, c->pde->parent);
+#endif
}
}
-
static struct clusterip_config *
__clusterip_config_find(u_int32_t clusterip)
{
@@ -111,7 +137,7 @@ __clusterip_config_find(u_int32_t clusterip)
}
static inline struct clusterip_config *
-clusterip_config_find_get(u_int32_t clusterip)
+clusterip_config_find_get(u_int32_t clusterip, int entry)
{
struct clusterip_config *c;
@@ -122,11 +148,24 @@ clusterip_config_find_get(u_int32_t clusterip)
return NULL;
}
atomic_inc(&c->refcount);
+ if (entry)
+ atomic_inc(&c->entries);
read_unlock_bh(&clusterip_lock);
return c;
}
+static void
+clusterip_config_init_nodelist(struct clusterip_config *c,
+ const struct ipt_clusterip_tgt_info *i)
+{
+ int n;
+
+ for (n = 0; n < i->num_local_nodes; n++) {
+ set_bit(i->local_nodes[n] - 1, &c->local_nodes);
+ }
+}
+
static struct clusterip_config *
clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
struct net_device *dev)
@@ -143,11 +182,11 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
c->clusterip = ip;
memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
c->num_total_nodes = i->num_total_nodes;
- c->num_local_nodes = i->num_local_nodes;
- memcpy(&c->local_nodes, &i->local_nodes, sizeof(&c->local_nodes));
+ clusterip_config_init_nodelist(c, i);
c->hash_mode = i->hash_mode;
c->hash_initval = i->hash_initval;
atomic_set(&c->refcount, 1);
+ atomic_set(&c->entries, 1);
#ifdef CONFIG_PROC_FS
/* create proc dir entry */
@@ -171,53 +210,28 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
static int
clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
{
- int i;
-
- write_lock_bh(&clusterip_lock);
- if (c->num_local_nodes >= CLUSTERIP_MAX_NODES
- || nodenum > CLUSTERIP_MAX_NODES) {
- write_unlock_bh(&clusterip_lock);
+ if (nodenum == 0 ||
+ nodenum > c->num_total_nodes)
return 1;
- }
-
- /* check if we alrady have this number in our array */
- for (i = 0; i < c->num_local_nodes; i++) {
- if (c->local_nodes[i] == nodenum) {
- write_unlock_bh(&clusterip_lock);
- return 1;
- }
- }
- c->local_nodes[c->num_local_nodes++] = nodenum;
+ /* check if we already have this number in our bitfield */
+ if (test_and_set_bit(nodenum - 1, &c->local_nodes))
+ return 1;
- write_unlock_bh(&clusterip_lock);
return 0;
}
static int
clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
{
- int i;
-
- write_lock_bh(&clusterip_lock);
-
- if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) {
- write_unlock_bh(&clusterip_lock);
+ if (nodenum == 0 ||
+ nodenum > c->num_total_nodes)
return 1;
- }
- for (i = 0; i < c->num_local_nodes; i++) {
- if (c->local_nodes[i] == nodenum) {
- int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1));
- memmove(&c->local_nodes[i], &c->local_nodes[i+1], size);
- c->num_local_nodes--;
- write_unlock_bh(&clusterip_lock);
- return 0;
- }
- }
+ if (test_and_clear_bit(nodenum - 1, &c->local_nodes))
+ return 0;
- write_unlock_bh(&clusterip_lock);
return 1;
}
@@ -285,25 +299,7 @@ clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config)
static inline int
clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
{
- int i;
-
- read_lock_bh(&clusterip_lock);
-
- if (config->num_local_nodes == 0) {
- read_unlock_bh(&clusterip_lock);
- return 0;
- }
-
- for (i = 0; i < config->num_local_nodes; i++) {
- if (config->local_nodes[i] == hash) {
- read_unlock_bh(&clusterip_lock);
- return 1;
- }
- }
-
- read_unlock_bh(&clusterip_lock);
-
- return 0;
+ return test_bit(hash - 1, &config->local_nodes);
}
/***********************************************************************
@@ -367,7 +363,7 @@ target(struct sk_buff **pskb,
#ifdef DEBUG_CLUSTERP
DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
#endif
- DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark);
+ DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark);
if (!clusterip_responsible(cipinfo->config, hash)) {
DEBUGP("not responsible\n");
return NF_DROP;
@@ -415,8 +411,26 @@ checkentry(const char *tablename,
/* FIXME: further sanity checks */
- config = clusterip_config_find_get(e->ip.dst.s_addr);
- if (!config) {
+ config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
+ if (config) {
+ if (cipinfo->config != NULL) {
+ /* Case A: This is an entry that gets reloaded, since
+ * it still has a cipinfo->config pointer. Simply
+ * increase the entry refcount and return */
+ if (cipinfo->config != config) {
+ printk(KERN_ERR "CLUSTERIP: Reloaded entry "
+ "has invalid config pointer!\n");
+ return 0;
+ }
+ clusterip_config_entry_get(cipinfo->config);
+ } else {
+ /* Case B: This is a new rule referring to an existing
+ * clusterip config. */
+ cipinfo->config = config;
+ clusterip_config_entry_get(cipinfo->config);
+ }
+ } else {
+ /* Case C: This is a completely new clusterip config */
if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
printk(KERN_WARNING "CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e->ip.dst.s_addr));
return 0;
@@ -443,10 +457,9 @@ checkentry(const char *tablename,
}
dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0);
}
+ cipinfo->config = config;
}
- cipinfo->config = config;
-
return 1;
}
@@ -455,13 +468,10 @@ static void destroy(void *matchinfo, unsigned int matchinfosize)
{
struct ipt_clusterip_tgt_info *cipinfo = matchinfo;
- /* we first remove the proc entry and then drop the reference
- * count. In case anyone still accesses the file, the open/close
- * functions are also incrementing the refcount on their own */
-#ifdef CONFIG_PROC_FS
- remove_proc_entry(cipinfo->config->pde->name,
- cipinfo->config->pde->parent);
-#endif
+ /* if no more entries are referencing the config, remove it
+ * from the list and destroy the proc entry */
+ clusterip_config_entry_put(cipinfo->config);
+
clusterip_config_put(cipinfo->config);
}
@@ -533,7 +543,7 @@ arp_mangle(unsigned int hook,
/* if there is no clusterip configuration for the arp reply's
* source ip, we don't want to mangle it */
- c = clusterip_config_find_get(payload->src_ip);
+ c = clusterip_config_find_get(payload->src_ip, 0);
if (!c)
return NF_ACCEPT;
@@ -574,56 +584,69 @@ static struct nf_hook_ops cip_arp_ops = {
#ifdef CONFIG_PROC_FS
+struct clusterip_seq_position {
+ unsigned int pos; /* position */
+ unsigned int weight; /* number of bits set == size */
+ unsigned int bit; /* current bit */
+ unsigned long val; /* current value */
+};
+
static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
{
struct proc_dir_entry *pde = s->private;
struct clusterip_config *c = pde->data;
- unsigned int *nodeidx;
-
- read_lock_bh(&clusterip_lock);
- if (*pos >= c->num_local_nodes)
+ unsigned int weight;
+ u_int32_t local_nodes;
+ struct clusterip_seq_position *idx;
+
+ /* FIXME: possible race */
+ local_nodes = c->local_nodes;
+ weight = hweight32(local_nodes);
+ if (*pos >= weight)
return NULL;
- nodeidx = kmalloc(sizeof(unsigned int), GFP_KERNEL);
- if (!nodeidx)
+ idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL);
+ if (!idx)
return ERR_PTR(-ENOMEM);
- *nodeidx = *pos;
- return nodeidx;
+ idx->pos = *pos;
+ idx->weight = weight;
+ idx->bit = ffs(local_nodes);
+ idx->val = local_nodes;
+ clear_bit(idx->bit - 1, &idx->val);
+
+ return idx;
}
static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct proc_dir_entry *pde = s->private;
- struct clusterip_config *c = pde->data;
- unsigned int *nodeidx = (unsigned int *)v;
+ struct clusterip_seq_position *idx = (struct clusterip_seq_position *)v;
- *pos = ++(*nodeidx);
- if (*pos >= c->num_local_nodes) {
+ *pos = ++idx->pos;
+ if (*pos >= idx->weight) {
kfree(v);
return NULL;
}
- return nodeidx;
+ idx->bit = ffs(idx->val);
+ clear_bit(idx->bit - 1, &idx->val);
+ return idx;
}
static void clusterip_seq_stop(struct seq_file *s, void *v)
{
kfree(v);
-
- read_unlock_bh(&clusterip_lock);
}
static int clusterip_seq_show(struct seq_file *s, void *v)
{
- struct proc_dir_entry *pde = s->private;
- struct clusterip_config *c = pde->data;
- unsigned int *nodeidx = (unsigned int *)v;
+ struct clusterip_seq_position *idx = (struct clusterip_seq_position *)v;
- if (*nodeidx != 0)
+ if (idx->pos != 0)
seq_putc(s, ',');
- seq_printf(s, "%u", c->local_nodes[*nodeidx]);
- if (*nodeidx == c->num_local_nodes-1)
+ seq_printf(s, "%u", idx->bit);
+
+ if (idx->pos == idx->weight - 1)
seq_putc(s, '\n');
return 0;
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
index 30ddd3e18eb..13463802133 100644
--- a/net/ipv4/netfilter/ipt_CONNMARK.c
+++ b/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -40,9 +40,9 @@ target(struct sk_buff **pskb,
void *userinfo)
{
const struct ipt_connmark_target_info *markinfo = targinfo;
- unsigned long diff;
- unsigned long nfmark;
- unsigned long newmark;
+ u_int32_t diff;
+ u_int32_t nfmark;
+ u_int32_t newmark;
enum ip_conntrack_info ctinfo;
struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
@@ -61,10 +61,8 @@ target(struct sk_buff **pskb,
case IPT_CONNMARK_RESTORE:
nfmark = (*pskb)->nfmark;
diff = (ct->mark ^ nfmark) & markinfo->mask;
- if (diff != 0) {
+ if (diff != 0)
(*pskb)->nfmark = nfmark ^ diff;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
break;
}
}
@@ -94,6 +92,11 @@ checkentry(const char *tablename,
}
}
+ if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) {
+ printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
+ return 0;
+ }
+
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c
index 3ea4509099f..6e319570a28 100644
--- a/net/ipv4/netfilter/ipt_DSCP.c
+++ b/net/ipv4/netfilter/ipt_DSCP.c
@@ -39,7 +39,7 @@ target(struct sk_buff **pskb,
if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) {
u_int16_t diffs[2];
- if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+ if (!skb_make_writable(pskb, sizeof(struct iphdr)))
return NF_DROP;
diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -51,7 +51,6 @@ target(struct sk_buff **pskb,
sizeof(diffs),
(*pskb)->nh.iph->check
^ 0xFFFF));
- (*pskb)->nfcache |= NFC_ALTERED;
}
return IPT_CONTINUE;
}
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index ada9911118e..a1319693f64 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -31,7 +31,7 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
!= (einfo->ip_ect & IPT_ECN_IP_MASK)) {
u_int16_t diffs[2];
- if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+ if (!skb_make_writable(pskb, sizeof(struct iphdr)))
return 0;
diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -43,7 +43,6 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
sizeof(diffs),
(*pskb)->nh.iph->check
^0xFFFF));
- (*pskb)->nfcache |= NFC_ALTERED;
}
return 1;
}
@@ -61,16 +60,20 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
if (!tcph)
return 0;
- if (!(einfo->operation & IPT_ECN_OP_SET_ECE
- || tcph->ece == einfo->proto.tcp.ece)
- && (!(einfo->operation & IPT_ECN_OP_SET_CWR
- || tcph->cwr == einfo->proto.tcp.cwr)))
+ if ((!(einfo->operation & IPT_ECN_OP_SET_ECE) ||
+ tcph->ece == einfo->proto.tcp.ece) &&
+ ((!(einfo->operation & IPT_ECN_OP_SET_CWR) ||
+ tcph->cwr == einfo->proto.tcp.cwr)))
return 1;
- if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
+ if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
return 0;
tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
+ if ((*pskb)->ip_summed == CHECKSUM_HW &&
+ skb_checksum_help(*pskb, inward))
+ return 0;
+
diffs[0] = ((u_int16_t *)tcph)[6];
if (einfo->operation & IPT_ECN_OP_SET_ECE)
tcph->ece = einfo->proto.tcp.ece;
@@ -79,14 +82,10 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
diffs[1] = ((u_int16_t *)tcph)[6];
diffs[0] = diffs[0] ^ 0xFFFF;
- if ((*pskb)->ip_summed != CHECKSUM_HW)
+ if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY)
tcph->check = csum_fold(csum_partial((char *)diffs,
sizeof(diffs),
tcph->check^0xFFFF));
- else
- if (skb_checksum_help(*pskb, inward))
- return 0;
- (*pskb)->nfcache |= NFC_ALTERED;
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index ef08733d26d..92ed050fac6 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -27,10 +27,6 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("iptables syslog logging module");
-static unsigned int nflog = 1;
-module_param(nflog, int, 0400);
-MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
-
#if 0
#define DEBUGP printk
#else
@@ -41,11 +37,17 @@ MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
static DEFINE_SPINLOCK(log_lock);
/* One level of recursion won't kill us */
-static void dump_packet(const struct ipt_log_info *info,
+static void dump_packet(const struct nf_loginfo *info,
const struct sk_buff *skb,
unsigned int iphoff)
{
struct iphdr _iph, *ih;
+ unsigned int logflags;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_MASK;
ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
if (ih == NULL) {
@@ -76,7 +78,7 @@ static void dump_packet(const struct ipt_log_info *info,
if (ntohs(ih->frag_off) & IP_OFFSET)
printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
- if ((info->logflags & IPT_LOG_IPOPT)
+ if ((logflags & IPT_LOG_IPOPT)
&& ih->ihl * 4 > sizeof(struct iphdr)) {
unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op;
unsigned int i, optsize;
@@ -119,7 +121,7 @@ static void dump_packet(const struct ipt_log_info *info,
printk("SPT=%u DPT=%u ",
ntohs(th->source), ntohs(th->dest));
/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
- if (info->logflags & IPT_LOG_TCPSEQ)
+ if (logflags & IPT_LOG_TCPSEQ)
printk("SEQ=%u ACK=%u ",
ntohl(th->seq), ntohl(th->ack_seq));
/* Max length: 13 "WINDOW=65535 " */
@@ -146,7 +148,7 @@ static void dump_packet(const struct ipt_log_info *info,
/* Max length: 11 "URGP=65535 " */
printk("URGP=%u ", ntohs(th->urg_ptr));
- if ((info->logflags & IPT_LOG_TCPOPT)
+ if ((logflags & IPT_LOG_TCPOPT)
&& th->doff * 4 > sizeof(struct tcphdr)) {
unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
unsigned char *op;
@@ -328,7 +330,7 @@ static void dump_packet(const struct ipt_log_info *info,
}
/* Max length: 15 "UID=4294967295 " */
- if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
+ if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->sk->sk_socket && skb->sk->sk_socket->file)
printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
@@ -349,19 +351,31 @@ static void dump_packet(const struct ipt_log_info *info,
/* maxlen = 230+ 91 + 230 + 252 = 803 */
}
+struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 0,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
static void
-ipt_log_packet(unsigned int hooknum,
+ipt_log_packet(unsigned int pf,
+ unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
- const struct ipt_log_info *loginfo,
- const char *level_string,
+ const struct nf_loginfo *loginfo,
const char *prefix)
{
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
spin_lock_bh(&log_lock);
- printk(level_string);
- printk("%sIN=%s OUT=%s ",
- prefix == NULL ? loginfo->prefix : prefix,
+ printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+ prefix,
in ? in->name : "",
out ? out->name : "");
#ifdef CONFIG_BRIDGE_NETFILTER
@@ -405,28 +419,15 @@ ipt_log_target(struct sk_buff **pskb,
void *userinfo)
{
const struct ipt_log_info *loginfo = targinfo;
- char level_string[4] = "< >";
+ struct nf_loginfo li;
- level_string[1] = '0' + (loginfo->level % 8);
- ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL);
+ li.type = NF_LOG_TYPE_LOG;
+ li.u.log.level = loginfo->level;
+ li.u.log.logflags = loginfo->logflags;
- return IPT_CONTINUE;
-}
+ nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li, loginfo->prefix);
-static void
-ipt_logfn(unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const char *prefix)
-{
- struct ipt_log_info loginfo = {
- .level = 0,
- .logflags = IPT_LOG_MASK,
- .prefix = ""
- };
-
- ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix);
+ return IPT_CONTINUE;
}
static int ipt_log_checkentry(const char *tablename,
@@ -464,20 +465,29 @@ static struct ipt_target ipt_log_reg = {
.me = THIS_MODULE,
};
+static struct nf_logger ipt_log_logger ={
+ .name = "ipt_LOG",
+ .logfn = &ipt_log_packet,
+ .me = THIS_MODULE,
+};
+
static int __init init(void)
{
if (ipt_register_target(&ipt_log_reg))
return -EINVAL;
- if (nflog)
- nf_log_register(PF_INET, &ipt_logfn);
+ if (nf_log_register(PF_INET, &ipt_log_logger) < 0) {
+ printk(KERN_WARNING "ipt_LOG: not logging via system console "
+ "since somebody else already registered for PF_INET\n");
+ /* we cannot make module load fail here, since otherwise
+ * iptables userspace would abort */
+ }
return 0;
}
static void __exit fini(void)
{
- if (nflog)
- nf_log_unregister(PF_INET, &ipt_logfn);
+ nf_log_unregister_logger(&ipt_log_logger);
ipt_unregister_target(&ipt_log_reg);
}
diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c
index 33c6f9b63b8..52b4f2c296b 100644
--- a/net/ipv4/netfilter/ipt_MARK.c
+++ b/net/ipv4/netfilter/ipt_MARK.c
@@ -29,10 +29,9 @@ target_v0(struct sk_buff **pskb,
{
const struct ipt_mark_target_info *markinfo = targinfo;
- if((*pskb)->nfmark != markinfo->mark) {
+ if((*pskb)->nfmark != markinfo->mark)
(*pskb)->nfmark = markinfo->mark;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
+
return IPT_CONTINUE;
}
@@ -61,10 +60,9 @@ target_v1(struct sk_buff **pskb,
break;
}
- if((*pskb)->nfmark != mark) {
+ if((*pskb)->nfmark != mark)
(*pskb)->nfmark = mark;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
+
return IPT_CONTINUE;
}
@@ -76,6 +74,8 @@ checkentry_v0(const char *tablename,
unsigned int targinfosize,
unsigned int hook_mask)
{
+ struct ipt_mark_target_info *markinfo = targinfo;
+
if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) {
printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
targinfosize,
@@ -88,6 +88,11 @@ checkentry_v0(const char *tablename,
return 0;
}
+ if (markinfo->mark > 0xffffffff) {
+ printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+ return 0;
+ }
+
return 1;
}
@@ -120,6 +125,11 @@ checkentry_v1(const char *tablename,
return 0;
}
+ if (markinfo->mark > 0xffffffff) {
+ printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+ return 0;
+ }
+
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 91e74502c3d..275a174c6fe 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -86,15 +86,16 @@ masquerade_target(struct sk_buff **pskb,
IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
- /* FIXME: For the moment, don't do local packets, breaks
- testsuite for 2.3.49 --RR */
- if ((*pskb)->sk)
- return NF_ACCEPT;
-
ct = ip_conntrack_get(*pskb, &ctinfo);
IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
|| ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
+ /* Source address is 0.0.0.0 - locally generated packet that is
+ * probably not supposed to be masqueraded.
+ */
+ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == 0)
+ return NF_ACCEPT;
+
mr = targinfo;
rt = (struct rtable *)(*pskb)->dst;
newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 06254b29d03..e6e7b609536 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -46,7 +46,8 @@ check(const char *tablename,
DEBUGP(MODULENAME":check: size %u.\n", targinfosize);
return 0;
}
- if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) {
+ if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) |
+ (1 << NF_IP_LOCAL_OUT))) {
DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask);
return 0;
}
@@ -76,12 +77,13 @@ target(struct sk_buff **pskb,
struct ip_nat_range newrange;
IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
- || hooknum == NF_IP_POST_ROUTING);
+ || hooknum == NF_IP_POST_ROUTING
+ || hooknum == NF_IP_LOCAL_OUT);
ct = ip_conntrack_get(*pskb, &ctinfo);
netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
- if (hooknum == NF_IP_PRE_ROUTING)
+ if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT)
new_ip = (*pskb)->nh.iph->daddr & ~netmask;
else
new_ip = (*pskb)->nh.iph->saddr & ~netmask;
diff --git a/net/ipv4/netfilter/ipt_NFQUEUE.c b/net/ipv4/netfilter/ipt_NFQUEUE.c
new file mode 100644
index 00000000000..3cedc9be880
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NFQUEUE.c
@@ -0,0 +1,70 @@
+/* iptables module for using new netfilter netlink queue
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_NFQUEUE.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables NFQUEUE target");
+MODULE_LICENSE("GPL");
+
+static unsigned int
+target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_NFQ_info *tinfo = targinfo;
+
+ return NF_QUEUE_NR(tinfo->queuenum);
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_NFQ_info))) {
+ printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_NFQ_info)));
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_target ipt_NFQ_reg = {
+ .name = "NFQUEUE",
+ .target = target,
+ .checkentry = checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&ipt_NFQ_reg);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_NFQ_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index d2e13447678..5245bfd33d5 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -88,14 +88,18 @@ redirect_target(struct sk_buff **pskb,
newdst = htonl(0x7F000001);
else {
struct in_device *indev;
+ struct in_ifaddr *ifa;
- /* Device might not have an associated in_device. */
- indev = (struct in_device *)(*pskb)->dev->ip_ptr;
- if (indev == NULL || indev->ifa_list == NULL)
- return NF_DROP;
+ newdst = 0;
+
+ rcu_read_lock();
+ indev = __in_dev_get_rcu((*pskb)->dev);
+ if (indev && (ifa = indev->ifa_list))
+ newdst = ifa->ifa_local;
+ rcu_read_unlock();
- /* Grab first address on interface. */
- newdst = indev->ifa_list->ifa_local;
+ if (!newdst)
+ return NF_DROP;
}
/* Transfer from original range. */
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 91569644602..f057025a719 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -92,10 +92,7 @@ static inline struct rtable *route_reverse(struct sk_buff *skb,
fl.fl_ip_sport = tcph->dest;
fl.fl_ip_dport = tcph->source;
- if (xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0)) {
- dst_release(&rt->u.dst);
- rt = NULL;
- }
+ xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0);
return rt;
}
@@ -156,7 +153,6 @@ static void send_reset(struct sk_buff *oldskb, int hook)
/* This packet will not be the same as the other: clear nf fields */
nf_reset(nskb);
- nskb->nfcache = 0;
nskb->nfmark = 0;
#ifdef CONFIG_BRIDGE_NETFILTER
nf_bridge_put(nskb->nf_bridge);
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
index 1049050b2bf..8db70d6908c 100644
--- a/net/ipv4/netfilter/ipt_TCPMSS.c
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -58,7 +58,11 @@ ipt_tcpmss_target(struct sk_buff **pskb,
unsigned int i;
u_int8_t *opt;
- if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ if (!skb_make_writable(pskb, (*pskb)->len))
+ return NF_DROP;
+
+ if ((*pskb)->ip_summed == CHECKSUM_HW &&
+ skb_checksum_help(*pskb, out == NULL))
return NF_DROP;
iph = (*pskb)->nh.iph;
@@ -186,10 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb,
newmss);
retmodified:
- /* We never hw checksum SYN packets. */
- BUG_ON((*pskb)->ip_summed == CHECKSUM_HW);
-
- (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
return IPT_CONTINUE;
}
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
index 85c70d240f8..deadb36d442 100644
--- a/net/ipv4/netfilter/ipt_TOS.c
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -33,7 +33,7 @@ target(struct sk_buff **pskb,
if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
u_int16_t diffs[2];
- if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+ if (!skb_make_writable(pskb, sizeof(struct iphdr)))
return NF_DROP;
diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -46,7 +46,6 @@ target(struct sk_buff **pskb,
sizeof(diffs),
(*pskb)->nh.iph->check
^0xFFFF));
- (*pskb)->nfcache |= NFC_ALTERED;
}
return IPT_CONTINUE;
}
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
new file mode 100644
index 00000000000..b9ae6a9382f
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -0,0 +1,119 @@
+/* TTL modification target for IP tables
+ * (C) 2000,2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_TTL.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("IP tables TTL modification module");
+MODULE_LICENSE("GPL");
+
+/*
+ * iptables target: rewrite the TTL of the packet's IP header.
+ *
+ * info->mode selects how info->ttl is applied: IPT_TTL_SET stores it as the
+ * new TTL, IPT_TTL_INC adds it (capped at 255) and IPT_TTL_DEC subtracts it
+ * (floored at 0); any other mode leaves the TTL untouched.  When the TTL
+ * actually changes, iph->check is patched from the old and new values.
+ *
+ * Returns NF_DROP if skb_make_writable() fails, IPT_CONTINUE otherwise.
+ * in, out, hooknum and userinfo are not used.
+ */
+static unsigned int
+ipt_ttl_target(struct sk_buff **pskb, const struct net_device *in,
+ const struct net_device *out, unsigned int hooknum,
+ const void *targinfo, void *userinfo)
+{
+ struct iphdr *iph;
+ const struct ipt_TTL_info *info = targinfo;
+ u_int16_t diffs[2];
+ int new_ttl;
+
+ if (!skb_make_writable(pskb, (*pskb)->len))
+ return NF_DROP;
+
+ /* Fetched only after the call above, which is handed pskb itself. */
+ iph = (*pskb)->nh.iph;
+
+ /* new_ttl is an int so the INC/DEC results can be range-checked
+ * before being stored back into iph->ttl. */
+ switch (info->mode) {
+ case IPT_TTL_SET:
+ new_ttl = info->ttl;
+ break;
+ case IPT_TTL_INC:
+ new_ttl = iph->ttl + info->ttl;
+ if (new_ttl > 255)
+ new_ttl = 255;
+ break;
+ case IPT_TTL_DEC:
+ new_ttl = iph->ttl - info->ttl;
+ if (new_ttl < 0)
+ new_ttl = 0;
+ break;
+ default:
+ new_ttl = iph->ttl;
+ break;
+ }
+
+ if (new_ttl != iph->ttl) {
+ /* diffs[0] is the complemented old TTL word, diffs[1] the new
+ * TTL word (TTL shifted into the upper byte of 16 bits); they
+ * are summed onto the inverted old checksum and folded. */
+ diffs[0] = htons(((unsigned)iph->ttl) << 8) ^ 0xFFFF;
+ iph->ttl = new_ttl;
+ diffs[1] = htons(((unsigned)iph->ttl) << 8);
+ iph->check = csum_fold(csum_partial((char *)diffs,
+ sizeof(diffs),
+ iph->check^0xFFFF));
+ }
+
+ return IPT_CONTINUE;
+}
+
+/*
+ * Validate a TTL target rule.
+ *
+ * Returns 1 to accept the rule and 0 to reject it.  A rule is rejected when
+ * the target data size is not IPT_ALIGN(sizeof(struct ipt_TTL_info)), when
+ * the table is not "mangle", when info->mode exceeds IPT_TTL_MAXMODE, or
+ * when a mode other than IPT_TTL_SET is combined with a ttl of 0.
+ * e and hook_mask are not used.
+ */
+static int ipt_ttl_checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ struct ipt_TTL_info *info = targinfo;
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_TTL_info))) {
+ printk(KERN_WARNING "ipt_TTL: targinfosize %u != %Zu\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_TTL_info)));
+ return 0;
+ }
+
+ if (strcmp(tablename, "mangle")) {
+ printk(KERN_WARNING "ipt_TTL: can only be called from "
+ "\"mangle\" table, not \"%s\"\n", tablename);
+ return 0;
+ }
+
+ if (info->mode > IPT_TTL_MAXMODE) {
+ printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n",
+ info->mode);
+ return 0;
+ }
+
+ /* Rejected silently, unlike the cases above: no message is printed. */
+ if ((info->mode != IPT_TTL_SET) && (info->ttl == 0))
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_target ipt_TTL = {
+ .name = "TTL",
+ .target = ipt_ttl_target,
+ .checkentry = ipt_ttl_checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&ipt_TTL);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_TTL);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 52a0076302a..2883ccd8a91 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -62,6 +62,7 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
MODULE_DESCRIPTION("iptables userspace logging module");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
#define ULOG_NL_EVENT 111 /* Harald's favorite number */
#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */
@@ -115,10 +116,10 @@ static void ulog_send(unsigned int nlgroupnum)
if (ub->qlen > 1)
ub->lastnlh->nlmsg_type = NLMSG_DONE;
- NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum);
- DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n",
- ub->qlen, nlgroupnum);
- netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC);
+ NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
+ DEBUGP("ipt_ULOG: throwing %d packets to netlink group %u\n",
+ ub->qlen, nlgroupnum + 1);
+ netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
ub->qlen = 0;
ub->skb = NULL;
@@ -219,13 +220,13 @@ static void ipt_ulog_packet(unsigned int hooknum,
pm = NLMSG_DATA(nlh);
/* We might not have a timestamp, get one */
- if (skb->stamp.tv_sec == 0)
- do_gettimeofday((struct timeval *)&skb->stamp);
+ if (skb->tstamp.off_sec == 0)
+ __net_timestamp((struct sk_buff *)skb);
/* copy hook, prefix, timestamp, payload, etc. */
pm->data_len = copy_len;
- pm->timestamp_sec = skb->stamp.tv_sec;
- pm->timestamp_usec = skb->stamp.tv_usec;
+ pm->timestamp_sec = skb->tstamp.off_sec;
+ pm->timestamp_usec = skb->tstamp.off_usec;
pm->mark = skb->nfmark;
pm->hook = hooknum;
if (prefix != NULL)
@@ -303,18 +304,27 @@ static unsigned int ipt_ulog_target(struct sk_buff **pskb,
return IPT_CONTINUE;
}
-static void ipt_logfn(unsigned int hooknum,
+static void ipt_logfn(unsigned int pf,
+ unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
+ const struct nf_loginfo *li,
const char *prefix)
{
- struct ipt_ulog_info loginfo = {
- .nl_group = ULOG_DEFAULT_NLGROUP,
- .copy_range = 0,
- .qthreshold = ULOG_DEFAULT_QTHRESHOLD,
- .prefix = ""
- };
+ struct ipt_ulog_info loginfo;
+
+ if (!li || li->type != NF_LOG_TYPE_ULOG) {
+ loginfo.nl_group = ULOG_DEFAULT_NLGROUP;
+ loginfo.copy_range = 0;
+ loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD;
+ loginfo.prefix[0] = '\0';
+ } else {
+ loginfo.nl_group = li->u.ulog.group;
+ loginfo.copy_range = li->u.ulog.copy_len;
+ loginfo.qthreshold = li->u.ulog.qthreshold;
+ strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
+ }
ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
}
@@ -354,6 +364,12 @@ static struct ipt_target ipt_ulog_reg = {
.me = THIS_MODULE,
};
+static struct nf_logger ipt_ulog_logger = {
+ .name = "ipt_ULOG",
+ .logfn = &ipt_logfn,
+ .me = THIS_MODULE,
+};
+
static int __init init(void)
{
int i;
@@ -372,7 +388,8 @@ static int __init init(void)
ulog_buffers[i].timer.data = i;
}
- nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL);
+ nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
+ THIS_MODULE);
if (!nflognl)
return -ENOMEM;
@@ -381,7 +398,7 @@ static int __init init(void)
return -EINVAL;
}
if (nflog)
- nf_log_register(PF_INET, &ipt_logfn);
+ nf_log_register(PF_INET, &ipt_ulog_logger);
return 0;
}
@@ -394,7 +411,7 @@ static void __exit fini(void)
DEBUGP("ipt_ULOG: cleanup_module\n");
if (nflog)
- nf_log_unregister(PF_INET, &ipt_logfn);
+ nf_log_unregister_logger(&ipt_ulog_logger);
ipt_unregister_target(&ipt_ulog_reg);
sock_release(nflognl->sk_socket);
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
index f5909a4c3fc..e19c2a52d00 100644
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -48,7 +48,7 @@ static int checkentry(const char *tablename, const struct ipt_ip *ip,
unsigned int hook_mask)
{
if (matchsize != IPT_ALIGN(sizeof(struct ipt_addrtype_info))) {
- printk(KERN_ERR "ipt_addrtype: invalid size (%u != %Zu)\n.",
+ printk(KERN_ERR "ipt_addrtype: invalid size (%u != %Zu)\n",
matchsize, IPT_ALIGN(sizeof(struct ipt_addrtype_info)));
return 0;
}
diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c
new file mode 100644
index 00000000000..df4a42c6da2
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_connbytes.c
@@ -0,0 +1,162 @@
+/* Kernel module to match connection tracking byte counter.
+ * GPL (C) 2002 Martin Devera (devik@cdi.cz).
+ *
+ * 2004-07-20 Harald Welte <laforge@netfilter.org>
+ * - reimplemented to use per-connection accounting counters
+ * - add functionality to match number of packets
+ * - add functionality to match average packet size
+ * - add support to match directions separately
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_connbytes.h>
+
+#include <asm/div64.h>
+#include <asm/bitops.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection");
+
+/*
+ * Divide one 64bit value by another and return the 64bit quotient.
+ *
+ * do_div() is handed a 32bit divisor: when the divisor does not fit in 32
+ * bits, both operands are shifted right by the same amount first, so the
+ * quotient is only approximate for such large divisors ("dynamic
+ * precision").
+ *
+ * A zero divisor yields 0 instead of dividing by zero.  The average packet
+ * size callers pass a connection's packet counter as divisor, and nothing
+ * in match() guarantees that counter is non-zero.
+ */
+static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
+{
+ u_int32_t d = divisor;
+
+ if (divisor == 0)
+ return 0;
+
+ if (divisor > 0xffffffffULL) {
+ unsigned int shift = fls(divisor >> 32);
+
+ d = divisor >> shift;
+ dividend >>= shift;
+ }
+
+ do_div(dividend, d);
+ return dividend;
+}
+
+/*
+ * connbytes match: compare a per-connection counter against the range in
+ * the rule.
+ *
+ * sinfo->what picks the quantity (packets, bytes, or bytes divided by
+ * packets) and sinfo->direction picks the original counters, the reply
+ * counters, or the sum of both.  Returns 0 when the skb has no conntrack
+ * entry.  Otherwise returns non-zero when the value lies in
+ * [count.from, count.to], or, when count.to is 0, when it is at least
+ * count.from.  in, out, offset and hotdrop are not used.
+ *
+ * NOTE(review): the packet counters used as divisor in the AVGPKT cases are
+ * not checked for zero here -- confirm div64_64() copes with that.
+ */
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_connbytes_info *sinfo = matchinfo;
+ enum ip_conntrack_info ctinfo;
+ struct ip_conntrack *ct;
+ u_int64_t what = 0; /* initialize to make gcc happy */
+
+ if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo)))
+ return 0; /* no match */
+
+ /* Neither switch level has a default: an unhandled what/direction
+ * value leaves "what" at its initial 0. */
+ switch (sinfo->what) {
+ case IPT_CONNBYTES_PKTS:
+ switch (sinfo->direction) {
+ case IPT_CONNBYTES_DIR_ORIGINAL:
+ what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
+ break;
+ case IPT_CONNBYTES_DIR_REPLY:
+ what = ct->counters[IP_CT_DIR_REPLY].packets;
+ break;
+ case IPT_CONNBYTES_DIR_BOTH:
+ what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
+ what += ct->counters[IP_CT_DIR_REPLY].packets;
+ break;
+ }
+ break;
+ case IPT_CONNBYTES_BYTES:
+ switch (sinfo->direction) {
+ case IPT_CONNBYTES_DIR_ORIGINAL:
+ what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
+ break;
+ case IPT_CONNBYTES_DIR_REPLY:
+ what = ct->counters[IP_CT_DIR_REPLY].bytes;
+ break;
+ case IPT_CONNBYTES_DIR_BOTH:
+ what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
+ what += ct->counters[IP_CT_DIR_REPLY].bytes;
+ break;
+ }
+ break;
+ case IPT_CONNBYTES_AVGPKT:
+ /* Average packet size: byte counter divided by packet counter. */
+ switch (sinfo->direction) {
+ case IPT_CONNBYTES_DIR_ORIGINAL:
+ what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes,
+ ct->counters[IP_CT_DIR_ORIGINAL].packets);
+ break;
+ case IPT_CONNBYTES_DIR_REPLY:
+ what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes,
+ ct->counters[IP_CT_DIR_REPLY].packets);
+ break;
+ case IPT_CONNBYTES_DIR_BOTH:
+ {
+ u_int64_t bytes;
+ u_int64_t pkts;
+ bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes +
+ ct->counters[IP_CT_DIR_REPLY].bytes;
+ pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+
+ ct->counters[IP_CT_DIR_REPLY].packets;
+
+ /* FIXME_THEORETICAL: what to do if sum
+ * overflows ? */
+
+ what = div64_64(bytes, pkts);
+ }
+ break;
+ }
+ break;
+ }
+
+ /* count.to == 0 means "no upper bound". */
+ if (sinfo->count.to)
+ return (what <= sinfo->count.to && what >= sinfo->count.from);
+ else
+ return (what >= sinfo->count.from);
+}
+
+/*
+ * Validate a connbytes rule.
+ *
+ * Returns 1 to accept and 0 to reject.  The rule is rejected when the match
+ * data size is not IPT_ALIGN(sizeof(struct ipt_connbytes_info)), or when
+ * sinfo->what or sinfo->direction is not one of the three values that
+ * match() handles.  tablename, ip and hook_mask are not used.
+ */
+static int check(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ const struct ipt_connbytes_info *sinfo = matchinfo;
+
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info)))
+ return 0;
+
+ if (sinfo->what != IPT_CONNBYTES_PKTS &&
+ sinfo->what != IPT_CONNBYTES_BYTES &&
+ sinfo->what != IPT_CONNBYTES_AVGPKT)
+ return 0;
+
+ if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL &&
+ sinfo->direction != IPT_CONNBYTES_DIR_REPLY &&
+ sinfo->direction != IPT_CONNBYTES_DIR_BOTH)
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_match state_match = {
+ .name = "connbytes",
+ .match = &match,
+ .checkentry = &check,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&state_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&state_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c
index 2706f96cea5..bf8de47ce00 100644
--- a/net/ipv4/netfilter/ipt_connmark.c
+++ b/net/ipv4/netfilter/ipt_connmark.c
@@ -54,9 +54,16 @@ checkentry(const char *tablename,
unsigned int matchsize,
unsigned int hook_mask)
{
+ struct ipt_connmark_info *cm =
+ (struct ipt_connmark_info *)matchinfo;
if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info)))
return 0;
+ if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
+ printk(KERN_WARNING "connmark: only support 32bit mark\n");
+ return 0;
+ }
+
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_dccp.c b/net/ipv4/netfilter/ipt_dccp.c
new file mode 100644
index 00000000000..ad3278bba6c
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_dccp.c
@@ -0,0 +1,176 @@
+/*
+ * iptables module for DCCP protocol header matching
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <net/ip.h>
+#include <linux/dccp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_dccp.h>
+
+#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
+ || (!!((invflag) & (option)) ^ (cond)))
+
+static unsigned char *dccp_optbuf;
+static DEFINE_SPINLOCK(dccp_buflock);
+
+/*
+ * Scan the options area of a DCCP header for an option of type "option".
+ *
+ * Returns 1 when the option is found and 0 otherwise.  *hotdrop is set to 1
+ * (and 0 returned) when dccph_doff * 4 is smaller than the header length
+ * reported by __dccp_hdr_len(), or when the option bytes cannot be obtained
+ * from the skb.
+ */
+static inline int
+dccp_find_option(u_int8_t option,
+ const struct sk_buff *skb,
+ const struct dccp_hdr *dh,
+ int *hotdrop)
+{
+ /* The options are the bytes between the end of the header
+ * (__dccp_hdr_len) and dccph_doff * 4.  optlen is unsigned and is
+ * computed before the doff sanity check below, but it is not used
+ * until that check has passed. */
+ unsigned char *op;
+ unsigned int optoff = __dccp_hdr_len(dh);
+ unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh);
+ unsigned int i;
+
+ if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) {
+ *hotdrop = 1;
+ return 0;
+ }
+
+ if (!optlen)
+ return 0;
+
+ /* dccp_optbuf is a single buffer shared by all callers; the lock is
+ * held for as long as op, which may point into it, is in use. */
+ spin_lock_bh(&dccp_buflock);
+ op = skb_header_pointer(skb,
+ skb->nh.iph->ihl*4 + optoff,
+ optlen, dccp_optbuf);
+ if (op == NULL) {
+ /* If we don't have the whole header, drop packet. */
+ spin_unlock_bh(&dccp_buflock);
+ *hotdrop = 1;
+ return 0;
+ }
+
+ for (i = 0; i < optlen; ) {
+ if (op[i] == option) {
+ spin_unlock_bh(&dccp_buflock);
+ return 1;
+ }
+
+ /* Types below 2 are stepped over as single bytes; any other
+ * type is followed by a length byte, and a length of 0 is
+ * treated as 1 so the loop always advances.
+ * NOTE(review): op[i+1] is read without checking that
+ * i + 1 < optlen -- confirm this cannot read past the data. */
+ if (op[i] < 2)
+ i++;
+ else
+ i += op[i+1]?:1;
+ }
+
+ spin_unlock_bh(&dccp_buflock);
+ return 0;
+}
+
+
+static inline int
+match_types(const struct dccp_hdr *dh, u_int16_t typemask)
+{
+ return (typemask & (1 << dh->dccph_type));
+}
+
+static inline int
+match_option(u_int8_t option, const struct sk_buff *skb,
+ const struct dccp_hdr *dh, int *hotdrop)
+{
+ return dccp_find_option(option, skb, dh, hotdrop);
+}
+
+/*
+ * dccp match: test source port, destination port, packet type and option
+ * presence against the rule.
+ *
+ * Each test goes through DCCHECK(): it passes outright when its bit is not
+ * set in info->flags, otherwise its result is XORed with the matching bit
+ * of info->invflags.  The tests are chained with &&, so the option scan
+ * only runs when the port and type tests have passed.
+ *
+ * Returns 0 for a non-zero offset, and sets *hotdrop and returns 0 when the
+ * DCCP header cannot be read from the skb.  in and out are not used.
+ */
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_dccp_info *info =
+ (const struct ipt_dccp_info *)matchinfo;
+ struct dccp_hdr _dh, *dh;
+
+ if (offset)
+ return 0;
+
+ /* _dh is the on-stack buffer handed to skb_header_pointer(). */
+ dh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_dh), &_dh);
+ if (dh == NULL) {
+ *hotdrop = 1;
+ return 0;
+ }
+
+ return DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0])
+ && (ntohs(dh->dccph_sport) <= info->spts[1])),
+ IPT_DCCP_SRC_PORTS, info->flags, info->invflags)
+ && DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0])
+ && (ntohs(dh->dccph_dport) <= info->dpts[1])),
+ IPT_DCCP_DEST_PORTS, info->flags, info->invflags)
+ && DCCHECK(match_types(dh, info->typemask),
+ IPT_DCCP_TYPE, info->flags, info->invflags)
+ && DCCHECK(match_option(info->option, skb, dh, hotdrop),
+ IPT_DCCP_OPTION, info->flags, info->invflags);
+}
+
+/*
+ * Validate a dccp rule.
+ *
+ * Returns non-zero only when all of the following hold: the rule's protocol
+ * is IPPROTO_DCCP and is not inverted, the match data size is
+ * IPT_ALIGN(sizeof(struct ipt_dccp_info)), flags and invflags contain no
+ * bits outside IPT_DCCP_VALID_FLAGS, and every bit set in invflags is also
+ * set in flags.  tablename and hook_mask are not used.
+ */
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ const struct ipt_dccp_info *info;
+
+ info = (const struct ipt_dccp_info *)matchinfo;
+
+ return ip->proto == IPPROTO_DCCP
+ && !(ip->invflags & IPT_INV_PROTO)
+ && matchsize == IPT_ALIGN(sizeof(struct ipt_dccp_info))
+ && !(info->flags & ~IPT_DCCP_VALID_FLAGS)
+ && !(info->invflags & ~IPT_DCCP_VALID_FLAGS)
+ && !(info->invflags & ~info->flags);
+}
+
+static struct ipt_match dccp_match =
+{
+ .name = "dccp",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+/*
+ * Module init: allocate the option buffer, then register the dccp match.
+ *
+ * Returns -ENOMEM when the allocation fails, otherwise the result of
+ * ipt_register_match(); the buffer is freed again when that result is
+ * non-zero.
+ */
+static int __init init(void)
+{
+ int ret;
+
+ /* doff is 8 bits, so the maximum option size is (4*256). Don't put
+ * this in BSS since DaveM is worried about locked TLB's for kernel
+ * BSS. */
+ dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL);
+ if (!dccp_optbuf)
+ return -ENOMEM;
+ ret = ipt_register_match(&dccp_match);
+ if (ret)
+ kfree(dccp_optbuf);
+
+ return ret;
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&dccp_match);
+ kfree(dccp_optbuf);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Match for DCCP protocol packets");
+
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index 564b49bfebc..2dd1cccbdab 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -94,7 +94,7 @@ struct ipt_hashlimit_htable {
static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */
static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */
static HLIST_HEAD(hashlimit_htables);
-static kmem_cache_t *hashlimit_cachep;
+static kmem_cache_t *hashlimit_cachep __read_mostly;
static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b)
{
diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c
index 8955728127b..00bef6cdd3f 100644
--- a/net/ipv4/netfilter/ipt_mark.c
+++ b/net/ipv4/netfilter/ipt_mark.c
@@ -37,9 +37,16 @@ checkentry(const char *tablename,
unsigned int matchsize,
unsigned int hook_mask)
{
+ struct ipt_mark_info *minfo = (struct ipt_mark_info *) matchinfo;
+
if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info)))
return 0;
+ if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
+ printk(KERN_WARNING "mark: only supports 32bit mark\n");
+ return 0;
+ }
+
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index 3b9065e0638..0cee2862ed8 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/file.h>
+#include <linux/rcupdate.h>
#include <net/sock.h>
#include <linux/netfilter_ipv4/ipt_owner.h>
@@ -21,106 +22,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
MODULE_DESCRIPTION("iptables owner match");
static int
-match_comm(const struct sk_buff *skb, const char *comm)
-{
- struct task_struct *g, *p;
- struct files_struct *files;
- int i;
-
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- if(strncmp(p->comm, comm, sizeof(p->comm)))
- continue;
-
- task_lock(p);
- files = p->files;
- if(files) {
- spin_lock(&files->file_lock);
- for (i=0; i < files->max_fds; i++) {
- if (fcheck_files(files, i) ==
- skb->sk->sk_socket->file) {
- spin_unlock(&files->file_lock);
- task_unlock(p);
- read_unlock(&tasklist_lock);
- return 1;
- }
- }
- spin_unlock(&files->file_lock);
- }
- task_unlock(p);
- } while_each_thread(g, p);
- read_unlock(&tasklist_lock);
- return 0;
-}
-
-static int
-match_pid(const struct sk_buff *skb, pid_t pid)
-{
- struct task_struct *p;
- struct files_struct *files;
- int i;
-
- read_lock(&tasklist_lock);
- p = find_task_by_pid(pid);
- if (!p)
- goto out;
- task_lock(p);
- files = p->files;
- if(files) {
- spin_lock(&files->file_lock);
- for (i=0; i < files->max_fds; i++) {
- if (fcheck_files(files, i) ==
- skb->sk->sk_socket->file) {
- spin_unlock(&files->file_lock);
- task_unlock(p);
- read_unlock(&tasklist_lock);
- return 1;
- }
- }
- spin_unlock(&files->file_lock);
- }
- task_unlock(p);
-out:
- read_unlock(&tasklist_lock);
- return 0;
-}
-
-static int
-match_sid(const struct sk_buff *skb, pid_t sid)
-{
- struct task_struct *g, *p;
- struct file *file = skb->sk->sk_socket->file;
- int i, found=0;
-
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- struct files_struct *files;
- if (p->signal->session != sid)
- continue;
-
- task_lock(p);
- files = p->files;
- if (files) {
- spin_lock(&files->file_lock);
- for (i=0; i < files->max_fds; i++) {
- if (fcheck_files(files, i) == file) {
- found = 1;
- break;
- }
- }
- spin_unlock(&files->file_lock);
- }
- task_unlock(p);
- if (found)
- goto out;
- } while_each_thread(g, p);
-out:
- read_unlock(&tasklist_lock);
-
- return found;
-}
-
-static int
match(const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -145,24 +46,6 @@ match(const struct sk_buff *skb,
return 0;
}
- if(info->match & IPT_OWNER_PID) {
- if (!match_pid(skb, info->pid) ^
- !!(info->invert & IPT_OWNER_PID))
- return 0;
- }
-
- if(info->match & IPT_OWNER_SID) {
- if (!match_sid(skb, info->sid) ^
- !!(info->invert & IPT_OWNER_SID))
- return 0;
- }
-
- if(info->match & IPT_OWNER_COMM) {
- if (!match_comm(skb, info->comm) ^
- !!(info->invert & IPT_OWNER_COMM))
- return 0;
- }
-
return 1;
}
@@ -173,6 +56,8 @@ checkentry(const char *tablename,
unsigned int matchsize,
unsigned int hook_mask)
{
+ const struct ipt_owner_info *info = matchinfo;
+
if (hook_mask
& ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) {
printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
@@ -184,15 +69,13 @@ checkentry(const char *tablename,
IPT_ALIGN(sizeof(struct ipt_owner_info)));
return 0;
}
-#ifdef CONFIG_SMP
- /* files->file_lock can not be used in a BH */
- if (((struct ipt_owner_info *)matchinfo)->match
- & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
- printk("ipt_owner: pid, sid and command matching is broken "
- "on SMP.\n");
+
+ if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
+ printk("ipt_owner: pid, sid and command matching "
+ "not supported anymore\n");
return 0;
}
-#endif
+
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_string.c b/net/ipv4/netfilter/ipt_string.c
new file mode 100644
index 00000000000..b5def204d79
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_string.c
@@ -0,0 +1,91 @@
+/* String matching match for iptables
+ *
+ * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_string.h>
+#include <linux/textsearch.h>
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>");
+MODULE_DESCRIPTION("IP tables string match module");
+MODULE_LICENSE("GPL");
+
+/*
+ * string match: search the packet for the rule's pattern.
+ *
+ * skb_find_text() is run between conf->from_offset and conf->to_offset with
+ * the textsearch configuration stored in conf->config by checkentry(); a
+ * result of UINT_MAX is taken to mean "pattern not found".
+ *
+ * Returns non-zero when "pattern found" differs from conf->invert: a plain
+ * rule matches packets that contain the pattern, an inverted rule matches
+ * packets that do not.  in, out, offset and hotdrop are not used.
+ */
+static int match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ struct ts_state state;
+ struct ipt_string_info *conf = (struct ipt_string_info *) matchinfo;
+ int found;
+
+ memset(&state, 0, sizeof(struct ts_state));
+
+ found = skb_find_text((struct sk_buff *)skb, conf->from_offset,
+ conf->to_offset, conf->config, &state)
+ != UINT_MAX;
+
+ /* XOR rather than "found && !invert": the latter is always 0 once
+ * invert is set, so an inverted rule could never match. */
+ return found ^ !!conf->invert;
+}
+
+#define STRING_TEXT_PRIV(m) ((struct ipt_string_info *) m)
+
+/*
+ * Validate a string rule and prepare its textsearch configuration.
+ *
+ * Returns 1 to accept and 0 to reject.  The rule is rejected when the match
+ * data size is not IPT_ALIGN(sizeof(struct ipt_string_info)), when
+ * from_offset is greater than to_offset, or when textsearch_prepare()
+ * returns an error pointer.  On success the prepared configuration is
+ * stored in conf->config; destroy() hands it to textsearch_destroy().
+ * tablename, ip and hook_mask are not used.
+ */
+static int checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ struct ipt_string_info *conf = matchinfo;
+ struct ts_config *ts_conf;
+
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_string_info)))
+ return 0;
+
+ /* Damn, can't handle this case properly with iptables... */
+ if (conf->from_offset > conf->to_offset)
+ return 0;
+
+ ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen,
+ GFP_KERNEL, TS_AUTOLOAD);
+ if (IS_ERR(ts_conf))
+ return 0;
+
+ conf->config = ts_conf;
+
+ return 1;
+}
+
+/* Rule teardown: pass the rule's stored textsearch configuration
+ * (the config field of its ipt_string_info) to textsearch_destroy().
+ * matchsize is not used. */
+static void destroy(void *matchinfo, unsigned int matchsize)
+{
+ textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config);
+}
+
+static struct ipt_match string_match = {
+ .name = "string",
+ .match = match,
+ .checkentry = checkentry,
+ .destroy = destroy,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&string_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&string_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 912bbcc7f41..a65e508fbd4 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,13 +59,10 @@ static int fold_prot_inuse(struct proto *proto)
*/
static int sockstat_seq_show(struct seq_file *seq, void *v)
{
- /* From net/socket.c */
- extern void socket_seq_show(struct seq_file *seq);
-
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
- tcp_tw_count, atomic_read(&tcp_sockets_allocated),
+ tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
atomic_read(&tcp_memory_allocated));
seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
@@ -93,9 +90,7 @@ fold_field(void *mib[], int offt)
unsigned long res = 0;
int i;
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
+ for_each_cpu(i) {
res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
}
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 0db405a869f..291831e792a 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -40,7 +40,6 @@
#include <linux/timer.h>
#include <net/ip.h>
#include <net/protocol.h>
-#include <net/tcp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/icmp.h>
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index d1835b1bc8c..4b0d7e4d626 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -59,7 +59,6 @@
#include <linux/netdevice.h>
#include <linux/in_route.h>
#include <linux/route.h>
-#include <linux/tcp.h>
#include <linux/skbuff.h>
#include <net/dst.h>
#include <net/sock.h>
@@ -71,6 +70,7 @@
#include <net/udp.h>
#include <net/raw.h>
#include <net/snmp.h>
+#include <net/tcp_states.h>
#include <net/inet_common.h>
#include <net/checksum.h>
#include <net/xfrm.h>
@@ -150,10 +150,11 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
* RFC 1122: SHOULD pass TOS value up to the transport layer.
* -> It does. And not only TOS, but all IP header.
*/
-void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
+int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
{
struct sock *sk;
struct hlist_head *head;
+ int delivered = 0;
read_lock(&raw_v4_lock);
head = &raw_v4_htable[hash];
@@ -164,6 +165,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
skb->dev->ifindex);
while (sk) {
+ delivered = 1;
if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
@@ -177,6 +179,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
}
out:
read_unlock(&raw_v4_lock);
+ return delivered;
}
void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
@@ -358,7 +361,7 @@ static void raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
if (type && code) {
get_user(fl->fl_icmp_type, type);
- __get_user(fl->fl_icmp_code, code);
+ get_user(fl->fl_icmp_code, code);
probed = 1;
}
break;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d675ff80b04..381dd6a6aeb 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -240,7 +240,9 @@ static unsigned rt_hash_mask;
static int rt_hash_log;
static unsigned int rt_hash_rnd;
-struct rt_cache_stat *rt_cache_stat;
+static struct rt_cache_stat *rt_cache_stat;
+#define RT_CACHE_STAT_INC(field) \
+ (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
static int rt_intern_hash(unsigned hash, struct rtable *rth,
struct rtable **res);
@@ -1758,6 +1760,7 @@ static inline int __mkroute_input(struct sk_buff *skb,
goto cleanup;
}
+ atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
if (res->fi->fib_nhs > 1)
@@ -1818,7 +1821,6 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb,
err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
if (err)
return err;
- atomic_set(&rth->u.dst.__refcnt, 1);
/* put it into the cache */
hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
@@ -1832,8 +1834,8 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
u32 daddr, u32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
- struct rtable* rth = NULL;
- unsigned char hop, hopcount, lasthop;
+ struct rtable* rth = NULL, *rtres;
+ unsigned char hop, hopcount;
int err = -EINVAL;
unsigned int hash;
@@ -1842,8 +1844,6 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
else
hopcount = 1;
- lasthop = hopcount - 1;
-
/* distinguish between multipath and singlepath */
if (hopcount < 2)
return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
@@ -1853,6 +1853,10 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
for (hop = 0; hop < hopcount; hop++) {
res->nh_sel = hop;
+ /* put reference to previous result */
+ if (hop)
+ ip_rt_put(rtres);
+
/* create a routing cache entry */
err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
&rth);
@@ -1861,7 +1865,7 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
/* put it into the cache */
hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
- err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+ err = rt_intern_hash(hash, rth, &rtres);
if (err)
return err;
@@ -1871,13 +1875,8 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
FIB_RES_NETMASK(*res),
res->prefixlen,
&FIB_RES_NH(*res));
-
- /* only for the last hop the reference count is handled
- * outside
- */
- if (hop == lasthop)
- atomic_set(&(skb->dst->__refcnt), 1);
}
+ skb->dst = &rtres->u.dst;
return err;
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
@@ -2129,7 +2128,7 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
struct in_device *in_dev;
rcu_read_lock();
- if ((in_dev = __in_dev_get(dev)) != NULL) {
+ if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
int our = ip_check_mc(in_dev, daddr, saddr,
skb->nh.iph->protocol);
if (our
@@ -2206,6 +2205,7 @@ static inline int __mkroute_output(struct rtable **result,
goto cleanup;
}
+ atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
if (res->fi) {
@@ -2288,8 +2288,6 @@ static inline int ip_mkroute_output_def(struct rtable **rp,
if (err == 0) {
u32 tos = RT_FL_TOS(oldflp);
- atomic_set(&rth->u.dst.__refcnt, 1);
-
hash = rt_hash_code(oldflp->fl4_dst,
oldflp->fl4_src ^ (oldflp->oif << 5), tos);
err = rt_intern_hash(hash, rth, rp);
@@ -2324,6 +2322,10 @@ static inline int ip_mkroute_output(struct rtable** rp,
dev2nexthop = FIB_RES_DEV(*res);
dev_hold(dev2nexthop);
+ /* put reference to previous result */
+ if (hop)
+ ip_rt_put(*rp);
+
err = __mkroute_output(&rth, res, fl, oldflp,
dev2nexthop, flags);
@@ -2348,7 +2350,6 @@ static inline int ip_mkroute_output(struct rtable** rp,
if (err != 0)
return err;
}
- atomic_set(&(*rp)->u.dst.__refcnt, 1);
return err;
} else {
return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
@@ -2442,7 +2443,9 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
err = -ENODEV;
if (dev_out == NULL)
goto out;
- if (__in_dev_get(dev_out) == NULL) {
+
+ /* RACE: Check return value of inet_select_addr instead. */
+ if (__in_dev_get_rtnl(dev_out) == NULL) {
dev_put(dev_out);
goto out; /* Wrong error code */
}
@@ -2600,6 +2603,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
return ip_route_output_slow(rp, flp);
}
+EXPORT_SYMBOL_GPL(__ip_route_output_key);
+
int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
{
int err;
@@ -2618,6 +2623,8 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk,
return 0;
}
+EXPORT_SYMBOL_GPL(ip_route_output_flow);
+
int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
return ip_route_output_flow(rp, flp, NULL, 0);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 72d01444218..a34e60ea48a 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -169,8 +169,6 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
}
-extern struct request_sock_ops tcp_request_sock_ops;
-
static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst)
@@ -180,7 +178,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
if (child)
- tcp_acceptq_queue(sk, req, child);
+ inet_csk_reqsk_queue_add(sk, req, child);
else
reqsk_free(req);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e3289453241..65268562351 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -11,7 +11,9 @@
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/config.h>
+#include <linux/igmp.h>
#include <net/snmp.h>
+#include <net/icmp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
@@ -19,36 +21,6 @@
/* From af_inet.c */
extern int sysctl_ip_nonlocal_bind;
-/* From icmp.c */
-extern int sysctl_icmp_echo_ignore_all;
-extern int sysctl_icmp_echo_ignore_broadcasts;
-extern int sysctl_icmp_ignore_bogus_error_responses;
-extern int sysctl_icmp_errors_use_inbound_ifaddr;
-
-/* From ip_fragment.c */
-extern int sysctl_ipfrag_low_thresh;
-extern int sysctl_ipfrag_high_thresh;
-extern int sysctl_ipfrag_time;
-extern int sysctl_ipfrag_secret_interval;
-
-/* From ip_output.c */
-extern int sysctl_ip_dynaddr;
-
-/* From icmp.c */
-extern int sysctl_icmp_ratelimit;
-extern int sysctl_icmp_ratemask;
-
-/* From igmp.c */
-extern int sysctl_igmp_max_memberships;
-extern int sysctl_igmp_max_msf;
-
-/* From inetpeer.c */
-extern int inet_peer_threshold;
-extern int inet_peer_minttl;
-extern int inet_peer_maxttl;
-extern int inet_peer_gc_mintime;
-extern int inet_peer_gc_maxtime;
-
#ifdef CONFIG_SYSCTL
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
@@ -57,8 +29,6 @@ static int ip_local_port_range_max[] = { 65535, 65535 };
struct ipv4_config ipv4_config;
-extern ctl_table ipv4_route_table[];
-
#ifdef CONFIG_SYSCTL
static
@@ -136,10 +106,11 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file *
return ret;
}
-int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen,
- void **context)
+static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name,
+ int nlen, void __user *oldval,
+ size_t __user *oldlenp,
+ void __user *newval, size_t newlen,
+ void **context)
{
char val[TCP_CA_NAME_MAX];
ctl_table tbl = {
@@ -259,7 +230,7 @@ ctl_table ipv4_table[] = {
{
.ctl_name = NET_TCP_MAX_TW_BUCKETS,
.procname = "tcp_max_tw_buckets",
- .data = &sysctl_tcp_max_tw_buckets,
+ .data = &tcp_death_row.sysctl_max_tw_buckets,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec
@@ -363,7 +334,7 @@ ctl_table ipv4_table[] = {
{
.ctl_name = NET_TCP_TW_RECYCLE,
.procname = "tcp_tw_recycle",
- .data = &sysctl_tcp_tw_recycle,
+ .data = &tcp_death_row.sysctl_tw_recycle,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ddb6ce4ecff..f3f0013a958 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -269,13 +269,12 @@
int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
-DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
-
-kmem_cache_t *tcp_bucket_cachep;
-kmem_cache_t *tcp_timewait_cachep;
+DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
atomic_t tcp_orphan_count = ATOMIC_INIT(0);
+EXPORT_SYMBOL_GPL(tcp_orphan_count);
+
int sysctl_tcp_mem[3];
int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
@@ -311,15 +310,6 @@ void tcp_enter_memory_pressure(void)
EXPORT_SYMBOL(tcp_enter_memory_pressure);
/*
- * LISTEN is a special case for poll..
- */
-static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
- poll_table *wait)
-{
- return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
-}
-
-/*
* Wait for a TCP event.
*
* Note that we don't need to lock the socket, as the upper poll layers
@@ -334,7 +324,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
poll_wait(file, sk->sk_sleep, wait);
if (sk->sk_state == TCP_LISTEN)
- return tcp_listen_poll(sk, wait);
+ return inet_csk_listen_poll(sk);
/* Socket is not locked. We are protected from async events
by poll logic and correct handling of state changes
@@ -457,109 +447,6 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return put_user(answ, (int __user *)arg);
}
-
-int tcp_listen_start(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
-
- if (rc != 0)
- return rc;
-
- sk->sk_max_ack_backlog = 0;
- sk->sk_ack_backlog = 0;
- tcp_delack_init(tp);
-
- /* There is race window here: we announce ourselves listening,
- * but this transition is still not validated by get_port().
- * It is OK, because this socket enters to hash table only
- * after validation is complete.
- */
- sk->sk_state = TCP_LISTEN;
- if (!sk->sk_prot->get_port(sk, inet->num)) {
- inet->sport = htons(inet->num);
-
- sk_dst_reset(sk);
- sk->sk_prot->hash(sk);
-
- return 0;
- }
-
- sk->sk_state = TCP_CLOSE;
- reqsk_queue_destroy(&tp->accept_queue);
- return -EADDRINUSE;
-}
-
-/*
- * This routine closes sockets which have been at least partially
- * opened, but not yet accepted.
- */
-
-static void tcp_listen_stop (struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct listen_sock *lopt;
- struct request_sock *acc_req;
- struct request_sock *req;
- int i;
-
- tcp_delete_keepalive_timer(sk);
-
- /* make all the listen_opt local to us */
- lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
- acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
-
- if (lopt->qlen) {
- for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
- while ((req = lopt->syn_table[i]) != NULL) {
- lopt->syn_table[i] = req->dl_next;
- lopt->qlen--;
- reqsk_free(req);
-
- /* Following specs, it would be better either to send FIN
- * (and enter FIN-WAIT-1, it is normal close)
- * or to send active reset (abort).
- * Certainly, it is pretty dangerous while synflood, but it is
- * bad justification for our negligence 8)
- * To be honest, we are not able to make either
- * of the variants now. --ANK
- */
- }
- }
- }
- BUG_TRAP(!lopt->qlen);
-
- kfree(lopt);
-
- while ((req = acc_req) != NULL) {
- struct sock *child = req->sk;
-
- acc_req = req->dl_next;
-
- local_bh_disable();
- bh_lock_sock(child);
- BUG_TRAP(!sock_owned_by_user(child));
- sock_hold(child);
-
- tcp_disconnect(child, O_NONBLOCK);
-
- sock_orphan(child);
-
- atomic_inc(&tcp_orphan_count);
-
- tcp_destroy_sock(child);
-
- bh_unlock_sock(child);
- local_bh_enable();
- sock_put(child);
-
- sk_acceptq_removed(sk);
- __reqsk_free(req);
- }
- BUG_TRAP(!sk->sk_ack_backlog);
-}
-
static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
@@ -584,7 +471,7 @@ static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
sk_charge_skb(sk, skb);
if (!sk->sk_send_head)
sk->sk_send_head = skb;
- else if (tp->nonagle&TCP_NAGLE_PUSH)
+ if (tp->nonagle & TCP_NAGLE_PUSH)
tp->nonagle &= ~TCP_NAGLE_PUSH;
}
@@ -665,8 +552,7 @@ new_segment:
tcp_mark_push(tp, skb);
goto new_segment;
}
- if (sk->sk_forward_alloc < copy &&
- !sk_stream_mem_schedule(sk, copy, 0))
+ if (!sk_stream_wmem_schedule(sk, copy))
goto wait_for_memory;
if (can_coalesce) {
@@ -883,19 +769,23 @@ new_segment:
if (off == PAGE_SIZE) {
put_page(page);
TCP_PAGE(sk) = page = NULL;
+ off = 0;
}
- }
+ } else
+ off = 0;
+
+ if (copy > PAGE_SIZE - off)
+ copy = PAGE_SIZE - off;
+
+ if (!sk_stream_wmem_schedule(sk, copy))
+ goto wait_for_memory;
if (!page) {
/* Allocate new cache page. */
if (!(page = sk_stream_alloc_page(sk)))
goto wait_for_memory;
- off = 0;
}
- if (copy > PAGE_SIZE - off)
- copy = PAGE_SIZE - off;
-
/* Time to copy data. We are close to
* the end! */
err = skb_copy_to_page(sk, from, skb, page,
@@ -975,7 +865,7 @@ do_fault:
if (!skb->len) {
if (sk->sk_send_head == skb)
sk->sk_send_head = NULL;
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &sk->sk_write_queue);
sk_stream_free_skb(sk, skb);
}
@@ -1057,20 +947,21 @@ static void cleanup_rbuf(struct sock *sk, int copied)
BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
#endif
- if (tcp_ack_scheduled(tp)) {
+ if (inet_csk_ack_scheduled(sk)) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
/* Delayed ACKs frequently hit locked sockets during bulk
* receive. */
- if (tp->ack.blocked ||
+ if (icsk->icsk_ack.blocked ||
/* Once-per-two-segments ACK was not sent by tcp_input.c */
- tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
+ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
/*
* If this read emptied read buffer, we send ACK, if
* connection is not bidirectional, user drained
* receive buffer and there was a small segment
* in queue.
*/
- (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
- !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
+ (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
+ !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
time_to_ack = 1;
}
@@ -1572,40 +1463,6 @@ void tcp_shutdown(struct sock *sk, int how)
}
}
-/*
- * At this point, there should be no process reference to this
- * socket, and thus no user references at all. Therefore we
- * can assume the socket waitqueue is inactive and nobody will
- * try to jump onto it.
- */
-void tcp_destroy_sock(struct sock *sk)
-{
- BUG_TRAP(sk->sk_state == TCP_CLOSE);
- BUG_TRAP(sock_flag(sk, SOCK_DEAD));
-
- /* It cannot be in hash table! */
- BUG_TRAP(sk_unhashed(sk));
-
- /* If it has not 0 inet_sk(sk)->num, it must be bound */
- BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
-
- sk->sk_prot->destroy(sk);
-
- sk_stream_kill_queues(sk);
-
- xfrm_sk_free_policy(sk);
-
-#ifdef INET_REFCNT_DEBUG
- if (atomic_read(&sk->sk_refcnt) != 1) {
- printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
- sk, atomic_read(&sk->sk_refcnt));
- }
-#endif
-
- atomic_dec(&tcp_orphan_count);
- sock_put(sk);
-}
-
void tcp_close(struct sock *sk, long timeout)
{
struct sk_buff *skb;
@@ -1618,7 +1475,7 @@ void tcp_close(struct sock *sk, long timeout)
tcp_set_state(sk, TCP_CLOSE);
/* Special case. */
- tcp_listen_stop(sk);
+ inet_csk_listen_stop(sk);
goto adjudge_to_death;
}
@@ -1721,12 +1578,12 @@ adjudge_to_death:
tcp_send_active_reset(sk, GFP_ATOMIC);
NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
} else {
- int tmo = tcp_fin_time(tp);
+ const int tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
- tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
+ inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
} else {
- atomic_inc(&tcp_orphan_count);
+ atomic_inc(sk->sk_prot->orphan_count);
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto out;
}
@@ -1734,7 +1591,7 @@ adjudge_to_death:
}
if (sk->sk_state != TCP_CLOSE) {
sk_stream_mem_reclaim(sk);
- if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
+ if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
if (net_ratelimit())
@@ -1745,10 +1602,10 @@ adjudge_to_death:
NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
}
}
- atomic_inc(&tcp_orphan_count);
+ atomic_inc(sk->sk_prot->orphan_count);
if (sk->sk_state == TCP_CLOSE)
- tcp_destroy_sock(sk);
+ inet_csk_destroy_sock(sk);
/* Otherwise, socket is reprieved until protocol close. */
out:
@@ -1769,6 +1626,7 @@ static inline int tcp_need_reset(int state)
int tcp_disconnect(struct sock *sk, int flags)
{
struct inet_sock *inet = inet_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int err = 0;
int old_state = sk->sk_state;
@@ -1778,7 +1636,7 @@ int tcp_disconnect(struct sock *sk, int flags)
/* ABORT function of RFC793 */
if (old_state == TCP_LISTEN) {
- tcp_listen_stop(sk);
+ inet_csk_listen_stop(sk);
} else if (tcp_need_reset(old_state) ||
(tp->snd_nxt != tp->write_seq &&
(1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
@@ -1805,125 +1663,34 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->srtt = 0;
if ((tp->write_seq += tp->max_window + 2) == 0)
tp->write_seq = 1;
- tp->backoff = 0;
+ icsk->icsk_backoff = 0;
tp->snd_cwnd = 2;
- tp->probes_out = 0;
+ icsk->icsk_probes_out = 0;
tp->packets_out = 0;
tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_cnt = 0;
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
- tcp_delack_init(tp);
+ inet_csk_delack_init(sk);
sk->sk_send_head = NULL;
tp->rx_opt.saw_tstamp = 0;
tcp_sack_reset(&tp->rx_opt);
__sk_dst_reset(sk);
- BUG_TRAP(!inet->num || tp->bind_hash);
+ BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
sk->sk_error_report(sk);
return err;
}
/*
- * Wait for an incoming connection, avoid race
- * conditions. This must be called with the socket locked.
- */
-static int wait_for_connect(struct sock *sk, long timeo)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- DEFINE_WAIT(wait);
- int err;
-
- /*
- * True wake-one mechanism for incoming connections: only
- * one process gets woken up, not the 'whole herd'.
- * Since we do not 'race & poll' for established sockets
- * anymore, the common case will execute the loop only once.
- *
- * Subtle issue: "add_wait_queue_exclusive()" will be added
- * after any current non-exclusive waiters, and we know that
- * it will always _stay_ after any new non-exclusive waiters
- * because all non-exclusive waiters are added at the
- * beginning of the wait-queue. As such, it's ok to "drop"
- * our exclusiveness temporarily when we get woken up without
- * having to remove and re-insert us on the wait queue.
- */
- for (;;) {
- prepare_to_wait_exclusive(sk->sk_sleep, &wait,
- TASK_INTERRUPTIBLE);
- release_sock(sk);
- if (reqsk_queue_empty(&tp->accept_queue))
- timeo = schedule_timeout(timeo);
- lock_sock(sk);
- err = 0;
- if (!reqsk_queue_empty(&tp->accept_queue))
- break;
- err = -EINVAL;
- if (sk->sk_state != TCP_LISTEN)
- break;
- err = sock_intr_errno(timeo);
- if (signal_pending(current))
- break;
- err = -EAGAIN;
- if (!timeo)
- break;
- }
- finish_wait(sk->sk_sleep, &wait);
- return err;
-}
-
-/*
- * This will accept the next outstanding connection.
- */
-
-struct sock *tcp_accept(struct sock *sk, int flags, int *err)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct sock *newsk;
- int error;
-
- lock_sock(sk);
-
- /* We need to make sure that this socket is listening,
- * and that it has something pending.
- */
- error = -EINVAL;
- if (sk->sk_state != TCP_LISTEN)
- goto out_err;
-
- /* Find already established connection */
- if (reqsk_queue_empty(&tp->accept_queue)) {
- long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
-
- /* If this is a non blocking socket don't sleep */
- error = -EAGAIN;
- if (!timeo)
- goto out_err;
-
- error = wait_for_connect(sk, timeo);
- if (error)
- goto out_err;
- }
-
- newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
- BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
-out:
- release_sock(sk);
- return newsk;
-out_err:
- newsk = NULL;
- *err = error;
- goto out;
-}
-
-/*
* Socket option code for TCP.
*/
int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
int optlen)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
int val;
int err = 0;
@@ -1945,7 +1712,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
name[val] = 0;
lock_sock(sk);
- err = tcp_set_congestion_control(tp, name);
+ err = tcp_set_congestion_control(sk, name);
release_sock(sk);
return err;
}
@@ -2022,7 +1789,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
elapsed = tp->keepalive_time - elapsed;
else
elapsed = 0;
- tcp_reset_keepalive_timer(sk, elapsed);
+ inet_csk_reset_keepalive_timer(sk, elapsed);
}
}
break;
@@ -2042,7 +1809,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
if (val < 1 || val > MAX_TCP_SYNCNT)
err = -EINVAL;
else
- tp->syn_retries = val;
+ icsk->icsk_syn_retries = val;
break;
case TCP_LINGER2:
@@ -2055,15 +1822,15 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
break;
case TCP_DEFER_ACCEPT:
- tp->defer_accept = 0;
+ icsk->icsk_accept_queue.rskq_defer_accept = 0;
if (val > 0) {
/* Translate value in seconds to number of
* retransmits */
- while (tp->defer_accept < 32 &&
+ while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
val > ((TCP_TIMEOUT_INIT / HZ) <<
- tp->defer_accept))
- tp->defer_accept++;
- tp->defer_accept++;
+ icsk->icsk_accept_queue.rskq_defer_accept))
+ icsk->icsk_accept_queue.rskq_defer_accept++;
+ icsk->icsk_accept_queue.rskq_defer_accept++;
}
break;
@@ -2081,16 +1848,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
case TCP_QUICKACK:
if (!val) {
- tp->ack.pingpong = 1;
+ icsk->icsk_ack.pingpong = 1;
} else {
- tp->ack.pingpong = 0;
+ icsk->icsk_ack.pingpong = 0;
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
- tcp_ack_scheduled(tp)) {
- tp->ack.pending |= TCP_ACK_PUSHED;
+ inet_csk_ack_scheduled(sk)) {
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
cleanup_rbuf(sk, 1);
if (!(val & 1))
- tp->ack.pingpong = 1;
+ icsk->icsk_ack.pingpong = 1;
}
}
break;
@@ -2107,15 +1874,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp;
memset(info, 0, sizeof(*info));
info->tcpi_state = sk->sk_state;
- info->tcpi_ca_state = tp->ca_state;
- info->tcpi_retransmits = tp->retransmits;
- info->tcpi_probes = tp->probes_out;
- info->tcpi_backoff = tp->backoff;
+ info->tcpi_ca_state = icsk->icsk_ca_state;
+ info->tcpi_retransmits = icsk->icsk_retransmits;
+ info->tcpi_probes = icsk->icsk_probes_out;
+ info->tcpi_backoff = icsk->icsk_backoff;
if (tp->rx_opt.tstamp_ok)
info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
@@ -2130,10 +1898,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (tp->ecn_flags&TCP_ECN_OK)
info->tcpi_options |= TCPI_OPT_ECN;
- info->tcpi_rto = jiffies_to_usecs(tp->rto);
- info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
+ info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
+ info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
info->tcpi_snd_mss = tp->mss_cache;
- info->tcpi_rcv_mss = tp->ack.rcv_mss;
+ info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
info->tcpi_unacked = tp->packets_out;
info->tcpi_sacked = tp->sacked_out;
@@ -2142,7 +1910,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_fackets = tp->fackets_out;
info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
- info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
+ info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
info->tcpi_pmtu = tp->pmtu_cookie;
@@ -2165,6 +1933,7 @@ EXPORT_SYMBOL_GPL(tcp_get_info);
int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
int __user *optlen)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int val, len;
@@ -2202,7 +1971,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
break;
case TCP_SYNCNT:
- val = tp->syn_retries ? : sysctl_tcp_syn_retries;
+ val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
break;
case TCP_LINGER2:
val = tp->linger2;
@@ -2210,8 +1979,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
val = (val ? : sysctl_tcp_fin_timeout) / HZ;
break;
case TCP_DEFER_ACCEPT:
- val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
- (tp->defer_accept - 1));
+ val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
+ ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
break;
case TCP_WINDOW_CLAMP:
val = tp->window_clamp;
@@ -2232,7 +2001,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
return 0;
}
case TCP_QUICKACK:
- val = !tp->ack.pingpong;
+ val = !icsk->icsk_ack.pingpong;
break;
case TCP_CONGESTION:
@@ -2241,7 +2010,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
if (put_user(len, optlen))
return -EFAULT;
- if (copy_to_user(optval, tp->ca_ops->name, len))
+ if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
return -EFAULT;
return 0;
default:
@@ -2278,79 +2047,72 @@ void __init tcp_init(void)
__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
sizeof(skb->cb));
- tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
- sizeof(struct tcp_bind_bucket),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (!tcp_bucket_cachep)
+ tcp_hashinfo.bind_bucket_cachep =
+ kmem_cache_create("tcp_bind_bucket",
+ sizeof(struct inet_bind_bucket), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!tcp_hashinfo.bind_bucket_cachep)
panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
- tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
- sizeof(struct tcp_tw_bucket),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (!tcp_timewait_cachep)
- panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
-
/* Size and allocate the main established and bind bucket
* hash tables.
*
* The methodology is similar to that of the buffer cache.
*/
- tcp_ehash = (struct tcp_ehash_bucket *)
+ tcp_hashinfo.ehash =
alloc_large_system_hash("TCP established",
- sizeof(struct tcp_ehash_bucket),
+ sizeof(struct inet_ehash_bucket),
thash_entries,
(num_physpages >= 128 * 1024) ?
(25 - PAGE_SHIFT) :
(27 - PAGE_SHIFT),
HASH_HIGHMEM,
- &tcp_ehash_size,
+ &tcp_hashinfo.ehash_size,
NULL,
0);
- tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
- for (i = 0; i < (tcp_ehash_size << 1); i++) {
- rwlock_init(&tcp_ehash[i].lock);
- INIT_HLIST_HEAD(&tcp_ehash[i].chain);
+ tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
+ for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
+ rwlock_init(&tcp_hashinfo.ehash[i].lock);
+ INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
}
- tcp_bhash = (struct tcp_bind_hashbucket *)
+ tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind",
- sizeof(struct tcp_bind_hashbucket),
- tcp_ehash_size,
+ sizeof(struct inet_bind_hashbucket),
+ tcp_hashinfo.ehash_size,
(num_physpages >= 128 * 1024) ?
(25 - PAGE_SHIFT) :
(27 - PAGE_SHIFT),
HASH_HIGHMEM,
- &tcp_bhash_size,
+ &tcp_hashinfo.bhash_size,
NULL,
64 * 1024);
- tcp_bhash_size = 1 << tcp_bhash_size;
- for (i = 0; i < tcp_bhash_size; i++) {
- spin_lock_init(&tcp_bhash[i].lock);
- INIT_HLIST_HEAD(&tcp_bhash[i].chain);
+ tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
+ for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
+ spin_lock_init(&tcp_hashinfo.bhash[i].lock);
+ INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
}
/* Try to be a bit smarter and adjust defaults depending
* on available memory.
*/
for (order = 0; ((1 << order) << PAGE_SHIFT) <
- (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
+ (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
order++)
;
if (order >= 4) {
sysctl_local_port_range[0] = 32768;
sysctl_local_port_range[1] = 61000;
- sysctl_tcp_max_tw_buckets = 180000;
+ tcp_death_row.sysctl_max_tw_buckets = 180000;
sysctl_tcp_max_orphans = 4096 << (order - 4);
sysctl_max_syn_backlog = 1024;
} else if (order < 3) {
sysctl_local_port_range[0] = 1024 * (3 - order);
- sysctl_tcp_max_tw_buckets >>= (3 - order);
+ tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
sysctl_tcp_max_orphans >>= (3 - order);
sysctl_max_syn_backlog = 128;
}
- tcp_port_rover = sysctl_local_port_range[0] - 1;
+ tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
sysctl_tcp_mem[0] = 768 << order;
sysctl_tcp_mem[1] = 1024 << order;
@@ -2365,14 +2127,12 @@ void __init tcp_init(void)
printk(KERN_INFO "TCP: Hash tables configured "
"(established %d bind %d)\n",
- tcp_ehash_size << 1, tcp_bhash_size);
+ tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
tcp_register_congestion_control(&tcp_reno);
}
-EXPORT_SYMBOL(tcp_accept);
EXPORT_SYMBOL(tcp_close);
-EXPORT_SYMBOL(tcp_destroy_sock);
EXPORT_SYMBOL(tcp_disconnect);
EXPORT_SYMBOL(tcp_getsockopt);
EXPORT_SYMBOL(tcp_ioctl);
@@ -2384,4 +2144,3 @@ EXPORT_SYMBOL(tcp_sendpage);
EXPORT_SYMBOL(tcp_setsockopt);
EXPORT_SYMBOL(tcp_shutdown);
EXPORT_SYMBOL(tcp_statistics);
-EXPORT_SYMBOL(tcp_timewait_cachep);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index ec38d45d664..ae35e060904 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -27,7 +27,7 @@
*/
static int fast_convergence = 1;
-static int max_increment = 32;
+static int max_increment = 16;
static int low_window = 14;
static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
static int low_utilization_threshold = 153;
@@ -86,11 +86,11 @@ static inline void bictcp_reset(struct bictcp *ca)
ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
}
-static void bictcp_init(struct tcp_sock *tp)
+static void bictcp_init(struct sock *sk)
{
- bictcp_reset(tcp_ca(tp));
+ bictcp_reset(inet_csk_ca(sk));
if (initial_ssthresh)
- tp->snd_ssthresh = initial_ssthresh;
+ tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
/*
@@ -136,7 +136,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
/* slow start */
ca->cnt = (cwnd * (BICTCP_B-1))
- / cwnd-ca->last_max_cwnd;
+ / (cwnd - ca->last_max_cwnd);
else
/* linear increase */
ca->cnt = cwnd / max_increment;
@@ -156,9 +156,10 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
/* Detect low utilization in congestion avoidance */
-static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
+static inline void bictcp_low_utilization(struct sock *sk, int flag)
{
- struct bictcp *ca = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
u32 dist, delay;
/* No time stamp */
@@ -208,12 +209,13 @@ static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
}
-static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
+static void bictcp_cong_avoid(struct sock *sk, u32 ack,
u32 seq_rtt, u32 in_flight, int data_acked)
{
- struct bictcp *ca = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
- bictcp_low_utilization(tp, data_acked);
+ bictcp_low_utilization(sk, data_acked);
if (in_flight < tp->snd_cwnd)
return;
@@ -242,9 +244,10 @@ static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
* behave like Reno until low_window is reached,
* then increase congestion window slowly
*/
-static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
+static u32 bictcp_recalc_ssthresh(struct sock *sk)
{
- struct bictcp *ca = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
ca->epoch_start = 0; /* end of epoch */
@@ -269,31 +272,34 @@ static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}
-static u32 bictcp_undo_cwnd(struct tcp_sock *tp)
+static u32 bictcp_undo_cwnd(struct sock *sk)
{
- struct bictcp *ca = tcp_ca(tp);
-
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct bictcp *ca = inet_csk_ca(sk);
return max(tp->snd_cwnd, ca->last_max_cwnd);
}
-static u32 bictcp_min_cwnd(struct tcp_sock *tp)
+static u32 bictcp_min_cwnd(struct sock *sk)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return tp->snd_ssthresh;
}
-static void bictcp_state(struct tcp_sock *tp, u8 new_state)
+static void bictcp_state(struct sock *sk, u8 new_state)
{
if (new_state == TCP_CA_Loss)
- bictcp_reset(tcp_ca(tp));
+ bictcp_reset(inet_csk_ca(sk));
}
/* Track delayed acknowledgement ratio using sliding window
* ratio = (15*ratio + sample) / 16
*/
-static void bictcp_acked(struct tcp_sock *tp, u32 cnt)
+static void bictcp_acked(struct sock *sk, u32 cnt)
{
- if (cnt > 0 && tp->ca_state == TCP_CA_Open) {
- struct bictcp *ca = tcp_ca(tp);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
+ struct bictcp *ca = inet_csk_ca(sk);
cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
ca->delayed_ack += cnt;
}
@@ -314,7 +320,7 @@ static struct tcp_congestion_ops bictcp = {
static int __init bictcp_register(void)
{
- BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE);
+ BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
return tcp_register_congestion_control(&bictcp);
}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 4970d10a778..bbf2d6624e8 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -73,33 +73,36 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
/* Assign choice of congestion control. */
-void tcp_init_congestion_control(struct tcp_sock *tp)
+void tcp_init_congestion_control(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_congestion_ops *ca;
- if (tp->ca_ops != &tcp_init_congestion_ops)
+ if (icsk->icsk_ca_ops != &tcp_init_congestion_ops)
return;
rcu_read_lock();
list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
if (try_module_get(ca->owner)) {
- tp->ca_ops = ca;
+ icsk->icsk_ca_ops = ca;
break;
}
}
rcu_read_unlock();
- if (tp->ca_ops->init)
- tp->ca_ops->init(tp);
+ if (icsk->icsk_ca_ops->init)
+ icsk->icsk_ca_ops->init(sk);
}
/* Manage refcounts on socket close. */
-void tcp_cleanup_congestion_control(struct tcp_sock *tp)
+void tcp_cleanup_congestion_control(struct sock *sk)
{
- if (tp->ca_ops->release)
- tp->ca_ops->release(tp);
- module_put(tp->ca_ops->owner);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->release)
+ icsk->icsk_ca_ops->release(sk);
+ module_put(icsk->icsk_ca_ops->owner);
}
/* Used by sysctl to change default congestion control */
@@ -143,14 +146,15 @@ void tcp_get_default_congestion_control(char *name)
}
/* Change congestion control for socket */
-int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
+int tcp_set_congestion_control(struct sock *sk, const char *name)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_congestion_ops *ca;
int err = 0;
rcu_read_lock();
ca = tcp_ca_find(name);
- if (ca == tp->ca_ops)
+ if (ca == icsk->icsk_ca_ops)
goto out;
if (!ca)
@@ -160,10 +164,10 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
err = -EBUSY;
else {
- tcp_cleanup_congestion_control(tp);
- tp->ca_ops = ca;
- if (tp->ca_ops->init)
- tp->ca_ops->init(tp);
+ tcp_cleanup_congestion_control(sk);
+ icsk->icsk_ca_ops = ca;
+ if (icsk->icsk_ca_ops->init)
+ icsk->icsk_ca_ops->init(sk);
}
out:
rcu_read_unlock();
@@ -177,9 +181,11 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328.
*/
-void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
int flag)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+
if (in_flight < tp->snd_cwnd)
return;
@@ -202,15 +208,17 @@ void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
/* Slow start threshold is half the congestion window (min 2) */
-u32 tcp_reno_ssthresh(struct tcp_sock *tp)
+u32 tcp_reno_ssthresh(struct sock *sk)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return max(tp->snd_cwnd >> 1U, 2U);
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
/* Lower bound on congestion window. */
-u32 tcp_reno_min_cwnd(struct tcp_sock *tp)
+u32 tcp_reno_min_cwnd(struct sock *sk)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return tp->snd_ssthresh/2;
}
EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index f66945cb158..c148c108188 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -1,5 +1,5 @@
/*
- * tcp_diag.c Module for monitoring TCP sockets.
+ * tcp_diag.c Module for monitoring TCP transport protocols sockets.
*
* Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
*
@@ -12,779 +12,43 @@
*/
#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/fcntl.h>
-#include <linux/random.h>
-#include <linux/cache.h>
-#include <linux/init.h>
-#include <linux/time.h>
-
-#include <net/icmp.h>
-#include <net/tcp.h>
-#include <net/ipv6.h>
-#include <net/inet_common.h>
-
-#include <linux/inet.h>
-#include <linux/stddef.h>
-
-#include <linux/tcp_diag.h>
-struct tcpdiag_entry
-{
- u32 *saddr;
- u32 *daddr;
- u16 sport;
- u16 dport;
- u16 family;
- u16 userlocks;
-};
+#include <linux/module.h>
+#include <linux/inet_diag.h>
-static struct sock *tcpnl;
+#include <linux/tcp.h>
-#define TCPDIAG_PUT(skb, attrtype, attrlen) \
- RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
+#include <net/tcp.h>
-static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
- int ext, u32 pid, u32 seq, u16 nlmsg_flags)
+static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *_info)
{
- struct inet_sock *inet = inet_sk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcpdiagmsg *r;
- struct nlmsghdr *nlh;
- struct tcp_info *info = NULL;
- struct tcpdiag_meminfo *minfo = NULL;
- unsigned char *b = skb->tail;
-
- nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
- nlh->nlmsg_flags = nlmsg_flags;
- r = NLMSG_DATA(nlh);
- if (sk->sk_state != TCP_TIME_WAIT) {
- if (ext & (1<<(TCPDIAG_MEMINFO-1)))
- minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo));
- if (ext & (1<<(TCPDIAG_INFO-1)))
- info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
-
- if (ext & (1<<(TCPDIAG_CONG-1))) {
- size_t len = strlen(tp->ca_ops->name);
- strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1),
- tp->ca_ops->name);
- }
- }
- r->tcpdiag_family = sk->sk_family;
- r->tcpdiag_state = sk->sk_state;
- r->tcpdiag_timer = 0;
- r->tcpdiag_retrans = 0;
-
- r->id.tcpdiag_if = sk->sk_bound_dev_if;
- r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk;
- r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
-
- if (r->tcpdiag_state == TCP_TIME_WAIT) {
- struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk;
- long tmo = tw->tw_ttd - jiffies;
- if (tmo < 0)
- tmo = 0;
-
- r->id.tcpdiag_sport = tw->tw_sport;
- r->id.tcpdiag_dport = tw->tw_dport;
- r->id.tcpdiag_src[0] = tw->tw_rcv_saddr;
- r->id.tcpdiag_dst[0] = tw->tw_daddr;
- r->tcpdiag_state = tw->tw_substate;
- r->tcpdiag_timer = 3;
- r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ;
- r->tcpdiag_rqueue = 0;
- r->tcpdiag_wqueue = 0;
- r->tcpdiag_uid = 0;
- r->tcpdiag_inode = 0;
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- if (r->tcpdiag_family == AF_INET6) {
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
- &tw->tw_v6_rcv_saddr);
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
- &tw->tw_v6_daddr);
- }
-#endif
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
- }
-
- r->id.tcpdiag_sport = inet->sport;
- r->id.tcpdiag_dport = inet->dport;
- r->id.tcpdiag_src[0] = inet->rcv_saddr;
- r->id.tcpdiag_dst[0] = inet->daddr;
-
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- if (r->tcpdiag_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
- &np->rcv_saddr);
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
- &np->daddr);
- }
-#endif
-
-#define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ
-
- if (tp->pending == TCP_TIME_RETRANS) {
- r->tcpdiag_timer = 1;
- r->tcpdiag_retrans = tp->retransmits;
- r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
- } else if (tp->pending == TCP_TIME_PROBE0) {
- r->tcpdiag_timer = 4;
- r->tcpdiag_retrans = tp->probes_out;
- r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
- } else if (timer_pending(&sk->sk_timer)) {
- r->tcpdiag_timer = 2;
- r->tcpdiag_retrans = tp->probes_out;
- r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
- } else {
- r->tcpdiag_timer = 0;
- r->tcpdiag_expires = 0;
- }
-#undef EXPIRES_IN_MS
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_info *info = _info;
- r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq;
- r->tcpdiag_wqueue = tp->write_seq - tp->snd_una;
- r->tcpdiag_uid = sock_i_uid(sk);
- r->tcpdiag_inode = sock_i_ino(sk);
-
- if (minfo) {
- minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc);
- minfo->tcpdiag_wmem = sk->sk_wmem_queued;
- minfo->tcpdiag_fmem = sk->sk_forward_alloc;
- minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc);
- }
-
- if (info)
+ r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq;
+ r->idiag_wqueue = tp->write_seq - tp->snd_una;
+ if (info != NULL)
tcp_get_info(sk, info);
-
- if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info)
- tp->ca_ops->get_info(tp, ext, skb);
-
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
-
-rtattr_failure:
-nlmsg_failure:
- skb_trim(skb, b - skb->data);
- return -1;
-}
-
-extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport,
- int dif);
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
- struct in6_addr *daddr, u16 dport,
- int dif);
-#else
-static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
- struct in6_addr *daddr, u16 dport,
- int dif)
-{
- return NULL;
-}
-#endif
-
-static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
-{
- int err;
- struct sock *sk;
- struct tcpdiagreq *req = NLMSG_DATA(nlh);
- struct sk_buff *rep;
-
- if (req->tcpdiag_family == AF_INET) {
- sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport,
- req->id.tcpdiag_src[0], req->id.tcpdiag_sport,
- req->id.tcpdiag_if);
- }
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- else if (req->tcpdiag_family == AF_INET6) {
- sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport,
- (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport,
- req->id.tcpdiag_if);
- }
-#endif
- else {
- return -EINVAL;
- }
-
- if (sk == NULL)
- return -ENOENT;
-
- err = -ESTALE;
- if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE ||
- req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) &&
- ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] ||
- (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1]))
- goto out;
-
- err = -ENOMEM;
- rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+
- sizeof(struct tcpdiag_meminfo)+
- sizeof(struct tcp_info)+64), GFP_KERNEL);
- if (!rep)
- goto out;
-
- if (tcpdiag_fill(rep, sk, req->tcpdiag_ext,
- NETLINK_CB(in_skb).pid,
- nlh->nlmsg_seq, 0) <= 0)
- BUG();
-
- err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
- if (err > 0)
- err = 0;
-
-out:
- if (sk) {
- if (sk->sk_state == TCP_TIME_WAIT)
- tcp_tw_put((struct tcp_tw_bucket*)sk);
- else
- sock_put(sk);
- }
- return err;
-}
-
-static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
-{
- int words = bits >> 5;
-
- bits &= 0x1f;
-
- if (words) {
- if (memcmp(a1, a2, words << 2))
- return 0;
- }
- if (bits) {
- __u32 w1, w2;
- __u32 mask;
-
- w1 = a1[words];
- w2 = a2[words];
-
- mask = htonl((0xffffffff) << (32 - bits));
-
- if ((w1 ^ w2) & mask)
- return 0;
- }
-
- return 1;
-}
-
-
-static int tcpdiag_bc_run(const void *bc, int len,
- const struct tcpdiag_entry *entry)
-{
- while (len > 0) {
- int yes = 1;
- const struct tcpdiag_bc_op *op = bc;
-
- switch (op->code) {
- case TCPDIAG_BC_NOP:
- break;
- case TCPDIAG_BC_JMP:
- yes = 0;
- break;
- case TCPDIAG_BC_S_GE:
- yes = entry->sport >= op[1].no;
- break;
- case TCPDIAG_BC_S_LE:
- yes = entry->dport <= op[1].no;
- break;
- case TCPDIAG_BC_D_GE:
- yes = entry->dport >= op[1].no;
- break;
- case TCPDIAG_BC_D_LE:
- yes = entry->dport <= op[1].no;
- break;
- case TCPDIAG_BC_AUTO:
- yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
- break;
- case TCPDIAG_BC_S_COND:
- case TCPDIAG_BC_D_COND:
- {
- struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1);
- u32 *addr;
-
- if (cond->port != -1 &&
- cond->port != (op->code == TCPDIAG_BC_S_COND ?
- entry->sport : entry->dport)) {
- yes = 0;
- break;
- }
-
- if (cond->prefix_len == 0)
- break;
-
- if (op->code == TCPDIAG_BC_S_COND)
- addr = entry->saddr;
- else
- addr = entry->daddr;
-
- if (bitstring_match(addr, cond->addr, cond->prefix_len))
- break;
- if (entry->family == AF_INET6 &&
- cond->family == AF_INET) {
- if (addr[0] == 0 && addr[1] == 0 &&
- addr[2] == htonl(0xffff) &&
- bitstring_match(addr+3, cond->addr, cond->prefix_len))
- break;
- }
- yes = 0;
- break;
- }
- }
-
- if (yes) {
- len -= op->yes;
- bc += op->yes;
- } else {
- len -= op->no;
- bc += op->no;
- }
- }
- return (len == 0);
-}
-
-static int valid_cc(const void *bc, int len, int cc)
-{
- while (len >= 0) {
- const struct tcpdiag_bc_op *op = bc;
-
- if (cc > len)
- return 0;
- if (cc == len)
- return 1;
- if (op->yes < 4)
- return 0;
- len -= op->yes;
- bc += op->yes;
- }
- return 0;
-}
-
-static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len)
-{
- const unsigned char *bc = bytecode;
- int len = bytecode_len;
-
- while (len > 0) {
- struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc;
-
-//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
- switch (op->code) {
- case TCPDIAG_BC_AUTO:
- case TCPDIAG_BC_S_COND:
- case TCPDIAG_BC_D_COND:
- case TCPDIAG_BC_S_GE:
- case TCPDIAG_BC_S_LE:
- case TCPDIAG_BC_D_GE:
- case TCPDIAG_BC_D_LE:
- if (op->yes < 4 || op->yes > len+4)
- return -EINVAL;
- case TCPDIAG_BC_JMP:
- if (op->no < 4 || op->no > len+4)
- return -EINVAL;
- if (op->no < len &&
- !valid_cc(bytecode, bytecode_len, len-op->no))
- return -EINVAL;
- break;
- case TCPDIAG_BC_NOP:
- if (op->yes < 4 || op->yes > len+4)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
- bc += op->yes;
- len -= op->yes;
- }
- return len == 0 ? 0 : -EINVAL;
-}
-
-static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk,
- struct netlink_callback *cb)
-{
- struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
-
- if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
- struct tcpdiag_entry entry;
- struct rtattr *bc = (struct rtattr *)(r + 1);
- struct inet_sock *inet = inet_sk(sk);
-
- entry.family = sk->sk_family;
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- if (entry.family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
- entry.saddr = np->rcv_saddr.s6_addr32;
- entry.daddr = np->daddr.s6_addr32;
- } else
-#endif
- {
- entry.saddr = &inet->rcv_saddr;
- entry.daddr = &inet->daddr;
- }
- entry.sport = inet->num;
- entry.dport = ntohs(inet->dport);
- entry.userlocks = sk->sk_userlocks;
-
- if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
- return 0;
- }
-
- return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI);
}
-static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
- struct request_sock *req,
- u32 pid, u32 seq)
-{
- const struct inet_request_sock *ireq = inet_rsk(req);
- struct inet_sock *inet = inet_sk(sk);
- unsigned char *b = skb->tail;
- struct tcpdiagmsg *r;
- struct nlmsghdr *nlh;
- long tmo;
-
- nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
- nlh->nlmsg_flags = NLM_F_MULTI;
- r = NLMSG_DATA(nlh);
-
- r->tcpdiag_family = sk->sk_family;
- r->tcpdiag_state = TCP_SYN_RECV;
- r->tcpdiag_timer = 1;
- r->tcpdiag_retrans = req->retrans;
-
- r->id.tcpdiag_if = sk->sk_bound_dev_if;
- r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req;
- r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
-
- tmo = req->expires - jiffies;
- if (tmo < 0)
- tmo = 0;
-
- r->id.tcpdiag_sport = inet->sport;
- r->id.tcpdiag_dport = ireq->rmt_port;
- r->id.tcpdiag_src[0] = ireq->loc_addr;
- r->id.tcpdiag_dst[0] = ireq->rmt_addr;
- r->tcpdiag_expires = jiffies_to_msecs(tmo),
- r->tcpdiag_rqueue = 0;
- r->tcpdiag_wqueue = 0;
- r->tcpdiag_uid = sock_i_uid(sk);
- r->tcpdiag_inode = 0;
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- if (r->tcpdiag_family == AF_INET6) {
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
- &tcp6_rsk(req)->loc_addr);
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
- &tcp6_rsk(req)->rmt_addr);
- }
-#endif
- nlh->nlmsg_len = skb->tail - b;
-
- return skb->len;
-
-nlmsg_failure:
- skb_trim(skb, b - skb->data);
- return -1;
-}
-
-static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
- struct netlink_callback *cb)
-{
- struct tcpdiag_entry entry;
- struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
- struct tcp_sock *tp = tcp_sk(sk);
- struct listen_sock *lopt;
- struct rtattr *bc = NULL;
- struct inet_sock *inet = inet_sk(sk);
- int j, s_j;
- int reqnum, s_reqnum;
- int err = 0;
-
- s_j = cb->args[3];
- s_reqnum = cb->args[4];
-
- if (s_j > 0)
- s_j--;
-
- entry.family = sk->sk_family;
-
- read_lock_bh(&tp->accept_queue.syn_wait_lock);
-
- lopt = tp->accept_queue.listen_opt;
- if (!lopt || !lopt->qlen)
- goto out;
-
- if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
- bc = (struct rtattr *)(r + 1);
- entry.sport = inet->num;
- entry.userlocks = sk->sk_userlocks;
- }
-
- for (j = s_j; j < TCP_SYNQ_HSIZE; j++) {
- struct request_sock *req, *head = lopt->syn_table[j];
-
- reqnum = 0;
- for (req = head; req; reqnum++, req = req->dl_next) {
- struct inet_request_sock *ireq = inet_rsk(req);
-
- if (reqnum < s_reqnum)
- continue;
- if (r->id.tcpdiag_dport != ireq->rmt_port &&
- r->id.tcpdiag_dport)
- continue;
-
- if (bc) {
- entry.saddr =
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- (entry.family == AF_INET6) ?
- tcp6_rsk(req)->loc_addr.s6_addr32 :
-#endif
- &ireq->loc_addr;
- entry.daddr =
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- (entry.family == AF_INET6) ?
- tcp6_rsk(req)->rmt_addr.s6_addr32 :
-#endif
- &ireq->rmt_addr;
- entry.dport = ntohs(ireq->rmt_port);
-
- if (!tcpdiag_bc_run(RTA_DATA(bc),
- RTA_PAYLOAD(bc), &entry))
- continue;
- }
-
- err = tcpdiag_fill_req(skb, sk, req,
- NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq);
- if (err < 0) {
- cb->args[3] = j + 1;
- cb->args[4] = reqnum;
- goto out;
- }
- }
-
- s_reqnum = 0;
- }
-
-out:
- read_unlock_bh(&tp->accept_queue.syn_wait_lock);
-
- return err;
-}
-
-static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
- int i, num;
- int s_i, s_num;
- struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
-
- s_i = cb->args[1];
- s_num = num = cb->args[2];
-
- if (cb->args[0] == 0) {
- if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV)))
- goto skip_listen_ht;
- tcp_listen_lock();
- for (i = s_i; i < TCP_LHTABLE_SIZE; i++) {
- struct sock *sk;
- struct hlist_node *node;
-
- num = 0;
- sk_for_each(sk, node, &tcp_listening_hash[i]) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (num < s_num) {
- num++;
- continue;
- }
-
- if (r->id.tcpdiag_sport != inet->sport &&
- r->id.tcpdiag_sport)
- goto next_listen;
-
- if (!(r->tcpdiag_states&TCPF_LISTEN) ||
- r->id.tcpdiag_dport ||
- cb->args[3] > 0)
- goto syn_recv;
-
- if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
- tcp_listen_unlock();
- goto done;
- }
-
-syn_recv:
- if (!(r->tcpdiag_states&TCPF_SYN_RECV))
- goto next_listen;
-
- if (tcpdiag_dump_reqs(skb, sk, cb) < 0) {
- tcp_listen_unlock();
- goto done;
- }
-
-next_listen:
- cb->args[3] = 0;
- cb->args[4] = 0;
- ++num;
- }
-
- s_num = 0;
- cb->args[3] = 0;
- cb->args[4] = 0;
- }
- tcp_listen_unlock();
-skip_listen_ht:
- cb->args[0] = 1;
- s_i = num = s_num = 0;
- }
-
- if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV)))
- return skb->len;
-
- for (i = s_i; i < tcp_ehash_size; i++) {
- struct tcp_ehash_bucket *head = &tcp_ehash[i];
- struct sock *sk;
- struct hlist_node *node;
-
- if (i > s_i)
- s_num = 0;
-
- read_lock_bh(&head->lock);
-
- num = 0;
- sk_for_each(sk, node, &head->chain) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (num < s_num)
- goto next_normal;
- if (!(r->tcpdiag_states & (1 << sk->sk_state)))
- goto next_normal;
- if (r->id.tcpdiag_sport != inet->sport &&
- r->id.tcpdiag_sport)
- goto next_normal;
- if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport)
- goto next_normal;
- if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
- read_unlock_bh(&head->lock);
- goto done;
- }
-next_normal:
- ++num;
- }
-
- if (r->tcpdiag_states&TCPF_TIME_WAIT) {
- sk_for_each(sk, node,
- &tcp_ehash[i + tcp_ehash_size].chain) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (num < s_num)
- goto next_dying;
- if (r->id.tcpdiag_sport != inet->sport &&
- r->id.tcpdiag_sport)
- goto next_dying;
- if (r->id.tcpdiag_dport != inet->dport &&
- r->id.tcpdiag_dport)
- goto next_dying;
- if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
- read_unlock_bh(&head->lock);
- goto done;
- }
-next_dying:
- ++num;
- }
- }
- read_unlock_bh(&head->lock);
- }
-
-done:
- cb->args[1] = i;
- cb->args[2] = num;
- return skb->len;
-}
-
-static int tcpdiag_dump_done(struct netlink_callback *cb)
-{
- return 0;
-}
-
-
-static __inline__ int
-tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
-{
- if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
- return 0;
-
- if (nlh->nlmsg_type != TCPDIAG_GETSOCK)
- goto err_inval;
-
- if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len)
- goto err_inval;
-
- if (nlh->nlmsg_flags&NLM_F_DUMP) {
- if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) {
- struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq));
- if (rta->rta_type != TCPDIAG_REQ_BYTECODE ||
- rta->rta_len < 8 ||
- rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq)))
- goto err_inval;
- if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
- goto err_inval;
- }
- return netlink_dump_start(tcpnl, skb, nlh,
- tcpdiag_dump,
- tcpdiag_dump_done);
- } else {
- return tcpdiag_get_exact(skb, nlh);
- }
-
-err_inval:
- return -EINVAL;
-}
-
-
-static inline void tcpdiag_rcv_skb(struct sk_buff *skb)
-{
- int err;
- struct nlmsghdr * nlh;
-
- if (skb->len >= NLMSG_SPACE(0)) {
- nlh = (struct nlmsghdr *)skb->data;
- if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
- return;
- err = tcpdiag_rcv_msg(skb, nlh);
- if (err || nlh->nlmsg_flags & NLM_F_ACK)
- netlink_ack(skb, nlh, err);
- }
-}
-
-static void tcpdiag_rcv(struct sock *sk, int len)
-{
- struct sk_buff *skb;
- unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
-
- while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
- tcpdiag_rcv_skb(skb);
- kfree_skb(skb);
- }
-}
+static struct inet_diag_handler tcp_diag_handler = {
+ .idiag_hashinfo = &tcp_hashinfo,
+ .idiag_get_info = tcp_diag_get_info,
+ .idiag_type = TCPDIAG_GETSOCK,
+ .idiag_info_size = sizeof(struct tcp_info),
+};
-static int __init tcpdiag_init(void)
+static int __init tcp_diag_init(void)
{
- tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv);
- if (tcpnl == NULL)
- return -ENOMEM;
- return 0;
+ return inet_diag_register(&tcp_diag_handler);
}
-static void __exit tcpdiag_exit(void)
+static void __exit tcp_diag_exit(void)
{
- sock_release(tcpnl->sk_socket);
+ inet_diag_unregister(&tcp_diag_handler);
}
-module_init(tcpdiag_init);
-module_exit(tcpdiag_exit);
+module_init(tcp_diag_init);
+module_exit(tcp_diag_exit);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 36c51f8136b..6acc04bde08 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -98,9 +98,10 @@ struct hstcp {
u32 ai;
};
-static void hstcp_init(struct tcp_sock *tp)
+static void hstcp_init(struct sock *sk)
{
- struct hstcp *ca = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hstcp *ca = inet_csk_ca(sk);
ca->ai = 0;
@@ -109,10 +110,11 @@ static void hstcp_init(struct tcp_sock *tp)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt,
+static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
u32 in_flight, int good)
{
- struct hstcp *ca = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hstcp *ca = inet_csk_ca(sk);
if (in_flight < tp->snd_cwnd)
return;
@@ -143,9 +145,10 @@ static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt,
}
}
-static u32 hstcp_ssthresh(struct tcp_sock *tp)
+static u32 hstcp_ssthresh(struct sock *sk)
{
- struct hstcp *ca = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct hstcp *ca = inet_csk_ca(sk);
/* Do multiplicative decrease */
return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
@@ -164,7 +167,7 @@ static struct tcp_congestion_ops tcp_highspeed = {
static int __init hstcp_register(void)
{
- BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE);
+ BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE);
return tcp_register_congestion_control(&tcp_highspeed);
}
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 40168275acf..e47b37984e9 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -55,18 +55,21 @@ static inline void htcp_reset(struct htcp *ca)
ca->snd_cwnd_cnt2 = 0;
}
-static u32 htcp_cwnd_undo(struct tcp_sock *tp)
+static u32 htcp_cwnd_undo(struct sock *sk)
{
- struct htcp *ca = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
ca->ccount = ca->undo_ccount;
ca->maxRTT = ca->undo_maxRTT;
ca->old_maxB = ca->undo_old_maxB;
return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
}
-static inline void measure_rtt(struct tcp_sock *tp)
+static inline void measure_rtt(struct sock *sk)
{
- struct htcp *ca = tcp_ca(tp);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
u32 srtt = tp->srtt>>3;
/* keep track of minimum RTT seen so far, minRTT is zero at first */
@@ -74,7 +77,7 @@ static inline void measure_rtt(struct tcp_sock *tp)
ca->minRTT = srtt;
/* max RTT */
- if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
+ if (icsk->icsk_ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
if (ca->maxRTT < ca->minRTT)
ca->maxRTT = ca->minRTT;
if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
@@ -82,13 +85,16 @@ static inline void measure_rtt(struct tcp_sock *tp)
}
}
-static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked)
+static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked)
{
- struct htcp *ca = tcp_ca(tp);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
u32 now = tcp_time_stamp;
/* achieved throughput calculations */
- if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) {
+ if (icsk->icsk_ca_state != TCP_CA_Open &&
+ icsk->icsk_ca_state != TCP_CA_Disorder) {
ca->packetcount = 0;
ca->lasttime = now;
return;
@@ -173,9 +179,9 @@ static inline void htcp_alpha_update(struct htcp *ca)
* that point do we really have a real sense of maxRTT (the queues en route
* were getting just too full now).
*/
-static void htcp_param_update(struct tcp_sock *tp)
+static void htcp_param_update(struct sock *sk)
{
- struct htcp *ca = tcp_ca(tp);
+ struct htcp *ca = inet_csk_ca(sk);
u32 minRTT = ca->minRTT;
u32 maxRTT = ca->maxRTT;
@@ -187,17 +193,19 @@ static void htcp_param_update(struct tcp_sock *tp)
ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
}
-static u32 htcp_recalc_ssthresh(struct tcp_sock *tp)
+static u32 htcp_recalc_ssthresh(struct sock *sk)
{
- struct htcp *ca = tcp_ca(tp);
- htcp_param_update(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct htcp *ca = inet_csk_ca(sk);
+ htcp_param_update(sk);
return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
}
-static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
u32 in_flight, int data_acked)
{
- struct htcp *ca = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
if (in_flight < tp->snd_cwnd)
return;
@@ -207,7 +215,7 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
if (tp->snd_cwnd < tp->snd_cwnd_clamp)
tp->snd_cwnd++;
} else {
- measure_rtt(tp);
+ measure_rtt(sk);
/* keep track of number of round-trip times since last backoff event */
if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
@@ -229,28 +237,29 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
}
/* Lower bound on congestion window. */
-static u32 htcp_min_cwnd(struct tcp_sock *tp)
+static u32 htcp_min_cwnd(struct sock *sk)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return tp->snd_ssthresh;
}
-static void htcp_init(struct tcp_sock *tp)
+static void htcp_init(struct sock *sk)
{
- struct htcp *ca = tcp_ca(tp);
+ struct htcp *ca = inet_csk_ca(sk);
memset(ca, 0, sizeof(struct htcp));
ca->alpha = ALPHA_BASE;
ca->beta = BETA_MIN;
}
-static void htcp_state(struct tcp_sock *tp, u8 new_state)
+static void htcp_state(struct sock *sk, u8 new_state)
{
switch (new_state) {
case TCP_CA_CWR:
case TCP_CA_Recovery:
case TCP_CA_Loss:
- htcp_reset(tcp_ca(tp));
+ htcp_reset(inet_csk_ca(sk));
break;
}
}
@@ -269,7 +278,7 @@ static struct tcp_congestion_ops htcp = {
static int __init htcp_register(void)
{
- BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE);
+ BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE);
BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
if (!use_bandwidth_switch)
htcp.pkts_acked = NULL;
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 13a66342c30..77add63623d 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -33,19 +33,20 @@ MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
/* This is called to refresh values for hybla parameters */
-static inline void hybla_recalc_param (struct tcp_sock *tp)
+static inline void hybla_recalc_param (struct sock *sk)
{
- struct hybla *ca = tcp_ca(tp);
+ struct hybla *ca = inet_csk_ca(sk);
- ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8);
+ ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
ca->rho = ca->rho_3ls >> 3;
ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
ca->rho2 = ca->rho2_7ls >>7;
}
-static void hybla_init(struct tcp_sock *tp)
+static void hybla_init(struct sock *sk)
{
- struct hybla *ca = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hybla *ca = inet_csk_ca(sk);
ca->rho = 0;
ca->rho2 = 0;
@@ -57,17 +58,16 @@ static void hybla_init(struct tcp_sock *tp)
tp->snd_cwnd_clamp = 65535;
/* 1st Rho measurement based on initial srtt */
- hybla_recalc_param(tp);
+ hybla_recalc_param(sk);
/* set minimum rtt as this is the 1st ever seen */
ca->minrtt = tp->srtt;
tp->snd_cwnd = ca->rho;
}
-static void hybla_state(struct tcp_sock *tp, u8 ca_state)
+static void hybla_state(struct sock *sk, u8 ca_state)
{
- struct hybla *ca = tcp_ca(tp);
-
+ struct hybla *ca = inet_csk_ca(sk);
ca->hybla_en = (ca_state == TCP_CA_Open);
}
@@ -86,27 +86,28 @@ static inline u32 hybla_fraction(u32 odds)
* o Give cwnd a new value based on the model proposed
* o remember increments <1
*/
-static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
u32 in_flight, int flag)
{
- struct hybla *ca = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hybla *ca = inet_csk_ca(sk);
u32 increment, odd, rho_fractions;
int is_slowstart = 0;
/* Recalculate rho only if this srtt is the lowest */
if (tp->srtt < ca->minrtt){
- hybla_recalc_param(tp);
+ hybla_recalc_param(sk);
ca->minrtt = tp->srtt;
}
if (!ca->hybla_en)
- return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
+ return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
if (in_flight < tp->snd_cwnd)
return;
if (ca->rho == 0)
- hybla_recalc_param(tp);
+ hybla_recalc_param(sk);
rho_fractions = ca->rho_3ls - (ca->rho << 3);
@@ -170,7 +171,7 @@ static struct tcp_congestion_ops tcp_hybla = {
static int __init hybla_register(void)
{
- BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE);
+ BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE);
return tcp_register_congestion_control(&tcp_hybla);
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53a8a5399f1..3e98b57578d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -114,20 +114,21 @@ int sysctl_tcp_moderate_rcvbuf = 1;
/* Adapt the MSS value used to make delayed ack decision to the
* real world.
*/
-static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
- struct sk_buff *skb)
+static inline void tcp_measure_rcv_mss(struct sock *sk,
+ const struct sk_buff *skb)
{
- unsigned int len, lss;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const unsigned int lss = icsk->icsk_ack.last_seg_size;
+ unsigned int len;
- lss = tp->ack.last_seg_size;
- tp->ack.last_seg_size = 0;
+ icsk->icsk_ack.last_seg_size = 0;
/* skb->len may jitter because of SACKs, even if peer
* sends good full-sized frames.
*/
len = skb->len;
- if (len >= tp->ack.rcv_mss) {
- tp->ack.rcv_mss = len;
+ if (len >= icsk->icsk_ack.rcv_mss) {
+ icsk->icsk_ack.rcv_mss = len;
} else {
/* Otherwise, we make more careful check taking into account,
* that SACKs block is variable.
@@ -147,41 +148,44 @@ static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
* tcp header plus fixed timestamp option length.
* Resulting "len" is MSS free of SACK jitter.
*/
- len -= tp->tcp_header_len;
- tp->ack.last_seg_size = len;
+ len -= tcp_sk(sk)->tcp_header_len;
+ icsk->icsk_ack.last_seg_size = len;
if (len == lss) {
- tp->ack.rcv_mss = len;
+ icsk->icsk_ack.rcv_mss = len;
return;
}
}
- tp->ack.pending |= TCP_ACK_PUSHED;
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
}
}
-static void tcp_incr_quickack(struct tcp_sock *tp)
+static void tcp_incr_quickack(struct sock *sk)
{
- unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
if (quickacks==0)
quickacks=2;
- if (quickacks > tp->ack.quick)
- tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+ if (quickacks > icsk->icsk_ack.quick)
+ icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}
-void tcp_enter_quickack_mode(struct tcp_sock *tp)
+void tcp_enter_quickack_mode(struct sock *sk)
{
- tcp_incr_quickack(tp);
- tp->ack.pingpong = 0;
- tp->ack.ato = TCP_ATO_MIN;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_incr_quickack(sk);
+ icsk->icsk_ack.pingpong = 0;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
}
/* Send ACKs quickly, if "quick" count is not exhausted
* and the session is not interactive.
*/
-static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp)
+static inline int tcp_in_quickack_mode(const struct sock *sk)
{
- return (tp->ack.quick && !tp->ack.pingpong);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
}
/* Buffer size and advertised window tuning.
@@ -224,8 +228,8 @@ static void tcp_fixup_sndbuf(struct sock *sk)
*/
/* Slow part of check#2. */
-static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
- struct sk_buff *skb)
+static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
+ const struct sk_buff *skb)
{
/* Optimize this! */
int truesize = tcp_win_from_space(skb->truesize)/2;
@@ -233,7 +237,7 @@ static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
while (tp->rcv_ssthresh <= window) {
if (truesize <= skb->len)
- return 2*tp->ack.rcv_mss;
+ return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
truesize >>= 1;
window >>= 1;
@@ -260,7 +264,7 @@ static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
if (incr) {
tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
- tp->ack.quick |= 1;
+ inet_csk(sk)->icsk_ack.quick |= 1;
}
}
}
@@ -321,11 +325,12 @@ static void tcp_init_buffer_space(struct sock *sk)
/* 5. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
int ofo_win = 0;
- tp->ack.quick = 0;
+ icsk->icsk_ack.quick = 0;
skb_queue_walk(&tp->out_of_order_queue, skb) {
ofo_win += skb->len;
@@ -346,12 +351,10 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
app_win += ofo_win;
if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
app_win >>= 1;
- if (app_win > tp->ack.rcv_mss)
- app_win -= tp->ack.rcv_mss;
+ if (app_win > icsk->icsk_ack.rcv_mss)
+ app_win -= icsk->icsk_ack.rcv_mss;
app_win = max(app_win, 2U*tp->advmss);
- if (!ofo_win)
- tp->window_clamp = min(tp->window_clamp, app_win);
tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
}
}
@@ -415,11 +418,12 @@ new_measure:
tp->rcv_rtt_est.time = tcp_time_stamp;
}
-static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb)
+static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb)
{
+ struct tcp_sock *tp = tcp_sk(sk);
if (tp->rx_opt.rcv_tsecr &&
(TCP_SKB_CB(skb)->end_seq -
- TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss))
+ TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
}
@@ -492,41 +496,42 @@ new_measure:
*/
static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
u32 now;
- tcp_schedule_ack(tp);
+ inet_csk_schedule_ack(sk);
- tcp_measure_rcv_mss(tp, skb);
+ tcp_measure_rcv_mss(sk, skb);
tcp_rcv_rtt_measure(tp);
now = tcp_time_stamp;
- if (!tp->ack.ato) {
+ if (!icsk->icsk_ack.ato) {
/* The _first_ data packet received, initialize
* delayed ACK engine.
*/
- tcp_incr_quickack(tp);
- tp->ack.ato = TCP_ATO_MIN;
+ tcp_incr_quickack(sk);
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
} else {
- int m = now - tp->ack.lrcvtime;
+ int m = now - icsk->icsk_ack.lrcvtime;
if (m <= TCP_ATO_MIN/2) {
/* The fastest case is the first. */
- tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2;
- } else if (m < tp->ack.ato) {
- tp->ack.ato = (tp->ack.ato>>1) + m;
- if (tp->ack.ato > tp->rto)
- tp->ack.ato = tp->rto;
- } else if (m > tp->rto) {
+ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
+ } else if (m < icsk->icsk_ack.ato) {
+ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
+ if (icsk->icsk_ack.ato > icsk->icsk_rto)
+ icsk->icsk_ack.ato = icsk->icsk_rto;
+ } else if (m > icsk->icsk_rto) {
/* Too long gap. Apparently sender falled to
* restart window, so that we send ACKs quickly.
*/
- tcp_incr_quickack(tp);
+ tcp_incr_quickack(sk);
sk_stream_mem_reclaim(sk);
}
}
- tp->ack.lrcvtime = now;
+ icsk->icsk_ack.lrcvtime = now;
TCP_ECN_check_ce(tp, skb);
@@ -543,8 +548,10 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
-static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
+static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
long m = mrtt; /* RTT */
/* The following amusing code comes from Jacobson's
@@ -604,15 +611,16 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
tp->rtt_seq = tp->snd_nxt;
}
- if (tp->ca_ops->rtt_sample)
- tp->ca_ops->rtt_sample(tp, *usrtt);
+ if (icsk->icsk_ca_ops->rtt_sample)
+ icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
-static inline void tcp_set_rto(struct tcp_sock *tp)
+static inline void tcp_set_rto(struct sock *sk)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
/* Old crap is replaced with new one. 8)
*
* More seriously:
@@ -623,7 +631,7 @@ static inline void tcp_set_rto(struct tcp_sock *tp)
* is invisible. Actually, Linux-2.4 also generates erratic
* ACKs in some curcumstances.
*/
- tp->rto = (tp->srtt >> 3) + tp->rttvar;
+ inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
/* 2. Fixups made earlier cannot be right.
* If we do not estimate RTO correctly without them,
@@ -635,10 +643,10 @@ static inline void tcp_set_rto(struct tcp_sock *tp)
/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
* guarantees that rto is higher.
*/
-static inline void tcp_bound_rto(struct tcp_sock *tp)
+static inline void tcp_bound_rto(struct sock *sk)
{
- if (tp->rto > TCP_RTO_MAX)
- tp->rto = TCP_RTO_MAX;
+ if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
+ inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
}
/* Save metrics learned by this TCP session.
@@ -656,9 +664,10 @@ void tcp_update_metrics(struct sock *sk)
dst_confirm(dst);
if (dst && (dst->flags&DST_HOST)) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
int m;
- if (tp->backoff || !tp->srtt) {
+ if (icsk->icsk_backoff || !tp->srtt) {
/* This session failed to estimate rtt. Why?
* Probably, no packets returned in time.
* Reset our results.
@@ -707,7 +716,7 @@ void tcp_update_metrics(struct sock *sk)
tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
} else if (tp->snd_cwnd > tp->snd_ssthresh &&
- tp->ca_state == TCP_CA_Open) {
+ icsk->icsk_ca_state == TCP_CA_Open) {
/* Cong. avoidance phase, cwnd is reliable. */
if (!dst_metric_locked(dst, RTAX_SSTHRESH))
dst->metrics[RTAX_SSTHRESH-1] =
@@ -801,9 +810,9 @@ static void tcp_init_metrics(struct sock *sk)
tp->mdev = dst_metric(dst, RTAX_RTTVAR);
tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
}
- tcp_set_rto(tp);
- tcp_bound_rto(tp);
- if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
+ tcp_set_rto(sk);
+ tcp_bound_rto(sk);
+ if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
goto reset;
tp->snd_cwnd = tcp_init_cwnd(tp, dst);
tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -817,12 +826,14 @@ reset:
if (!tp->rx_opt.saw_tstamp && tp->srtt) {
tp->srtt = 0;
tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
- tp->rto = TCP_TIMEOUT_INIT;
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
}
}
-static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
+static void tcp_update_reordering(struct sock *sk, const int metric,
+ const int ts)
{
+ struct tcp_sock *tp = tcp_sk(sk);
if (metric > tp->reordering) {
tp->reordering = min(TCP_MAX_REORDERING, metric);
@@ -837,7 +848,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
#if FASTRETRANS_DEBUG > 1
printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
- tp->rx_opt.sack_ok, tp->ca_state,
+ tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
tp->reordering,
tp->fackets_out,
tp->sacked_out,
@@ -899,6 +910,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
static int
tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2);
@@ -909,14 +921,6 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
int flag = 0;
int i;
- /* So, SACKs for already sent large segments will be lost.
- * Not good, but alternative is to resegment the queue. */
- if (sk->sk_route_caps & NETIF_F_TSO) {
- sk->sk_route_caps &= ~NETIF_F_TSO;
- sock_set_flag(sk, SOCK_NO_LARGESEND);
- tp->mss_cache = tp->mss_cache;
- }
-
if (!tp->sacked_out)
tp->fackets_out = 0;
prior_fackets = tp->fackets_out;
@@ -964,20 +968,42 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
flag |= FLAG_DATA_LOST;
sk_stream_for_retrans_queue(skb, sk) {
- u8 sacked = TCP_SKB_CB(skb)->sacked;
- int in_sack;
+ int in_sack, pcount;
+ u8 sacked;
/* The retransmission queue is always in order, so
* we can short-circuit the walk early.
*/
- if(!before(TCP_SKB_CB(skb)->seq, end_seq))
+ if (!before(TCP_SKB_CB(skb)->seq, end_seq))
break;
- fack_count += tcp_skb_pcount(skb);
-
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
!before(end_seq, TCP_SKB_CB(skb)->end_seq);
+ pcount = tcp_skb_pcount(skb);
+
+ if (pcount > 1 && !in_sack &&
+ after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
+ unsigned int pkt_len;
+
+ in_sack = !after(start_seq,
+ TCP_SKB_CB(skb)->seq);
+
+ if (!in_sack)
+ pkt_len = (start_seq -
+ TCP_SKB_CB(skb)->seq);
+ else
+ pkt_len = (end_seq -
+ TCP_SKB_CB(skb)->seq);
+ if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size))
+ break;
+ pcount = tcp_skb_pcount(skb);
+ }
+
+ fack_count += pcount;
+
+ sacked = TCP_SKB_CB(skb)->sacked;
+
/* Account D-SACK for retransmitted packet. */
if ((dup_sack && in_sack) &&
(sacked & TCPCB_RETRANS) &&
@@ -1064,7 +1090,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
* we have to account for reordering! Ugly,
* but should help.
*/
- if (lost_retrans && tp->ca_state == TCP_CA_Recovery) {
+ if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
struct sk_buff *skb;
sk_stream_for_retrans_queue(skb, sk) {
@@ -1093,8 +1119,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
tp->left_out = tp->sacked_out + tp->lost_out;
- if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss)
- tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0);
+ if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss)
+ tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
#if FASTRETRANS_DEBUG > 0
BUG_TRAP((int)tp->sacked_out >= 0);
@@ -1111,17 +1137,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
*/
void tcp_enter_frto(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
tp->frto_counter = 1;
- if (tp->ca_state <= TCP_CA_Disorder ||
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
tp->snd_una == tp->high_seq ||
- (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
- tp->prior_ssthresh = tcp_current_ssthresh(tp);
- tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
- tcp_ca_event(tp, CA_EVENT_FRTO);
+ (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+ tcp_ca_event(sk, CA_EVENT_FRTO);
}
/* Have to clear retransmission markers here to keep the bookkeeping
@@ -1138,7 +1165,7 @@ void tcp_enter_frto(struct sock *sk)
}
tcp_sync_left_out(tp);
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_set_ca_state(sk, TCP_CA_Open);
tp->frto_highmark = tp->snd_nxt;
}
@@ -1184,7 +1211,7 @@ static void tcp_enter_frto_loss(struct sock *sk)
tp->reordering = min_t(unsigned int, tp->reordering,
sysctl_tcp_reordering);
- tcp_set_ca_state(tp, TCP_CA_Loss);
+ tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->frto_highmark;
TCP_ECN_queue_cwr(tp);
}
@@ -1208,16 +1235,17 @@ void tcp_clear_retrans(struct tcp_sock *tp)
*/
void tcp_enter_loss(struct sock *sk, int how)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int cnt = 0;
/* Reduce ssthresh if it has not yet been made inside this window. */
- if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
- (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
- tp->prior_ssthresh = tcp_current_ssthresh(tp);
- tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
- tcp_ca_event(tp, CA_EVENT_LOSS);
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
+ (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+ tcp_ca_event(sk, CA_EVENT_LOSS);
}
tp->snd_cwnd = 1;
tp->snd_cwnd_cnt = 0;
@@ -1248,12 +1276,12 @@ void tcp_enter_loss(struct sock *sk, int how)
tp->reordering = min_t(unsigned int, tp->reordering,
sysctl_tcp_reordering);
- tcp_set_ca_state(tp, TCP_CA_Loss);
+ tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp);
}
-static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
+static int tcp_check_sack_reneging(struct sock *sk)
{
struct sk_buff *skb;
@@ -1265,12 +1293,14 @@ static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
*/
if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
tcp_enter_loss(sk, 1);
- tp->retransmits++;
+ icsk->icsk_retransmits++;
tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ icsk->icsk_rto, TCP_RTO_MAX);
return 1;
}
return 0;
@@ -1281,15 +1311,15 @@ static inline int tcp_fackets_out(struct tcp_sock *tp)
return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
}
-static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb)
+static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
{
- return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto);
+ return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
}
static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
{
return tp->packets_out &&
- tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue));
+ tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue));
}
/* Linux NewReno/SACK/FACK/ECN state machine.
@@ -1423,8 +1453,9 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
* in assumption of absent reordering, interpret this as reordering.
* The only another reason could be bug in receiver TCP.
*/
-static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
+static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
+ struct tcp_sock *tp = tcp_sk(sk);
u32 holes;
holes = max(tp->lost_out, 1U);
@@ -1432,16 +1463,17 @@ static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
if ((tp->sacked_out + holes) > tp->packets_out) {
tp->sacked_out = tp->packets_out - holes;
- tcp_update_reordering(tp, tp->packets_out+addend, 0);
+ tcp_update_reordering(sk, tp->packets_out + addend, 0);
}
}
/* Emulate SACKs for SACKless connection: account for a new dupack. */
-static void tcp_add_reno_sack(struct tcp_sock *tp)
+static void tcp_add_reno_sack(struct sock *sk)
{
+ struct tcp_sock *tp = tcp_sk(sk);
tp->sacked_out++;
- tcp_check_reno_reordering(tp, 0);
+ tcp_check_reno_reordering(sk, 0);
tcp_sync_left_out(tp);
}
@@ -1456,7 +1488,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acke
else
tp->sacked_out -= acked-1;
}
- tcp_check_reno_reordering(tp, acked);
+ tcp_check_reno_reordering(sk, acked);
tcp_sync_left_out(tp);
}
@@ -1509,7 +1541,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
struct sk_buff *skb;
sk_stream_for_retrans_queue(skb, sk) {
- if (tcp_skb_timedout(tp, skb) &&
+ if (tcp_skb_timedout(sk, skb) &&
!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
@@ -1530,14 +1562,16 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
}
/* Decrease cwnd each second ack. */
-static void tcp_cwnd_down(struct tcp_sock *tp)
+static void tcp_cwnd_down(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
int decr = tp->snd_cwnd_cnt + 1;
tp->snd_cwnd_cnt = decr&1;
decr >>= 1;
- if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp))
+ if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk))
tp->snd_cwnd -= decr;
tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1571,11 +1605,15 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
#define DBGUNDO(x...) do { } while (0)
#endif
-static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
+static void tcp_undo_cwr(struct sock *sk, const int undo)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+
if (tp->prior_ssthresh) {
- if (tp->ca_ops->undo_cwnd)
- tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->undo_cwnd)
+ tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
else
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
@@ -1603,9 +1641,9 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
/* Happy end! We did not retransmit anything
* or our original transmission succeeded.
*/
- DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans");
- tcp_undo_cwr(tp, 1);
- if (tp->ca_state == TCP_CA_Loss)
+ DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
+ tcp_undo_cwr(sk, 1);
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
else
NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
@@ -1618,7 +1656,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
tcp_moderate_cwnd(tp);
return 1;
}
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_set_ca_state(sk, TCP_CA_Open);
return 0;
}
@@ -1627,7 +1665,7 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp)
{
if (tp->undo_marker && !tp->undo_retrans) {
DBGUNDO(sk, tp, "D-SACK");
- tcp_undo_cwr(tp, 1);
+ tcp_undo_cwr(sk, 1);
tp->undo_marker = 0;
NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
}
@@ -1648,10 +1686,10 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
if (tp->retrans_out == 0)
tp->retrans_stamp = 0;
- tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1);
+ tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
DBGUNDO(sk, tp, "Hoe");
- tcp_undo_cwr(tp, 0);
+ tcp_undo_cwr(sk, 0);
NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);
/* So... Do not make Hoe's retransmit yet.
@@ -1674,22 +1712,23 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
DBGUNDO(sk, tp, "partial loss");
tp->lost_out = 0;
tp->left_out = tp->sacked_out;
- tcp_undo_cwr(tp, 1);
+ tcp_undo_cwr(sk, 1);
NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
- tp->retransmits = 0;
+ inet_csk(sk)->icsk_retransmits = 0;
tp->undo_marker = 0;
if (!IsReno(tp))
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_set_ca_state(sk, TCP_CA_Open);
return 1;
}
return 0;
}
-static inline void tcp_complete_cwr(struct tcp_sock *tp)
+static inline void tcp_complete_cwr(struct sock *sk)
{
+ struct tcp_sock *tp = tcp_sk(sk);
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
tp->snd_cwnd_stamp = tcp_time_stamp;
- tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR);
+ tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}
static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1700,21 +1739,21 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
tp->retrans_stamp = 0;
if (flag&FLAG_ECE)
- tcp_enter_cwr(tp);
+ tcp_enter_cwr(sk);
- if (tp->ca_state != TCP_CA_CWR) {
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
int state = TCP_CA_Open;
if (tp->left_out || tp->retrans_out || tp->undo_marker)
state = TCP_CA_Disorder;
- if (tp->ca_state != state) {
- tcp_set_ca_state(tp, state);
+ if (inet_csk(sk)->icsk_ca_state != state) {
+ tcp_set_ca_state(sk, state);
tp->high_seq = tp->snd_nxt;
}
tcp_moderate_cwnd(tp);
} else {
- tcp_cwnd_down(tp);
+ tcp_cwnd_down(sk);
}
}
@@ -1733,6 +1772,7 @@ static void
tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
int prior_packets, int flag)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP));
@@ -1750,13 +1790,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs. */
- if (tp->sacked_out && tcp_check_sack_reneging(sk, tp))
+ if (tp->sacked_out && tcp_check_sack_reneging(sk))
return;
/* C. Process data loss notification, provided it is valid. */
if ((flag&FLAG_DATA_LOST) &&
before(tp->snd_una, tp->high_seq) &&
- tp->ca_state != TCP_CA_Open &&
+ icsk->icsk_ca_state != TCP_CA_Open &&
tp->fackets_out > tp->reordering) {
tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq);
NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
@@ -1767,14 +1807,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
/* E. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
- if (tp->ca_state == TCP_CA_Open) {
+ if (icsk->icsk_ca_state == TCP_CA_Open) {
if (!sysctl_tcp_frto)
BUG_TRAP(tp->retrans_out == 0);
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
- switch (tp->ca_state) {
+ switch (icsk->icsk_ca_state) {
case TCP_CA_Loss:
- tp->retransmits = 0;
+ icsk->icsk_retransmits = 0;
if (tcp_try_undo_recovery(sk, tp))
return;
break;
@@ -1783,8 +1823,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
/* CWR is to be held something *above* high_seq
* is ACKed for CWR bit to reach receiver. */
if (tp->snd_una != tp->high_seq) {
- tcp_complete_cwr(tp);
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_complete_cwr(sk);
+ tcp_set_ca_state(sk, TCP_CA_Open);
}
break;
@@ -1795,7 +1835,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
* catching for all duplicate ACKs. */
IsReno(tp) || tp->snd_una != tp->high_seq) {
tp->undo_marker = 0;
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_set_ca_state(sk, TCP_CA_Open);
}
break;
@@ -1804,17 +1844,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
tcp_reset_reno_sack(tp);
if (tcp_try_undo_recovery(sk, tp))
return;
- tcp_complete_cwr(tp);
+ tcp_complete_cwr(sk);
break;
}
}
/* F. Process state. */
- switch (tp->ca_state) {
+ switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
if (prior_snd_una == tp->snd_una) {
if (IsReno(tp) && is_dupack)
- tcp_add_reno_sack(tp);
+ tcp_add_reno_sack(sk);
} else {
int acked = prior_packets - tp->packets_out;
if (IsReno(tp))
@@ -1824,13 +1864,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
break;
case TCP_CA_Loss:
if (flag&FLAG_DATA_ACKED)
- tp->retransmits = 0;
+ icsk->icsk_retransmits = 0;
if (!tcp_try_undo_loss(sk, tp)) {
tcp_moderate_cwnd(tp);
tcp_xmit_retransmit_queue(sk);
return;
}
- if (tp->ca_state != TCP_CA_Open)
+ if (icsk->icsk_ca_state != TCP_CA_Open)
return;
/* Loss is undone; fall through to processing in Open state. */
default:
@@ -1838,10 +1878,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
if (tp->snd_una != prior_snd_una)
tcp_reset_reno_sack(tp);
if (is_dupack)
- tcp_add_reno_sack(tp);
+ tcp_add_reno_sack(sk);
}
- if (tp->ca_state == TCP_CA_Disorder)
+ if (icsk->icsk_ca_state == TCP_CA_Disorder)
tcp_try_undo_dsack(sk, tp);
if (!tcp_time_to_recover(sk, tp)) {
@@ -1861,30 +1901,28 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
tp->undo_marker = tp->snd_una;
tp->undo_retrans = tp->retrans_out;
- if (tp->ca_state < TCP_CA_CWR) {
+ if (icsk->icsk_ca_state < TCP_CA_CWR) {
if (!(flag&FLAG_ECE))
- tp->prior_ssthresh = tcp_current_ssthresh(tp);
- tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
TCP_ECN_queue_cwr(tp);
}
tp->snd_cwnd_cnt = 0;
- tcp_set_ca_state(tp, TCP_CA_Recovery);
+ tcp_set_ca_state(sk, TCP_CA_Recovery);
}
if (is_dupack || tcp_head_timedout(sk, tp))
tcp_update_scoreboard(sk, tp);
- tcp_cwnd_down(tp);
+ tcp_cwnd_down(sk);
tcp_xmit_retransmit_queue(sk);
}
/* Read draft-ietf-tcplw-high-performance before mucking
* with this code. (Superceeds RFC1323)
*/
-static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
+static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
{
- __u32 seq_rtt;
-
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
* acknowledges some new data, i.e., only if it advances the
@@ -1900,14 +1938,15 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
* answer arrives rto becomes 120 seconds! If at least one of segments
* in window is lost... Voila. --ANK (010210)
*/
- seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
- tcp_rtt_estimator(tp, seq_rtt, usrtt);
- tcp_set_rto(tp);
- tp->backoff = 0;
- tcp_bound_rto(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+ tcp_rtt_estimator(sk, seq_rtt, usrtt);
+ tcp_set_rto(sk);
+ inet_csk(sk)->icsk_backoff = 0;
+ tcp_bound_rto(sk);
}
-static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag)
+static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag)
{
/* We don't have a timestamp. Can only use
* packets that are not retransmitted to determine
@@ -1921,27 +1960,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int
if (flag & FLAG_RETRANS_DATA_ACKED)
return;
- tcp_rtt_estimator(tp, seq_rtt, usrtt);
- tcp_set_rto(tp);
- tp->backoff = 0;
- tcp_bound_rto(tp);
+ tcp_rtt_estimator(sk, seq_rtt, usrtt);
+ tcp_set_rto(sk);
+ inet_csk(sk)->icsk_backoff = 0;
+ tcp_bound_rto(sk);
}
-static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
- int flag, s32 seq_rtt, u32 *usrtt)
+static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
+ const s32 seq_rtt, u32 *usrtt)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
- tcp_ack_saw_tstamp(tp, usrtt, flag);
+ tcp_ack_saw_tstamp(sk, usrtt, flag);
else if (seq_rtt >= 0)
- tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag);
+ tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag);
}
-static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
u32 in_flight, int good)
{
- tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good);
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
+ tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
}
/* Restart timer after forward progress on connection.
@@ -1951,9 +1992,9 @@ static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
{
if (!tp->packets_out) {
- tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS);
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
} else {
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}
}
@@ -2068,9 +2109,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
seq_rtt = -1;
} else if (seq_rtt < 0)
seq_rtt = now - scb->when;
- if (seq_usrtt)
- *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000
- + (usnow.tv_usec - skb->stamp.tv_usec);
+ if (seq_usrtt) {
+ struct timeval tv;
+
+ skb_get_timestamp(skb, &tv);
+ *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000
+ + (usnow.tv_usec - tv.tv_usec);
+ }
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= tcp_skb_pcount(skb);
@@ -2085,16 +2130,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
seq_rtt = now - scb->when;
tcp_dec_pcount_approx(&tp->fackets_out, skb);
tcp_packets_out_dec(tp, skb);
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &sk->sk_write_queue);
sk_stream_free_skb(sk, skb);
}
if (acked&FLAG_ACKED) {
- tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt);
tcp_ack_packets_out(sk, tp);
- if (tp->ca_ops->pkts_acked)
- tp->ca_ops->pkts_acked(tp, pkts_acked);
+ if (icsk->icsk_ca_ops->pkts_acked)
+ icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked);
}
#if FASTRETRANS_DEBUG > 0
@@ -2102,19 +2148,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
BUG_TRAP((int)tp->lost_out >= 0);
BUG_TRAP((int)tp->retrans_out >= 0);
if (!tp->packets_out && tp->rx_opt.sack_ok) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
if (tp->lost_out) {
printk(KERN_DEBUG "Leak l=%u %d\n",
- tp->lost_out, tp->ca_state);
+ tp->lost_out, icsk->icsk_ca_state);
tp->lost_out = 0;
}
if (tp->sacked_out) {
printk(KERN_DEBUG "Leak s=%u %d\n",
- tp->sacked_out, tp->ca_state);
+ tp->sacked_out, icsk->icsk_ca_state);
tp->sacked_out = 0;
}
if (tp->retrans_out) {
printk(KERN_DEBUG "Leak r=%u %d\n",
- tp->retrans_out, tp->ca_state);
+ tp->retrans_out, icsk->icsk_ca_state);
tp->retrans_out = 0;
}
}
@@ -2125,40 +2172,43 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
static void tcp_ack_probe(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
/* Was it a usable window open? */
if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
tp->snd_una + tp->snd_wnd)) {
- tp->backoff = 0;
- tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0);
+ icsk->icsk_backoff = 0;
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
/* Socket must be waked up by subsequent tcp_data_snd_check().
* This function is not for random using!
*/
} else {
- tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
- min(tp->rto << tp->backoff, TCP_RTO_MAX));
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+ TCP_RTO_MAX);
}
}
-static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag)
+static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
- tp->ca_state != TCP_CA_Open);
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
}
-static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag)
+static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
- !((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR));
+ !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
}
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
-static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack,
- u32 ack_seq, u32 nwin)
+static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
+ const u32 ack_seq, const u32 nwin)
{
return (after(ack, tp->snd_una) ||
after(ack_seq, tp->snd_wl1) ||
@@ -2189,6 +2239,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
/* Note, it is the only place, where
* fast path is recovered for sending TCP.
*/
+ tp->pred_flags = 0;
tcp_fast_path_check(sk, tp);
if (nwin > tp->max_window) {
@@ -2241,6 +2292,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
@@ -2268,7 +2320,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
tp->snd_una = ack;
flag |= FLAG_WIN_UPDATE;
- tcp_ca_event(tp, CA_EVENT_FAST_ACK);
+ tcp_ca_event(sk, CA_EVENT_FAST_ACK);
NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
} else {
@@ -2285,7 +2337,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
flag |= FLAG_ECE;
- tcp_ca_event(tp, CA_EVENT_SLOW_ACK);
+ tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
}
/* We passed data and got it acked, remove any soft error
@@ -2301,19 +2353,19 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
- tp->ca_ops->rtt_sample ? &seq_usrtt : NULL);
+ icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL);
if (tp->frto_counter)
tcp_process_frto(sk, prior_snd_una);
- if (tcp_ack_is_dubious(tp, flag)) {
+ if (tcp_ack_is_dubious(sk, flag)) {
/* Advanve CWND, if state allows this. */
- if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag))
- tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0);
+ if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
+ tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
} else {
if ((flag & FLAG_DATA_ACKED))
- tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1);
+ tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
}
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -2322,7 +2374,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
return 1;
no_queue:
- tp->probes_out = 0;
+ icsk->icsk_probes_out = 0;
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
@@ -2500,8 +2552,9 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
* up to bandwidth of 18Gigabit/sec. 8) ]
*/
-static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
+static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
{
+ struct tcp_sock *tp = tcp_sk(sk);
struct tcphdr *th = skb->h.th;
u32 seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
@@ -2516,14 +2569,15 @@ static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
/* 4. ... and sits in replay window. */
- (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ);
+ (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
}
-static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb)
+static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
- !tcp_disordered_ack(tp, skb));
+ !tcp_disordered_ack(sk, skb));
}
/* Check segment sequence number for validity.
@@ -2586,7 +2640,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
- tcp_schedule_ack(tp);
+ inet_csk_schedule_ack(sk);
sk->sk_shutdown |= RCV_SHUTDOWN;
sock_set_flag(sk, SOCK_DONE);
@@ -2596,7 +2650,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
- tp->ack.pingpong = 1;
+ inet_csk(sk)->icsk_ack.pingpong = 1;
break;
case TCP_CLOSE_WAIT:
@@ -2694,7 +2748,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
- tcp_enter_quickack_mode(tp);
+ tcp_enter_quickack_mode(sk);
if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -2853,7 +2907,7 @@ static void tcp_ofo_queue(struct sock *sk)
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
SOCK_DEBUG(sk, "ofo packet was already received \n");
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &tp->out_of_order_queue);
__kfree_skb(skb);
continue;
}
@@ -2861,7 +2915,7 @@ static void tcp_ofo_queue(struct sock *sk)
tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->end_seq);
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &tp->out_of_order_queue);
__skb_queue_tail(&sk->sk_receive_queue, skb);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if(skb->h.th->fin)
@@ -2942,7 +2996,7 @@ queue_and_out:
* gap in queue is filled.
*/
if (skb_queue_empty(&tp->out_of_order_queue))
- tp->ack.pingpong = 0;
+ inet_csk(sk)->icsk_ack.pingpong = 0;
}
if (tp->rx_opt.num_sacks)
@@ -2963,8 +3017,8 @@ queue_and_out:
tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
out_of_window:
- tcp_enter_quickack_mode(tp);
- tcp_schedule_ack(tp);
+ tcp_enter_quickack_mode(sk);
+ inet_csk_schedule_ack(sk);
drop:
__kfree_skb(skb);
return;
@@ -2974,7 +3028,7 @@ drop:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
goto out_of_window;
- tcp_enter_quickack_mode(tp);
+ tcp_enter_quickack_mode(sk);
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
@@ -3003,7 +3057,7 @@ drop:
/* Disable header prediction. */
tp->pred_flags = 0;
- tcp_schedule_ack(tp);
+ inet_csk_schedule_ack(sk);
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -3027,7 +3081,7 @@ drop:
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (seq == TCP_SKB_CB(skb1)->end_seq) {
- __skb_append(skb1, skb);
+ __skb_append(skb1, skb, &tp->out_of_order_queue);
if (!tp->rx_opt.num_sacks ||
tp->selective_acks[0].end_seq != seq)
@@ -3071,7 +3125,7 @@ drop:
tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
break;
}
- __skb_unlink(skb1, skb1->list);
+ __skb_unlink(skb1, &tp->out_of_order_queue);
tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
__kfree_skb(skb1);
}
@@ -3088,8 +3142,9 @@ add_sack:
* simplifies code)
*/
static void
-tcp_collapse(struct sock *sk, struct sk_buff *head,
- struct sk_buff *tail, u32 start, u32 end)
+tcp_collapse(struct sock *sk, struct sk_buff_head *list,
+ struct sk_buff *head, struct sk_buff *tail,
+ u32 start, u32 end)
{
struct sk_buff *skb;
@@ -3099,7 +3154,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
/* No new bits? It is possible on ofo queue. */
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
struct sk_buff *next = skb->next;
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, list);
__kfree_skb(skb);
NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
skb = next;
@@ -3145,7 +3200,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
- __skb_insert(nskb, skb->prev, skb, skb->list);
+ __skb_insert(nskb, skb->prev, skb, list);
sk_stream_set_owner_r(nskb, sk);
/* Copy data, releasing collapsed skbs. */
@@ -3164,7 +3219,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
}
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
struct sk_buff *next = skb->next;
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, list);
__kfree_skb(skb);
NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
skb = next;
@@ -3200,7 +3255,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
after(TCP_SKB_CB(skb)->seq, end) ||
before(TCP_SKB_CB(skb)->end_seq, start)) {
- tcp_collapse(sk, head, skb, start, end);
+ tcp_collapse(sk, &tp->out_of_order_queue,
+ head, skb, start, end);
head = skb;
if (skb == (struct sk_buff *)&tp->out_of_order_queue)
break;
@@ -3237,7 +3293,8 @@ static int tcp_prune_queue(struct sock *sk)
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
tcp_collapse_ofo_queue(sk);
- tcp_collapse(sk, sk->sk_receive_queue.next,
+ tcp_collapse(sk, &sk->sk_receive_queue,
+ sk->sk_receive_queue.next,
(struct sk_buff*)&sk->sk_receive_queue,
tp->copied_seq, tp->rcv_nxt);
sk_stream_mem_reclaim(sk);
@@ -3286,12 +3343,12 @@ void tcp_cwnd_application_limited(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->ca_state == TCP_CA_Open &&
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
/* Limited by application or receiver window. */
u32 win_used = max(tp->snd_cwnd_used, 2U);
if (win_used < tp->snd_cwnd) {
- tp->snd_ssthresh = tcp_current_ssthresh(tp);
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
}
tp->snd_cwnd_used = 0;
@@ -3370,13 +3427,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
struct tcp_sock *tp = tcp_sk(sk);
/* More than one full frame received... */
- if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss
+ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss
/* ... and right edge of window advances far enough.
* (tcp_recvmsg() will send ACK otherwise). Or...
*/
&& __tcp_select_window(sk) >= tp->rcv_wnd) ||
/* We ACK each frame or... */
- tcp_in_quickack_mode(tp) ||
+ tcp_in_quickack_mode(sk) ||
/* We have out of order data. */
(ofo_possible &&
skb_peek(&tp->out_of_order_queue))) {
@@ -3390,8 +3447,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
static __inline__ void tcp_ack_snd_check(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- if (!tcp_ack_scheduled(tp)) {
+ if (!inet_csk_ack_scheduled(sk)) {
/* We sent a data segment already. */
return;
}
@@ -3462,7 +3518,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
tp->copied_seq++;
if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &sk->sk_receive_queue);
__kfree_skb(skb);
}
}
@@ -3645,7 +3701,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
- tcp_rcv_rtt_measure_ts(tp, skb);
+ tcp_rcv_rtt_measure_ts(sk, skb);
/* We know that such packets are checksummed
* on entry.
@@ -3678,7 +3734,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
- tcp_rcv_rtt_measure_ts(tp, skb);
+ tcp_rcv_rtt_measure_ts(sk, skb);
__skb_pull(skb, tcp_header_len);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
@@ -3699,7 +3755,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
- tcp_rcv_rtt_measure_ts(tp, skb);
+ tcp_rcv_rtt_measure_ts(sk, skb);
if ((int)skb->truesize > sk->sk_forward_alloc)
goto step5;
@@ -3719,7 +3775,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
/* Well, only one small jumplet in fast path... */
tcp_ack(sk, skb, FLAG_DATA);
tcp_data_snd_check(sk, tp);
- if (!tcp_ack_scheduled(tp))
+ if (!inet_csk_ack_scheduled(sk))
goto no_ack;
}
@@ -3741,7 +3797,7 @@ slow_path:
* RFC1323: H1. Apply PAWS check first.
*/
if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
- tcp_paws_discard(tp, skb)) {
+ tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
tcp_send_dupack(sk, skb);
@@ -3788,7 +3844,7 @@ step5:
if(th->ack)
tcp_ack(sk, skb, FLAG_SLOWPATH);
- tcp_rcv_rtt_measure_ts(tp, skb);
+ tcp_rcv_rtt_measure_ts(sk, skb);
/* Process urgent data. */
tcp_urg(sk, skb, th);
@@ -3817,6 +3873,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_parse_options(skb, &tp->rx_opt, 0);
if (th->ack) {
+ struct inet_connection_sock *icsk;
/* rfc793:
* "If the state is SYN-SENT then
* first check the ACK bit
@@ -3920,7 +3977,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_init_metrics(sk);
- tcp_init_congestion_control(tp);
+ tcp_init_congestion_control(sk);
/* Prevent spurious tcp_cwnd_restart() on first data
* packet.
@@ -3930,7 +3987,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_init_buffer_space(sk);
if (sock_flag(sk, SOCK_KEEPOPEN))
- tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
if (!tp->rx_opt.snd_wscale)
__tcp_fast_path_on(tp, tp->snd_wnd);
@@ -3942,7 +3999,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
sk_wake_async(sk, 0, POLL_OUT);
}
- if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) {
+ icsk = inet_csk(sk);
+
+ if (sk->sk_write_pending ||
+ icsk->icsk_accept_queue.rskq_defer_accept ||
+ icsk->icsk_ack.pingpong) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
*
@@ -3950,12 +4011,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* look so _wonderfully_ clever, that I was not able
* to stand against the temptation 8) --ANK
*/
- tcp_schedule_ack(tp);
- tp->ack.lrcvtime = tcp_time_stamp;
- tp->ack.ato = TCP_ATO_MIN;
- tcp_incr_quickack(tp);
- tcp_enter_quickack_mode(tp);
- tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+ inet_csk_schedule_ack(sk);
+ icsk->icsk_ack.lrcvtime = tcp_time_stamp;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ tcp_incr_quickack(sk);
+ tcp_enter_quickack_mode(sk);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX, TCP_RTO_MAX);
discard:
__kfree_skb(skb);
@@ -4111,7 +4173,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
}
if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
- tcp_paws_discard(tp, skb)) {
+ tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
tcp_send_dupack(sk, skb);
@@ -4180,7 +4242,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!tp->srtt)
- tcp_ack_saw_tstamp(tp, 0, 0);
+ tcp_ack_saw_tstamp(sk, NULL, 0);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4192,7 +4254,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tcp_init_metrics(sk);
- tcp_init_congestion_control(tp);
+ tcp_init_congestion_control(sk);
/* Prevent spurious tcp_cwnd_restart() on
* first data packet.
@@ -4227,9 +4289,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
return 1;
}
- tmo = tcp_fin_time(tp);
+ tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
- tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+ inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
} else if (th->fin || sock_owned_by_user(sk)) {
/* Bad case. We could lose such FIN otherwise.
* It is not a big problem, but it looks confusing
@@ -4237,7 +4299,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* if it spins in bh_lock_sock(), but it is really
* marginal case.
*/
- tcp_reset_keepalive_timer(sk, tmo);
+ inet_csk_reset_keepalive_timer(sk, tmo);
} else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto discard;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 62f62bb05c2..c85819d8474 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -64,7 +64,9 @@
#include <linux/times.h>
#include <net/icmp.h>
+#include <net/inet_hashtables.h>
#include <net/tcp.h>
+#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/xfrm.h>
@@ -75,7 +77,6 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;
@@ -88,458 +89,29 @@ static struct socket *tcp_socket;
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
struct sk_buff *skb);
-struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
- .__tcp_lhash_lock = RW_LOCK_UNLOCKED,
- .__tcp_lhash_users = ATOMIC_INIT(0),
- .__tcp_lhash_wait
- = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
- .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
+struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
+ .lhash_lock = RW_LOCK_UNLOCKED,
+ .lhash_users = ATOMIC_INIT(0),
+ .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
+ .portalloc_lock = SPIN_LOCK_UNLOCKED,
+ .port_rover = 1024 - 1,
};
-/*
- * This array holds the first and last local port number.
- * For high-usage systems, use sysctl to change this to
- * 32768-61000
- */
-int sysctl_local_port_range[2] = { 1024, 4999 };
-int tcp_port_rover = 1024 - 1;
-
-static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
- __u32 faddr, __u16 fport)
-{
- int h = (laddr ^ lport) ^ (faddr ^ fport);
- h ^= h >> 16;
- h ^= h >> 8;
- return h & (tcp_ehash_size - 1);
-}
-
-static __inline__ int tcp_sk_hashfn(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- __u32 laddr = inet->rcv_saddr;
- __u16 lport = inet->num;
- __u32 faddr = inet->daddr;
- __u16 fport = inet->dport;
-
- return tcp_hashfn(laddr, lport, faddr, fport);
-}
-
-/* Allocate and initialize a new TCP local port bind bucket.
- * The bindhash mutex for snum's hash chain must be held here.
- */
-struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
- unsigned short snum)
-{
- struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
- SLAB_ATOMIC);
- if (tb) {
- tb->port = snum;
- tb->fastreuse = 0;
- INIT_HLIST_HEAD(&tb->owners);
- hlist_add_head(&tb->node, &head->chain);
- }
- return tb;
-}
-
-/* Caller must hold hashbucket lock for this tb with local BH disabled */
-void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
-{
- if (hlist_empty(&tb->owners)) {
- __hlist_del(&tb->node);
- kmem_cache_free(tcp_bucket_cachep, tb);
- }
-}
-
-/* Caller must disable local BH processing. */
-static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
-{
- struct tcp_bind_hashbucket *head =
- &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
- struct tcp_bind_bucket *tb;
-
- spin_lock(&head->lock);
- tb = tcp_sk(sk)->bind_hash;
- sk_add_bind_node(child, &tb->owners);
- tcp_sk(child)->bind_hash = tb;
- spin_unlock(&head->lock);
-}
-
-inline void tcp_inherit_port(struct sock *sk, struct sock *child)
-{
- local_bh_disable();
- __tcp_inherit_port(sk, child);
- local_bh_enable();
-}
-
-void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
- unsigned short snum)
-{
- inet_sk(sk)->num = snum;
- sk_add_bind_node(sk, &tb->owners);
- tcp_sk(sk)->bind_hash = tb;
-}
-
-static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
-{
- const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
- struct sock *sk2;
- struct hlist_node *node;
- int reuse = sk->sk_reuse;
-
- sk_for_each_bound(sk2, node, &tb->owners) {
- if (sk != sk2 &&
- !tcp_v6_ipv6only(sk2) &&
- (!sk->sk_bound_dev_if ||
- !sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if (!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) {
- const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
- if (!sk2_rcv_saddr || !sk_rcv_saddr ||
- sk2_rcv_saddr == sk_rcv_saddr)
- break;
- }
- }
- }
- return node != NULL;
-}
-
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
- struct tcp_bind_hashbucket *head;
- struct hlist_node *node;
- struct tcp_bind_bucket *tb;
- int ret;
-
- local_bh_disable();
- if (!snum) {
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- int remaining = (high - low) + 1;
- int rover;
-
- spin_lock(&tcp_portalloc_lock);
- if (tcp_port_rover < low)
- rover = low;
- else
- rover = tcp_port_rover;
- do {
- rover++;
- if (rover > high)
- rover = low;
- head = &tcp_bhash[tcp_bhashfn(rover)];
- spin_lock(&head->lock);
- tb_for_each(tb, node, &head->chain)
- if (tb->port == rover)
- goto next;
- break;
- next:
- spin_unlock(&head->lock);
- } while (--remaining > 0);
- tcp_port_rover = rover;
- spin_unlock(&tcp_portalloc_lock);
-
- /* Exhausted local port range during search? */
- ret = 1;
- if (remaining <= 0)
- goto fail;
-
- /* OK, here is the one we will use. HEAD is
- * non-NULL and we hold it's mutex.
- */
- snum = rover;
- } else {
- head = &tcp_bhash[tcp_bhashfn(snum)];
- spin_lock(&head->lock);
- tb_for_each(tb, node, &head->chain)
- if (tb->port == snum)
- goto tb_found;
- }
- tb = NULL;
- goto tb_not_found;
-tb_found:
- if (!hlist_empty(&tb->owners)) {
- if (sk->sk_reuse > 1)
- goto success;
- if (tb->fastreuse > 0 &&
- sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
- goto success;
- } else {
- ret = 1;
- if (tcp_bind_conflict(sk, tb))
- goto fail_unlock;
- }
- }
-tb_not_found:
- ret = 1;
- if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
- goto fail_unlock;
- if (hlist_empty(&tb->owners)) {
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
- tb->fastreuse = 1;
- else
- tb->fastreuse = 0;
- } else if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
-success:
- if (!tcp_sk(sk)->bind_hash)
- tcp_bind_hash(sk, tb, snum);
- BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
- ret = 0;
-
-fail_unlock:
- spin_unlock(&head->lock);
-fail:
- local_bh_enable();
- return ret;
-}
-
-/* Get rid of any references to a local port held by the
- * given sock.
- */
-static void __tcp_put_port(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
- struct tcp_bind_bucket *tb;
-
- spin_lock(&head->lock);
- tb = tcp_sk(sk)->bind_hash;
- __sk_del_bind_node(sk);
- tcp_sk(sk)->bind_hash = NULL;
- inet->num = 0;
- tcp_bucket_destroy(tb);
- spin_unlock(&head->lock);
-}
-
-void tcp_put_port(struct sock *sk)
-{
- local_bh_disable();
- __tcp_put_port(sk);
- local_bh_enable();
-}
-
-/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
- * Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines (wake up each
- * exclusive lock release). It should be ifdefed really.
- */
-
-void tcp_listen_wlock(void)
-{
- write_lock(&tcp_lhash_lock);
-
- if (atomic_read(&tcp_lhash_users)) {
- DEFINE_WAIT(wait);
-
- for (;;) {
- prepare_to_wait_exclusive(&tcp_lhash_wait,
- &wait, TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&tcp_lhash_users))
- break;
- write_unlock_bh(&tcp_lhash_lock);
- schedule();
- write_lock_bh(&tcp_lhash_lock);
- }
-
- finish_wait(&tcp_lhash_wait, &wait);
- }
-}
-
-static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
-{
- struct hlist_head *list;
- rwlock_t *lock;
-
- BUG_TRAP(sk_unhashed(sk));
- if (listen_possible && sk->sk_state == TCP_LISTEN) {
- list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
- lock = &tcp_lhash_lock;
- tcp_listen_wlock();
- } else {
- list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
- lock = &tcp_ehash[sk->sk_hashent].lock;
- write_lock(lock);
- }
- __sk_add_node(sk, list);
- sock_prot_inc_use(sk->sk_prot);
- write_unlock(lock);
- if (listen_possible && sk->sk_state == TCP_LISTEN)
- wake_up(&tcp_lhash_wait);
+ return inet_csk_get_port(&tcp_hashinfo, sk, snum);
}
static void tcp_v4_hash(struct sock *sk)
{
- if (sk->sk_state != TCP_CLOSE) {
- local_bh_disable();
- __tcp_v4_hash(sk, 1);
- local_bh_enable();
- }
+ inet_hash(&tcp_hashinfo, sk);
}
void tcp_unhash(struct sock *sk)
{
- rwlock_t *lock;
-
- if (sk_unhashed(sk))
- goto ende;
-
- if (sk->sk_state == TCP_LISTEN) {
- local_bh_disable();
- tcp_listen_wlock();
- lock = &tcp_lhash_lock;
- } else {
- struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
- lock = &head->lock;
- write_lock_bh(&head->lock);
- }
-
- if (__sk_del_node_init(sk))
- sock_prot_dec_use(sk->sk_prot);
- write_unlock_bh(lock);
-
- ende:
- if (sk->sk_state == TCP_LISTEN)
- wake_up(&tcp_lhash_wait);
-}
-
-/* Don't inline this cruft. Here are some nice properties to
- * exploit here. The BSD API does not allow a listening TCP
- * to specify the remote port nor the remote address for the
- * connection. So always assume those are both wildcarded
- * during the search since they can never be otherwise.
- */
-static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
- unsigned short hnum, int dif)
-{
- struct sock *result = NULL, *sk;
- struct hlist_node *node;
- int score, hiscore;
-
- hiscore=-1;
- sk_for_each(sk, node, head) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (inet->num == hnum && !ipv6_only_sock(sk)) {
- __u32 rcv_saddr = inet->rcv_saddr;
-
- score = (sk->sk_family == PF_INET ? 1 : 0);
- if (rcv_saddr) {
- if (rcv_saddr != daddr)
- continue;
- score+=2;
- }
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- continue;
- score+=2;
- }
- if (score == 5)
- return sk;
- if (score > hiscore) {
- hiscore = score;
- result = sk;
- }
- }
- }
- return result;
-}
-
-/* Optimize the common listener case. */
-static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
- unsigned short hnum, int dif)
-{
- struct sock *sk = NULL;
- struct hlist_head *head;
-
- read_lock(&tcp_lhash_lock);
- head = &tcp_listening_hash[tcp_lhashfn(hnum)];
- if (!hlist_empty(head)) {
- struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
-
- if (inet->num == hnum && !sk->sk_node.next &&
- (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
- (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
- !sk->sk_bound_dev_if)
- goto sherry_cache;
- sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
- }
- if (sk) {
-sherry_cache:
- sock_hold(sk);
- }
- read_unlock(&tcp_lhash_lock);
- return sk;
-}
-
-/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
- * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
- *
- * Local BH must be disabled here.
- */
-
-static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
- u32 daddr, u16 hnum,
- int dif)
-{
- struct tcp_ehash_bucket *head;
- TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
- __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
- struct sock *sk;
- struct hlist_node *node;
- /* Optimize here for direct hit, only listening connections can
- * have wildcards anyways.
- */
- int hash = tcp_hashfn(daddr, hnum, saddr, sport);
- head = &tcp_ehash[hash];
- read_lock(&head->lock);
- sk_for_each(sk, node, &head->chain) {
- if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
- goto hit; /* You sunk my battleship! */
- }
-
- /* Must check for a TIME_WAIT'er before going to listener hash. */
- sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
- if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
- goto hit;
- }
- sk = NULL;
-out:
- read_unlock(&head->lock);
- return sk;
-hit:
- sock_hold(sk);
- goto out;
-}
-
-static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
- u32 daddr, u16 hnum, int dif)
-{
- struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
- daddr, hnum, dif);
-
- return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
-}
-
-inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
- u16 dport, int dif)
-{
- struct sock *sk;
-
- local_bh_disable();
- sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
- local_bh_enable();
-
- return sk;
+ inet_unhash(&tcp_hashinfo, sk);
}
-EXPORT_SYMBOL_GPL(tcp_v4_lookup);
-
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
return secure_tcp_sequence_number(skb->nh.iph->daddr,
@@ -550,27 +122,29 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
- struct tcp_tw_bucket **twp)
+ struct inet_timewait_sock **twp)
{
struct inet_sock *inet = inet_sk(sk);
u32 daddr = inet->rcv_saddr;
u32 saddr = inet->daddr;
int dif = sk->sk_bound_dev_if;
- TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
- __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
- int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
- struct tcp_ehash_bucket *head = &tcp_ehash[hash];
+ INET_ADDR_COOKIE(acookie, saddr, daddr)
+ const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+ unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
+ struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
struct sock *sk2;
- struct hlist_node *node;
- struct tcp_tw_bucket *tw;
+ const struct hlist_node *node;
+ struct inet_timewait_sock *tw;
+ prefetch(head->chain.first);
write_lock(&head->lock);
/* Check TIME-WAIT sockets first. */
- sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
- tw = (struct tcp_tw_bucket *)sk2;
+ sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
+ tw = inet_twsk(sk2);
- if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
+ if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
+ const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
struct tcp_sock *tp = tcp_sk(sk);
/* With PAWS, it is safe from the viewpoint
@@ -587,15 +161,15 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
fall back to VJ's scheme and use initial
timestamp retrieved from peer table.
*/
- if (tw->tw_ts_recent_stamp &&
+ if (tcptw->tw_ts_recent_stamp &&
(!twp || (sysctl_tcp_tw_reuse &&
xtime.tv_sec -
- tw->tw_ts_recent_stamp > 1))) {
- if ((tp->write_seq =
- tw->tw_snd_nxt + 65535 + 2) == 0)
+ tcptw->tw_ts_recent_stamp > 1))) {
+ tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+ if (tp->write_seq == 0)
tp->write_seq = 1;
- tp->rx_opt.ts_recent = tw->tw_ts_recent;
- tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+ tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
+ tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
sock_hold(sk2);
goto unique;
} else
@@ -606,7 +180,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
/* And established part... */
sk_for_each(sk2, node, &head->chain) {
- if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
+ if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
goto not_unique;
}
@@ -615,7 +189,7 @@ unique:
* in hash table socket with a funny identity. */
inet->num = lport;
inet->sport = htons(lport);
- sk->sk_hashent = hash;
+ sk->sk_hash = hash;
BUG_TRAP(sk_unhashed(sk));
__sk_add_node(sk, &head->chain);
sock_prot_inc_use(sk->sk_prot);
@@ -626,10 +200,10 @@ unique:
NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
} else if (tw) {
/* Silly. Should hash-dance instead... */
- tcp_tw_deschedule(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);
NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
}
return 0;
@@ -652,9 +226,9 @@ static inline u32 connect_port_offset(const struct sock *sk)
*/
static inline int tcp_v4_hash_connect(struct sock *sk)
{
- unsigned short snum = inet_sk(sk)->num;
- struct tcp_bind_hashbucket *head;
- struct tcp_bind_bucket *tb;
+ const unsigned short snum = inet_sk(sk)->num;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
int ret;
if (!snum) {
@@ -666,19 +240,19 @@ static inline int tcp_v4_hash_connect(struct sock *sk)
static u32 hint;
u32 offset = hint + connect_port_offset(sk);
struct hlist_node *node;
- struct tcp_tw_bucket *tw = NULL;
+ struct inet_timewait_sock *tw = NULL;
local_bh_disable();
for (i = 1; i <= range; i++) {
port = low + (i + offset) % range;
- head = &tcp_bhash[tcp_bhashfn(port)];
+ head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
spin_lock(&head->lock);
/* Does not bother with rcv_saddr checks,
* because the established check is already
* unique enough.
*/
- tb_for_each(tb, node, &head->chain) {
+ inet_bind_bucket_for_each(tb, node, &head->chain) {
if (tb->port == port) {
BUG_TRAP(!hlist_empty(&tb->owners));
if (tb->fastreuse >= 0)
@@ -691,7 +265,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk)
}
}
- tb = tcp_bucket_create(head, port);
+ tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
if (!tb) {
spin_unlock(&head->lock);
break;
@@ -710,27 +284,27 @@ ok:
hint += i;
/* Head lock still held and bh's disabled */
- tcp_bind_hash(sk, tb, port);
+ inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->sport = htons(port);
- __tcp_v4_hash(sk, 0);
+ __inet_hash(&tcp_hashinfo, sk, 0);
}
spin_unlock(&head->lock);
if (tw) {
- tcp_tw_deschedule(tw);
- tcp_tw_put(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);
+ inet_twsk_put(tw);
}
ret = 0;
goto out;
}
- head = &tcp_bhash[tcp_bhashfn(snum)];
- tb = tcp_sk(sk)->bind_hash;
+ head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- __tcp_v4_hash(sk, 0);
+ __inet_hash(&tcp_hashinfo, sk, 0);
spin_unlock_bh(&head->lock);
return 0;
} else {
@@ -793,7 +367,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tp->write_seq = 0;
}
- if (sysctl_tcp_tw_recycle &&
+ if (tcp_death_row.sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
struct inet_peer *peer = rt_get_peer(rt);
@@ -832,8 +406,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
goto failure;
/* OK, now commit destination to socket. */
- __sk_dst_set(sk, &rt->u.dst);
- tcp_v4_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->u.dst);
if (!tp->write_seq)
tp->write_seq = secure_tcp_sequence_number(inet->saddr,
@@ -859,53 +432,6 @@ failure:
return err;
}
-static __inline__ int tcp_v4_iif(struct sk_buff *skb)
-{
- return ((struct rtable *)skb->dst)->rt_iif;
-}
-
-static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
-{
- return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
-}
-
-static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
- struct request_sock ***prevp,
- __u16 rport,
- __u32 raddr, __u32 laddr)
-{
- struct listen_sock *lopt = tp->accept_queue.listen_opt;
- struct request_sock *req, **prev;
-
- for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
- (req = *prev) != NULL;
- prev = &req->dl_next) {
- const struct inet_request_sock *ireq = inet_rsk(req);
-
- if (ireq->rmt_port == rport &&
- ireq->rmt_addr == raddr &&
- ireq->loc_addr == laddr &&
- TCP_INET_FAMILY(req->rsk_ops->family)) {
- BUG_TRAP(!req->sk);
- *prevp = prev;
- break;
- }
- }
-
- return req;
-}
-
-static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct listen_sock *lopt = tp->accept_queue.listen_opt;
- u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
-
- reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
- tcp_synq_added(sk);
-}
-
-
/*
* This routine does path mtu discovery as defined in RFC1191.
*/
@@ -988,14 +514,14 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
return;
}
- sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
- th->source, tcp_v4_iif(skb));
+ sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
+ th->source, inet_iif(skb));
if (!sk) {
ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
return;
}
if (sk->sk_state == TCP_TIME_WAIT) {
- tcp_tw_put((struct tcp_tw_bucket *)sk);
+ inet_twsk_put((struct inet_timewait_sock *)sk);
return;
}
@@ -1049,8 +575,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
if (sock_owned_by_user(sk))
goto out;
- req = tcp_v4_search_req(tp, &prev, th->dest,
- iph->daddr, iph->saddr);
+ req = inet_csk_search_req(sk, &prev, th->dest,
+ iph->daddr, iph->saddr);
if (!req)
goto out;
@@ -1070,7 +596,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
* created socket, and POSIX does not want network
* errors returned from accept().
*/
- tcp_synq_drop(sk, req, prev);
+ inet_csk_reqsk_queue_drop(sk, req, prev);
goto out;
case TCP_SYN_SENT:
@@ -1240,12 +766,13 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
- struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
- tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
- tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
+ tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
@@ -1254,36 +781,6 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
req->ts_recent);
}
-static struct dst_entry* tcp_v4_route_req(struct sock *sk,
- struct request_sock *req)
-{
- struct rtable *rt;
- const struct inet_request_sock *ireq = inet_rsk(req);
- struct ip_options *opt = inet_rsk(req)->opt;
- struct flowi fl = { .oif = sk->sk_bound_dev_if,
- .nl_u = { .ip4_u =
- { .daddr = ((opt && opt->srr) ?
- opt->faddr :
- ireq->rmt_addr),
- .saddr = ireq->loc_addr,
- .tos = RT_CONN_FLAGS(sk) } },
- .proto = IPPROTO_TCP,
- .uli_u = { .ports =
- { .sport = inet_sk(sk)->sport,
- .dport = ireq->rmt_port } } };
-
- if (ip_route_output_flow(&rt, &fl, sk, 0)) {
- IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
- return NULL;
- }
- if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
- ip_rt_put(rt);
- IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
- return NULL;
- }
- return &rt->u.dst;
-}
-
/*
* Send a SYN-ACK after having received an ACK.
* This still operates on a request_sock only, not on a big
@@ -1297,7 +794,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
struct sk_buff * skb;
/* First, grab a route. */
- if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+ if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
goto out;
skb = tcp_make_synack(sk, dst, req);
@@ -1399,7 +896,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* limitations, they conserve resources and peer is
* evidently real one.
*/
- if (tcp_synq_is_full(sk) && !isn) {
+ if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
want_cookie = 1;
@@ -1413,7 +910,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* clogging syn queue with openreqs with exponentially increasing
* timeout.
*/
- if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
req = reqsk_alloc(&tcp_request_sock_ops);
@@ -1469,8 +966,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* are made in the function processing timewait state.
*/
if (tmp_opt.saw_tstamp &&
- sysctl_tcp_tw_recycle &&
- (dst = tcp_v4_route_req(sk, req)) != NULL &&
+ tcp_death_row.sysctl_tw_recycle &&
+ (dst = inet_csk_route_req(sk, req)) != NULL &&
(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
peer->v4daddr == saddr) {
if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
@@ -1483,7 +980,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
}
/* Kill the following clause, if you dislike this way. */
else if (!sysctl_tcp_syncookies &&
- (sysctl_max_syn_backlog - tcp_synq_len(sk) <
+ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
(!peer || !peer->tcp_ts_stamp) &&
(!dst || !dst_metric(dst, RTAX_RTT))) {
@@ -1494,12 +991,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* to destinations, already remembered
* to the moment of synflood.
*/
- NETDEBUG(if (net_ratelimit()) \
- printk(KERN_DEBUG "TCP: drop open "
- "request from %u.%u."
- "%u.%u/%u\n", \
- NIPQUAD(saddr),
- ntohs(skb->h.th->source)));
+ LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
+ "request from %u.%u.%u.%u/%u\n",
+ NIPQUAD(saddr),
+ ntohs(skb->h.th->source));
dst_release(dst);
goto drop_and_free;
}
@@ -1514,7 +1009,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (want_cookie) {
reqsk_free(req);
} else {
- tcp_v4_synq_add(sk, req);
+ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
}
return 0;
@@ -1542,15 +1037,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (sk_acceptq_is_full(sk))
goto exit_overflow;
- if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+ if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
goto exit;
newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
goto exit;
- newsk->sk_dst_cache = dst;
- tcp_v4_setup_caps(newsk, dst);
+ sk_setup_caps(newsk, dst);
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
@@ -1560,7 +1054,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newinet->saddr = ireq->loc_addr;
newinet->opt = ireq->opt;
ireq->opt = NULL;
- newinet->mc_index = tcp_v4_iif(skb);
+ newinet->mc_index = inet_iif(skb);
newinet->mc_ttl = skb->nh.iph->ttl;
newtp->ext_header_len = 0;
if (newinet->opt)
@@ -1571,8 +1065,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
tcp_initialize_rcv_mss(newsk);
- __tcp_v4_hash(newsk, 0);
- __tcp_inherit_port(sk, newsk);
+ __inet_hash(&tcp_hashinfo, newsk, 0);
+ __inet_inherit_port(&tcp_hashinfo, sk, newsk);
return newsk;
@@ -1588,27 +1082,24 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = skb->h.th;
struct iphdr *iph = skb->nh.iph;
- struct tcp_sock *tp = tcp_sk(sk);
struct sock *nsk;
struct request_sock **prev;
/* Find possible connection requests. */
- struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
- iph->saddr, iph->daddr);
+ struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
+ iph->saddr, iph->daddr);
if (req)
return tcp_check_req(sk, skb, req, prev);
- nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
- th->source,
- skb->nh.iph->daddr,
- ntohs(th->dest),
- tcp_v4_iif(skb));
+ nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
+ th->source, skb->nh.iph->daddr,
+ ntohs(th->dest), inet_iif(skb));
if (nsk) {
if (nsk->sk_state != TCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
- tcp_tw_put((struct tcp_tw_bucket *)nsk);
+ inet_twsk_put((struct inet_timewait_sock *)nsk);
return NULL;
}
@@ -1627,8 +1118,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb)
skb->nh.iph->daddr, skb->csum))
return 0;
- NETDEBUG(if (net_ratelimit())
- printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
skb->ip_summed = CHECKSUM_NONE;
}
if (skb->len <= 76) {
@@ -1744,9 +1234,9 @@ int tcp_v4_rcv(struct sk_buff *skb)
TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
TCP_SKB_CB(skb)->sacked = 0;
- sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
- skb->nh.iph->daddr, ntohs(th->dest),
- tcp_v4_iif(skb));
+ sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
+ skb->nh.iph->daddr, ntohs(th->dest),
+ inet_iif(skb));
if (!sk)
goto no_tcp_socket;
@@ -1798,24 +1288,26 @@ discard_and_relse:
do_time_wait:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- tcp_tw_put((struct tcp_tw_bucket *) sk);
+ inet_twsk_put((struct inet_timewait_sock *) sk);
goto discard_it;
}
if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
TCP_INC_STATS_BH(TCP_MIB_INERRS);
- tcp_tw_put((struct tcp_tw_bucket *) sk);
+ inet_twsk_put((struct inet_timewait_sock *) sk);
goto discard_it;
}
- switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
- skb, th, skb->len)) {
+ switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
+ skb, th)) {
case TCP_TW_SYN: {
- struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
- ntohs(th->dest),
- tcp_v4_iif(skb));
+ struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
+ skb->nh.iph->daddr,
+ ntohs(th->dest),
+ inet_iif(skb));
if (sk2) {
- tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
- tcp_tw_put((struct tcp_tw_bucket *)sk);
+ inet_twsk_deschedule((struct inet_timewait_sock *)sk,
+ &tcp_death_row);
+ inet_twsk_put((struct inet_timewait_sock *)sk);
sk = sk2;
goto process;
}
@@ -1831,112 +1323,6 @@ do_time_wait:
goto discard_it;
}
-/* With per-bucket locks this operation is not-atomic, so that
- * this version is not worse.
- */
-static void __tcp_v4_rehash(struct sock *sk)
-{
- sk->sk_prot->unhash(sk);
- sk->sk_prot->hash(sk);
-}
-
-static int tcp_v4_reselect_saddr(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- int err;
- struct rtable *rt;
- __u32 old_saddr = inet->saddr;
- __u32 new_saddr;
- __u32 daddr = inet->daddr;
-
- if (inet->opt && inet->opt->srr)
- daddr = inet->opt->faddr;
-
- /* Query new route. */
- err = ip_route_connect(&rt, daddr, 0,
- RT_CONN_FLAGS(sk),
- sk->sk_bound_dev_if,
- IPPROTO_TCP,
- inet->sport, inet->dport, sk);
- if (err)
- return err;
-
- __sk_dst_set(sk, &rt->u.dst);
- tcp_v4_setup_caps(sk, &rt->u.dst);
-
- new_saddr = rt->rt_src;
-
- if (new_saddr == old_saddr)
- return 0;
-
- if (sysctl_ip_dynaddr > 1) {
- printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
- "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
- NIPQUAD(old_saddr),
- NIPQUAD(new_saddr));
- }
-
- inet->saddr = new_saddr;
- inet->rcv_saddr = new_saddr;
-
- /* XXX The only one ugly spot where we need to
- * XXX really change the sockets identity after
- * XXX it has entered the hashes. -DaveM
- *
- * Besides that, it does not check for connection
- * uniqueness. Wait for troubles.
- */
- __tcp_v4_rehash(sk);
- return 0;
-}
-
-int tcp_v4_rebuild_header(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
- u32 daddr;
- int err;
-
- /* Route is OK, nothing to do. */
- if (rt)
- return 0;
-
- /* Reroute. */
- daddr = inet->daddr;
- if (inet->opt && inet->opt->srr)
- daddr = inet->opt->faddr;
-
- {
- struct flowi fl = { .oif = sk->sk_bound_dev_if,
- .nl_u = { .ip4_u =
- { .daddr = daddr,
- .saddr = inet->saddr,
- .tos = RT_CONN_FLAGS(sk) } },
- .proto = IPPROTO_TCP,
- .uli_u = { .ports =
- { .sport = inet->sport,
- .dport = inet->dport } } };
-
- err = ip_route_output_flow(&rt, &fl, sk, 0);
- }
- if (!err) {
- __sk_dst_set(sk, &rt->u.dst);
- tcp_v4_setup_caps(sk, &rt->u.dst);
- return 0;
- }
-
- /* Routing failed... */
- sk->sk_route_caps = 0;
-
- if (!sysctl_ip_dynaddr ||
- sk->sk_state != TCP_SYN_SENT ||
- (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
- (err = tcp_v4_reselect_saddr(sk)) != 0)
- sk->sk_err_soft = -err;
-
- return err;
-}
-
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
{
struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
@@ -1985,18 +1371,18 @@ int tcp_v4_remember_stamp(struct sock *sk)
return 0;
}
-int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
+int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
- struct inet_peer *peer = NULL;
-
- peer = inet_getpeer(tw->tw_daddr, 1);
+ struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
if (peer) {
- if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
+ const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+
+ if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
(peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
- peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
- peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
- peer->tcp_ts = tw->tw_ts_recent;
+ peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
+ peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
+ peer->tcp_ts = tcptw->tw_ts_recent;
}
inet_putpeer(peer);
return 1;
@@ -2008,7 +1394,7 @@ int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
struct tcp_func ipv4_specific = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
- .rebuild_header = tcp_v4_rebuild_header,
+ .rebuild_header = inet_sk_rebuild_header,
.conn_request = tcp_v4_conn_request,
.syn_recv_sock = tcp_v4_syn_recv_sock,
.remember_stamp = tcp_v4_remember_stamp,
@@ -2024,13 +1410,14 @@ struct tcp_func ipv4_specific = {
*/
static int tcp_v4_init_sock(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
- tp->rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev = TCP_TIMEOUT_INIT;
/* So many TCP implementations out there (incorrectly) count the
@@ -2048,7 +1435,7 @@ static int tcp_v4_init_sock(struct sock *sk)
tp->mss_cache = 536;
tp->reordering = sysctl_tcp_reordering;
- tp->ca_ops = &tcp_init_congestion_ops;
+ icsk->icsk_ca_ops = &tcp_init_congestion_ops;
sk->sk_state = TCP_CLOSE;
@@ -2071,7 +1458,7 @@ int tcp_v4_destroy_sock(struct sock *sk)
tcp_clear_xmit_timers(sk);
- tcp_cleanup_congestion_control(tp);
+ tcp_cleanup_congestion_control(sk);
/* Cleanup up the write buffer. */
sk_stream_writequeue_purge(sk);
@@ -2083,8 +1470,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
__skb_queue_purge(&tp->ucopy.prequeue);
/* Clean up a referenced TCP bind bucket. */
- if (tp->bind_hash)
- tcp_put_port(sk);
+ if (inet_csk(sk)->icsk_bind_hash)
+ inet_put_port(&tcp_hashinfo, sk);
/*
* If sendmsg cached page exists, toss it.
@@ -2104,13 +1491,13 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */
-static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
+static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
{
return hlist_empty(head) ? NULL :
- list_entry(head->first, struct tcp_tw_bucket, tw_node);
+ list_entry(head->first, struct inet_timewait_sock, tw_node);
}
-static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
+static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
return tw->tw_node.next ?
hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
@@ -2118,14 +1505,14 @@ static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
static void *listening_get_next(struct seq_file *seq, void *cur)
{
- struct tcp_sock *tp;
+ struct inet_connection_sock *icsk;
struct hlist_node *node;
struct sock *sk = cur;
struct tcp_iter_state* st = seq->private;
if (!sk) {
st->bucket = 0;
- sk = sk_head(&tcp_listening_hash[0]);
+ sk = sk_head(&tcp_hashinfo.listening_hash[0]);
goto get_sk;
}
@@ -2134,7 +1521,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
if (st->state == TCP_SEQ_STATE_OPENREQ) {
struct request_sock *req = cur;
- tp = tcp_sk(st->syn_wait_sk);
+ icsk = inet_csk(st->syn_wait_sk);
req = req->dl_next;
while (1) {
while (req) {
@@ -2147,17 +1534,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
if (++st->sbucket >= TCP_SYNQ_HSIZE)
break;
get_req:
- req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
+ req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
}
sk = sk_next(st->syn_wait_sk);
st->state = TCP_SEQ_STATE_LISTENING;
- read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+ read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
} else {
- tp = tcp_sk(sk);
- read_lock_bh(&tp->accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&tp->accept_queue))
+ icsk = inet_csk(sk);
+ read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ if (reqsk_queue_len(&icsk->icsk_accept_queue))
goto start_req;
- read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+ read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
sk = sk_next(sk);
}
get_sk:
@@ -2166,9 +1553,9 @@ get_sk:
cur = sk;
goto out;
}
- tp = tcp_sk(sk);
- read_lock_bh(&tp->accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&tp->accept_queue)) {
+ icsk = inet_csk(sk);
+ read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
st->uid = sock_i_uid(sk);
st->syn_wait_sk = sk;
@@ -2176,10 +1563,10 @@ start_req:
st->sbucket = 0;
goto get_req;
}
- read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+ read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
- if (++st->bucket < TCP_LHTABLE_SIZE) {
- sk = sk_head(&tcp_listening_hash[st->bucket]);
+ if (++st->bucket < INET_LHTABLE_SIZE) {
+ sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
goto get_sk;
}
cur = NULL;
@@ -2203,16 +1590,16 @@ static void *established_get_first(struct seq_file *seq)
struct tcp_iter_state* st = seq->private;
void *rc = NULL;
- for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
+ for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
struct sock *sk;
struct hlist_node *node;
- struct tcp_tw_bucket *tw;
+ struct inet_timewait_sock *tw;
/* We can reschedule _before_ having picked the target: */
cond_resched_softirq();
- read_lock(&tcp_ehash[st->bucket].lock);
- sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
+ read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
+ sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
if (sk->sk_family != st->family) {
continue;
}
@@ -2220,15 +1607,15 @@ static void *established_get_first(struct seq_file *seq)
goto out;
}
st->state = TCP_SEQ_STATE_TIME_WAIT;
- tw_for_each(tw, node,
- &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
+ inet_twsk_for_each(tw, node,
+ &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
if (tw->tw_family != st->family) {
continue;
}
rc = tw;
goto out;
}
- read_unlock(&tcp_ehash[st->bucket].lock);
+ read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
st->state = TCP_SEQ_STATE_ESTABLISHED;
}
out:
@@ -2238,7 +1625,7 @@ out:
static void *established_get_next(struct seq_file *seq, void *cur)
{
struct sock *sk = cur;
- struct tcp_tw_bucket *tw;
+ struct inet_timewait_sock *tw;
struct hlist_node *node;
struct tcp_iter_state* st = seq->private;
@@ -2255,15 +1642,15 @@ get_tw:
cur = tw;
goto out;
}
- read_unlock(&tcp_ehash[st->bucket].lock);
+ read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
st->state = TCP_SEQ_STATE_ESTABLISHED;
/* We can reschedule between buckets: */
cond_resched_softirq();
- if (++st->bucket < tcp_ehash_size) {
- read_lock(&tcp_ehash[st->bucket].lock);
- sk = sk_head(&tcp_ehash[st->bucket].chain);
+ if (++st->bucket < tcp_hashinfo.ehash_size) {
+ read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
+ sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
} else {
cur = NULL;
goto out;
@@ -2277,7 +1664,7 @@ get_tw:
}
st->state = TCP_SEQ_STATE_TIME_WAIT;
- tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
+ tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
goto get_tw;
found:
cur = sk;
@@ -2301,12 +1688,12 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
void *rc;
struct tcp_iter_state* st = seq->private;
- tcp_listen_lock();
+ inet_listen_lock(&tcp_hashinfo);
st->state = TCP_SEQ_STATE_LISTENING;
rc = listening_get_idx(seq, &pos);
if (!rc) {
- tcp_listen_unlock();
+ inet_listen_unlock(&tcp_hashinfo);
local_bh_disable();
st->state = TCP_SEQ_STATE_ESTABLISHED;
rc = established_get_idx(seq, pos);
@@ -2339,7 +1726,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
case TCP_SEQ_STATE_LISTENING:
rc = listening_get_next(seq, v);
if (!rc) {
- tcp_listen_unlock();
+ inet_listen_unlock(&tcp_hashinfo);
local_bh_disable();
st->state = TCP_SEQ_STATE_ESTABLISHED;
rc = established_get_first(seq);
@@ -2362,17 +1749,17 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
switch (st->state) {
case TCP_SEQ_STATE_OPENREQ:
if (v) {
- struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
- read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+ struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
+ read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
- tcp_listen_unlock();
+ inet_listen_unlock(&tcp_hashinfo);
break;
case TCP_SEQ_STATE_TIME_WAIT:
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
- read_unlock(&tcp_ehash[st->bucket].lock);
+ read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
local_bh_enable();
break;
}
@@ -2469,18 +1856,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
int timer_active;
unsigned long timer_expires;
struct tcp_sock *tp = tcp_sk(sp);
+ const struct inet_connection_sock *icsk = inet_csk(sp);
struct inet_sock *inet = inet_sk(sp);
unsigned int dest = inet->daddr;
unsigned int src = inet->rcv_saddr;
__u16 destp = ntohs(inet->dport);
__u16 srcp = ntohs(inet->sport);
- if (tp->pending == TCP_TIME_RETRANS) {
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
timer_active = 1;
- timer_expires = tp->timeout;
- } else if (tp->pending == TCP_TIME_PROBE0) {
+ timer_expires = icsk->icsk_timeout;
+ } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
- timer_expires = tp->timeout;
+ timer_expires = icsk->icsk_timeout;
} else if (timer_pending(&sp->sk_timer)) {
timer_active = 2;
timer_expires = sp->sk_timer.expires;
@@ -2495,17 +1883,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
timer_active,
jiffies_to_clock_t(timer_expires - jiffies),
- tp->retransmits,
+ icsk->icsk_retransmits,
sock_i_uid(sp),
- tp->probes_out,
+ icsk->icsk_probes_out,
sock_i_ino(sp),
atomic_read(&sp->sk_refcnt), sp,
- tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
+ icsk->icsk_rto,
+ icsk->icsk_ack.ato,
+ (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
tp->snd_cwnd,
tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
-static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
+static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
{
unsigned int dest, src;
__u16 destp, srcp;
@@ -2585,7 +1975,7 @@ struct proto tcp_prot = {
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
- .accept = tcp_accept,
+ .accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
@@ -2600,6 +1990,7 @@ struct proto tcp_prot = {
.get_port = tcp_v4_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.sockets_allocated = &tcp_sockets_allocated,
+ .orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
@@ -2607,6 +1998,7 @@ struct proto tcp_prot = {
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
+ .twsk_obj_size = sizeof(struct tcp_timewait_sock),
.rsk_prot = &tcp_request_sock_ops,
};
@@ -2628,19 +2020,13 @@ void __init tcp_v4_init(struct net_proto_family *ops)
}
EXPORT_SYMBOL(ipv4_specific);
-EXPORT_SYMBOL(tcp_bind_hash);
-EXPORT_SYMBOL(tcp_bucket_create);
+EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
-EXPORT_SYMBOL(tcp_inherit_port);
-EXPORT_SYMBOL(tcp_listen_wlock);
-EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
-EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
-EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f42a284164b..b1a63b2c6b4 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -35,13 +35,27 @@
#define SYNC_INIT 1
#endif
-int sysctl_tcp_tw_recycle;
-int sysctl_tcp_max_tw_buckets = NR_FILE*2;
-
int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_abort_on_overflow;
-static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
+struct inet_timewait_death_row tcp_death_row = {
+ .sysctl_max_tw_buckets = NR_FILE * 2,
+ .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
+ .death_lock = SPIN_LOCK_UNLOCKED,
+ .hashinfo = &tcp_hashinfo,
+ .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
+ (unsigned long)&tcp_death_row),
+ .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,
+ inet_twdr_twkill_work,
+ &tcp_death_row),
+/* Short-time timewait calendar */
+
+ .twcal_hand = -1,
+ .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
+ (unsigned long)&tcp_death_row),
+};
+
+EXPORT_SYMBOL_GPL(tcp_death_row);
static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
@@ -52,47 +66,6 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
return (seq == e_win && seq == end_seq);
}
-/* New-style handling of TIME_WAIT sockets. */
-
-int tcp_tw_count;
-
-
-/* Must be called with locally disabled BHs. */
-static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
-{
- struct tcp_ehash_bucket *ehead;
- struct tcp_bind_hashbucket *bhead;
- struct tcp_bind_bucket *tb;
-
- /* Unlink from established hashes. */
- ehead = &tcp_ehash[tw->tw_hashent];
- write_lock(&ehead->lock);
- if (hlist_unhashed(&tw->tw_node)) {
- write_unlock(&ehead->lock);
- return;
- }
- __hlist_del(&tw->tw_node);
- sk_node_init(&tw->tw_node);
- write_unlock(&ehead->lock);
-
- /* Disassociate with bind bucket. */
- bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
- spin_lock(&bhead->lock);
- tb = tw->tw_tb;
- __hlist_del(&tw->tw_bind_node);
- tw->tw_tb = NULL;
- tcp_bucket_destroy(tb);
- spin_unlock(&bhead->lock);
-
-#ifdef INET_REFCNT_DEBUG
- if (atomic_read(&tw->tw_refcnt) != 1) {
- printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
- atomic_read(&tw->tw_refcnt));
- }
-#endif
- tcp_tw_put(tw);
-}
-
/*
* * Main purpose of TIME-WAIT state is to close connection gracefully,
* when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
@@ -122,19 +95,20 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
* to avoid misread sequence numbers, states etc. --ANK
*/
enum tcp_tw_status
-tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
+ const struct tcphdr *th)
{
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
struct tcp_options_received tmp_opt;
int paws_reject = 0;
tmp_opt.saw_tstamp = 0;
- if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) {
+ if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
tcp_parse_options(skb, &tmp_opt, 0);
if (tmp_opt.saw_tstamp) {
- tmp_opt.ts_recent = tw->tw_ts_recent;
- tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+ tmp_opt.ts_recent = tcptw->tw_ts_recent;
+ tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
paws_reject = tcp_paws_check(&tmp_opt, th->rst);
}
}
@@ -145,20 +119,20 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
/* Out of window, send ACK */
if (paws_reject ||
!tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- tw->tw_rcv_nxt,
- tw->tw_rcv_nxt + tw->tw_rcv_wnd))
+ tcptw->tw_rcv_nxt,
+ tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
return TCP_TW_ACK;
if (th->rst)
goto kill;
- if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt))
+ if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
goto kill_with_rst;
/* Dup ACK? */
- if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) ||
+ if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
@@ -166,19 +140,19 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
* reset.
*/
if (!th->fin ||
- TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) {
+ TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
- tcp_tw_deschedule(tw);
- tcp_tw_put(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);
+ inet_twsk_put(tw);
return TCP_TW_RST;
}
/* FIN arrived, enter true time-wait state. */
- tw->tw_substate = TCP_TIME_WAIT;
- tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tw->tw_substate = TCP_TIME_WAIT;
+ tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (tmp_opt.saw_tstamp) {
- tw->tw_ts_recent_stamp = xtime.tv_sec;
- tw->tw_ts_recent = tmp_opt.rcv_tsval;
+ tcptw->tw_ts_recent_stamp = xtime.tv_sec;
+ tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
}
/* I am shamed, but failed to make it more elegant.
@@ -187,11 +161,13 @@ kill_with_rst:
* do not undertsnad recycling in any case, it not
* a big problem in practice. --ANK */
if (tw->tw_family == AF_INET &&
- sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp &&
+ tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
tcp_v4_tw_remember_stamp(tw))
- tcp_tw_schedule(tw, tw->tw_timeout);
+ inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
+ TCP_TIMEWAIT_LEN);
else
- tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+ TCP_TIMEWAIT_LEN);
return TCP_TW_ACK;
}
@@ -213,7 +189,7 @@ kill_with_rst:
*/
if (!paws_reject &&
- (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt &&
+ (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
(TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
/* In window segment, it may be only reset or bare ack. */
@@ -224,19 +200,20 @@ kill_with_rst:
*/
if (sysctl_tcp_rfc1337 == 0) {
kill:
- tcp_tw_deschedule(tw);
- tcp_tw_put(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);
+ inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
}
- tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+ TCP_TIMEWAIT_LEN);
if (tmp_opt.saw_tstamp) {
- tw->tw_ts_recent = tmp_opt.rcv_tsval;
- tw->tw_ts_recent_stamp = xtime.tv_sec;
+ tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
+ tcptw->tw_ts_recent_stamp = xtime.tv_sec;
}
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
@@ -258,9 +235,10 @@ kill:
*/
if (th->syn && !th->rst && !th->ack && !paws_reject &&
- (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) ||
- (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
- u32 isn = tw->tw_snd_nxt + 65535 + 2;
+ (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
+ (tmp_opt.saw_tstamp &&
+ (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
+ u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
if (isn == 0)
isn++;
TCP_SKB_CB(skb)->when = isn;
@@ -278,107 +256,57 @@ kill:
* Do not reschedule in the last case.
*/
if (paws_reject || th->ack)
- tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+ TCP_TIMEWAIT_LEN);
/* Send ACK. Note, we do not put the bucket,
* it will be released by caller.
*/
return TCP_TW_ACK;
}
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
-/* Enter the time wait state. This is called with locally disabled BH.
- * Essentially we whip up a timewait bucket, copy the
- * relevant info into it from the SK, and mess with hash chains
- * and list linkage.
- */
-static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
-{
- struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
- struct tcp_bind_hashbucket *bhead;
-
- /* Step 1: Put TW into bind hash. Original socket stays there too.
- Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in
- binding cache, even if it is closed.
- */
- bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
- spin_lock(&bhead->lock);
- tw->tw_tb = tcp_sk(sk)->bind_hash;
- BUG_TRAP(tcp_sk(sk)->bind_hash);
- tw_add_bind_node(tw, &tw->tw_tb->owners);
- spin_unlock(&bhead->lock);
-
- write_lock(&ehead->lock);
-
- /* Step 2: Remove SK from established hash. */
- if (__sk_del_node_init(sk))
- sock_prot_dec_use(sk->sk_prot);
-
- /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
- tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
- atomic_inc(&tw->tw_refcnt);
-
- write_unlock(&ehead->lock);
-}
-
/*
* Move a socket to time-wait or dead fin-wait-2 state.
*/
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
- struct tcp_tw_bucket *tw = NULL;
- struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_timewait_sock *tw = NULL;
+ const struct tcp_sock *tp = tcp_sk(sk);
int recycle_ok = 0;
- if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
+ if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
recycle_ok = tp->af_specific->remember_stamp(sk);
- if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
- tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
-
- if(tw != NULL) {
- struct inet_sock *inet = inet_sk(sk);
- int rto = (tp->rto<<2) - (tp->rto>>1);
-
- /* Give us an identity. */
- tw->tw_daddr = inet->daddr;
- tw->tw_rcv_saddr = inet->rcv_saddr;
- tw->tw_bound_dev_if = sk->sk_bound_dev_if;
- tw->tw_num = inet->num;
- tw->tw_state = TCP_TIME_WAIT;
- tw->tw_substate = state;
- tw->tw_sport = inet->sport;
- tw->tw_dport = inet->dport;
- tw->tw_family = sk->sk_family;
- tw->tw_reuse = sk->sk_reuse;
- tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
- atomic_set(&tw->tw_refcnt, 1);
+ if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
+ tw = inet_twsk_alloc(sk, state);
- tw->tw_hashent = sk->sk_hashent;
- tw->tw_rcv_nxt = tp->rcv_nxt;
- tw->tw_snd_nxt = tp->snd_nxt;
- tw->tw_rcv_wnd = tcp_receive_window(tp);
- tw->tw_ts_recent = tp->rx_opt.ts_recent;
- tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
- tw_dead_node_init(tw);
+ if (tw != NULL) {
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+
+ tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
+ tcptw->tw_rcv_nxt = tp->rcv_nxt;
+ tcptw->tw_snd_nxt = tp->snd_nxt;
+ tcptw->tw_rcv_wnd = tcp_receive_window(tp);
+ tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
+ tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
if (tw->tw_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
- ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr);
- ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr);
- tw->tw_v6_ipv6only = np->ipv6only;
- } else {
- memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr));
- memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr));
- tw->tw_v6_ipv6only = 0;
+ ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
+ ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
+ tw->tw_ipv6only = np->ipv6only;
}
#endif
/* Linkage updates. */
- __tcp_tw_hashdance(sk, tw);
+ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
/* Get the TIME_WAIT timeout firing. */
if (timeo < rto)
@@ -392,8 +320,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
timeo = TCP_TIMEWAIT_LEN;
}
- tcp_tw_schedule(tw, timeo);
- tcp_tw_put(tw);
+ inet_twsk_schedule(tw, &tcp_death_row, timeo,
+ TCP_TIMEWAIT_LEN);
+ inet_twsk_put(tw);
} else {
/* Sorry, if we're out of memory, just CLOSE this
* socket up. We've got bigger problems than
@@ -407,277 +336,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tcp_done(sk);
}
-/* Kill off TIME_WAIT sockets once their lifetime has expired. */
-static int tcp_tw_death_row_slot;
-
-static void tcp_twkill(unsigned long);
-
-/* TIME_WAIT reaping mechanism. */
-#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
-#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
-
-#define TCP_TWKILL_QUOTA 100
-
-static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
-static DEFINE_SPINLOCK(tw_death_lock);
-static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
-static void twkill_work(void *);
-static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
-static u32 twkill_thread_slots;
-
-/* Returns non-zero if quota exceeded. */
-static int tcp_do_twkill_work(int slot, unsigned int quota)
-{
- struct tcp_tw_bucket *tw;
- struct hlist_node *node;
- unsigned int killed;
- int ret;
-
- /* NOTE: compare this to previous version where lock
- * was released after detaching chain. It was racy,
- * because tw buckets are scheduled in not serialized context
- * in 2.3 (with netfilter), and with softnet it is common, because
- * soft irqs are not sequenced.
- */
- killed = 0;
- ret = 0;
-rescan:
- tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
- __tw_del_dead_node(tw);
- spin_unlock(&tw_death_lock);
- tcp_timewait_kill(tw);
- tcp_tw_put(tw);
- killed++;
- spin_lock(&tw_death_lock);
- if (killed > quota) {
- ret = 1;
- break;
- }
-
- /* While we dropped tw_death_lock, another cpu may have
- * killed off the next TW bucket in the list, therefore
- * do a fresh re-read of the hlist head node with the
- * lock reacquired. We still use the hlist traversal
- * macro in order to get the prefetches.
- */
- goto rescan;
- }
-
- tcp_tw_count -= killed;
- NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
-
- return ret;
-}
-
-static void tcp_twkill(unsigned long dummy)
-{
- int need_timer, ret;
-
- spin_lock(&tw_death_lock);
-
- if (tcp_tw_count == 0)
- goto out;
-
- need_timer = 0;
- ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
- if (ret) {
- twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
- mb();
- schedule_work(&tcp_twkill_work);
- need_timer = 1;
- } else {
- /* We purged the entire slot, anything left? */
- if (tcp_tw_count)
- need_timer = 1;
- }
- tcp_tw_death_row_slot =
- ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
- if (need_timer)
- mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
-out:
- spin_unlock(&tw_death_lock);
-}
-
-extern void twkill_slots_invalid(void);
-
-static void twkill_work(void *dummy)
-{
- int i;
-
- if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
- twkill_slots_invalid();
-
- while (twkill_thread_slots) {
- spin_lock_bh(&tw_death_lock);
- for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
- if (!(twkill_thread_slots & (1 << i)))
- continue;
-
- while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
- if (need_resched()) {
- spin_unlock_bh(&tw_death_lock);
- schedule();
- spin_lock_bh(&tw_death_lock);
- }
- }
-
- twkill_thread_slots &= ~(1 << i);
- }
- spin_unlock_bh(&tw_death_lock);
- }
-}
-
-/* These are always called from BH context. See callers in
- * tcp_input.c to verify this.
- */
-
-/* This is for handling early-kills of TIME_WAIT sockets. */
-void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
-{
- spin_lock(&tw_death_lock);
- if (tw_del_dead_node(tw)) {
- tcp_tw_put(tw);
- if (--tcp_tw_count == 0)
- del_timer(&tcp_tw_timer);
- }
- spin_unlock(&tw_death_lock);
- tcp_timewait_kill(tw);
-}
-
-/* Short-time timewait calendar */
-
-static int tcp_twcal_hand = -1;
-static int tcp_twcal_jiffie;
-static void tcp_twcal_tick(unsigned long);
-static struct timer_list tcp_twcal_timer =
- TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
-static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
-
-static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
-{
- struct hlist_head *list;
- int slot;
-
- /* timeout := RTO * 3.5
- *
- * 3.5 = 1+2+0.5 to wait for two retransmits.
- *
- * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
- * our ACK acking that FIN can be lost. If N subsequent retransmitted
- * FINs (or previous seqments) are lost (probability of such event
- * is p^(N+1), where p is probability to lose single packet and
- * time to detect the loss is about RTO*(2^N - 1) with exponential
- * backoff). Normal timewait length is calculated so, that we
- * waited at least for one retransmitted FIN (maximal RTO is 120sec).
- * [ BTW Linux. following BSD, violates this requirement waiting
- * only for 60sec, we should wait at least for 240 secs.
- * Well, 240 consumes too much of resources 8)
- * ]
- * This interval is not reduced to catch old duplicate and
- * responces to our wandering segments living for two MSLs.
- * However, if we use PAWS to detect
- * old duplicates, we can reduce the interval to bounds required
- * by RTO, rather than MSL. So, if peer understands PAWS, we
- * kill tw bucket after 3.5*RTO (it is important that this number
- * is greater than TS tick!) and detect old duplicates with help
- * of PAWS.
- */
- slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
-
- spin_lock(&tw_death_lock);
-
- /* Unlink it, if it was scheduled */
- if (tw_del_dead_node(tw))
- tcp_tw_count--;
- else
- atomic_inc(&tw->tw_refcnt);
-
- if (slot >= TCP_TW_RECYCLE_SLOTS) {
- /* Schedule to slow timer */
- if (timeo >= TCP_TIMEWAIT_LEN) {
- slot = TCP_TWKILL_SLOTS-1;
- } else {
- slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
- if (slot >= TCP_TWKILL_SLOTS)
- slot = TCP_TWKILL_SLOTS-1;
- }
- tw->tw_ttd = jiffies + timeo;
- slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
- list = &tcp_tw_death_row[slot];
- } else {
- tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
-
- if (tcp_twcal_hand < 0) {
- tcp_twcal_hand = 0;
- tcp_twcal_jiffie = jiffies;
- tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
- add_timer(&tcp_twcal_timer);
- } else {
- if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
- mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
- slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
- }
- list = &tcp_twcal_row[slot];
- }
-
- hlist_add_head(&tw->tw_death_node, list);
-
- if (tcp_tw_count++ == 0)
- mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
- spin_unlock(&tw_death_lock);
-}
-
-void tcp_twcal_tick(unsigned long dummy)
-{
- int n, slot;
- unsigned long j;
- unsigned long now = jiffies;
- int killed = 0;
- int adv = 0;
-
- spin_lock(&tw_death_lock);
- if (tcp_twcal_hand < 0)
- goto out;
-
- slot = tcp_twcal_hand;
- j = tcp_twcal_jiffie;
-
- for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
- if (time_before_eq(j, now)) {
- struct hlist_node *node, *safe;
- struct tcp_tw_bucket *tw;
-
- tw_for_each_inmate_safe(tw, node, safe,
- &tcp_twcal_row[slot]) {
- __tw_del_dead_node(tw);
- tcp_timewait_kill(tw);
- tcp_tw_put(tw);
- killed++;
- }
- } else {
- if (!adv) {
- adv = 1;
- tcp_twcal_jiffie = j;
- tcp_twcal_hand = slot;
- }
-
- if (!hlist_empty(&tcp_twcal_row[slot])) {
- mod_timer(&tcp_twcal_timer, j);
- goto out;
- }
- }
- j += (1<<TCP_TW_RECYCLE_TICK);
- slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
- }
- tcp_twcal_hand = -1;
-
-out:
- if ((tcp_tw_count -= killed) == 0)
- del_timer(&tcp_tw_timer);
- NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
- spin_unlock(&tw_death_lock);
-}
-
/* This is not only more efficient than what we used to do, it eliminates
* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
*
@@ -686,75 +344,27 @@ out:
*/
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
- /* allocate the newsk from the same slab of the master sock,
- * if not, at sk_free time we'll try to free it from the wrong
- * slabcache (i.e. is it TCPv4 or v6?), this is handled thru sk->sk_prot -acme */
- struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
+ struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
- if(newsk != NULL) {
- struct inet_request_sock *ireq = inet_rsk(req);
+ if (newsk != NULL) {
+ const struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_request_sock *treq = tcp_rsk(req);
+ struct inet_connection_sock *newicsk = inet_csk(sk);
struct tcp_sock *newtp;
- struct sk_filter *filter;
-
- memcpy(newsk, sk, sizeof(struct tcp_sock));
- newsk->sk_state = TCP_SYN_RECV;
-
- /* SANITY */
- sk_node_init(&newsk->sk_node);
- tcp_sk(newsk)->bind_hash = NULL;
-
- /* Clone the TCP header template */
- inet_sk(newsk)->dport = ireq->rmt_port;
-
- sock_lock_init(newsk);
- bh_lock_sock(newsk);
-
- rwlock_init(&newsk->sk_dst_lock);
- atomic_set(&newsk->sk_rmem_alloc, 0);
- skb_queue_head_init(&newsk->sk_receive_queue);
- atomic_set(&newsk->sk_wmem_alloc, 0);
- skb_queue_head_init(&newsk->sk_write_queue);
- atomic_set(&newsk->sk_omem_alloc, 0);
- newsk->sk_wmem_queued = 0;
- newsk->sk_forward_alloc = 0;
-
- sock_reset_flag(newsk, SOCK_DONE);
- newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
- newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
- newsk->sk_send_head = NULL;
- rwlock_init(&newsk->sk_callback_lock);
- skb_queue_head_init(&newsk->sk_error_queue);
- newsk->sk_write_space = sk_stream_write_space;
-
- if ((filter = newsk->sk_filter) != NULL)
- sk_filter_charge(newsk, filter);
-
- if (unlikely(xfrm_sk_clone_policy(newsk))) {
- /* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- newsk->sk_destruct = NULL;
- sk_free(newsk);
- return NULL;
- }
/* Now setup tcp_sock */
newtp = tcp_sk(newsk);
newtp->pred_flags = 0;
newtp->rcv_nxt = treq->rcv_isn + 1;
- newtp->snd_nxt = treq->snt_isn + 1;
- newtp->snd_una = treq->snt_isn + 1;
- newtp->snd_sml = treq->snt_isn + 1;
+ newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1;
tcp_prequeue_init(newtp);
tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
- newtp->retransmits = 0;
- newtp->backoff = 0;
newtp->srtt = 0;
newtp->mdev = TCP_TIMEOUT_INIT;
- newtp->rto = TCP_TIMEOUT_INIT;
+ newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
newtp->left_out = 0;
@@ -774,9 +384,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->frto_counter = 0;
newtp->frto_highmark = 0;
- newtp->ca_ops = &tcp_reno;
+ newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
- tcp_set_ca_state(newtp, TCP_CA_Open);
+ tcp_set_ca_state(newsk, TCP_CA_Open);
tcp_init_xmit_timers(newsk);
skb_queue_head_init(&newtp->out_of_order_queue);
newtp->rcv_wup = treq->rcv_isn + 1;
@@ -789,26 +399,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->rx_opt.dsack = 0;
newtp->rx_opt.eff_sacks = 0;
- newtp->probes_out = 0;
newtp->rx_opt.num_sacks = 0;
newtp->urg_data = 0;
- /* Deinitialize accept_queue to trap illegal accesses. */
- memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue));
-
- /* Back to base struct sock members. */
- newsk->sk_err = 0;
- newsk->sk_priority = 0;
- atomic_set(&newsk->sk_refcnt, 2);
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet_sock_nr);
-#endif
- atomic_inc(&tcp_sockets_allocated);
if (sock_flag(newsk, SOCK_KEEPOPEN))
- tcp_reset_keepalive_timer(newsk,
- keepalive_time_when(newtp));
- newsk->sk_socket = NULL;
- newsk->sk_sleep = NULL;
+ inet_csk_reset_keepalive_timer(newsk,
+ keepalive_time_when(newtp));
newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
@@ -838,7 +434,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->tcp_header_len = sizeof(struct tcphdr);
}
if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
- newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
+ newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
newtp->rx_opt.mss_clamp = req->mss;
TCP_ECN_openreq_child(newtp, req);
if (newtp->ecn_flags&TCP_ECN_OK)
@@ -934,9 +530,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
does sequence test, SYN is truncated, and thus we consider
it a bare ACK.
- If tp->defer_accept, we silently drop this bare ACK. Otherwise,
- we create an established connection. Both ends (listening sockets)
- accept the new incoming connection and try to talk to each other. 8-)
+ If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
+ bare ACK. Otherwise, we create an established connection. Both
+ ends (listening sockets) accept the new incoming connection and try
+ to talk to each other. 8-)
Note: This case is both harmless, and rare. Possibility is about the
same as us discovering intelligent life on another planet tomorrow.
@@ -1003,7 +600,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
return NULL;
/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
- if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+ if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+ TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
inet_rsk(req)->acked = 1;
return NULL;
}
@@ -1018,10 +616,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
if (child == NULL)
goto listen_overflow;
- tcp_synq_unlink(tp, req, prev);
- tcp_synq_removed(sk, req);
+ inet_csk_reqsk_queue_unlink(sk, req, prev);
+ inet_csk_reqsk_queue_removed(sk, req);
- tcp_acceptq_queue(sk, req, child);
+ inet_csk_reqsk_queue_add(sk, req, child);
return child;
listen_overflow:
@@ -1035,7 +633,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
if (!(flg & TCP_FLAG_RST))
req->rsk_ops->send_reset(skb);
- tcp_synq_drop(sk, req, prev);
+ inet_csk_reqsk_queue_drop(sk, req, prev);
return NULL;
}
@@ -1074,4 +672,3 @@ EXPORT_SYMBOL(tcp_check_req);
EXPORT_SYMBOL(tcp_child_process);
EXPORT_SYMBOL(tcp_create_openreq_child);
EXPORT_SYMBOL(tcp_timewait_state_process);
-EXPORT_SYMBOL(tcp_tw_deschedule);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e3f8ea1bfa9..b907456a79f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -105,18 +105,19 @@ static __u16 tcp_advertise_mss(struct sock *sk)
/* RFC2861. Reset CWND after idle period longer than RTO to "restart window".
* This is the first part of cwnd validation mechanism. */
-static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
+static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
{
+ struct tcp_sock *tp = tcp_sk(sk);
s32 delta = tcp_time_stamp - tp->lsndtime;
u32 restart_cwnd = tcp_init_cwnd(tp, dst);
u32 cwnd = tp->snd_cwnd;
- tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
+ tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
- tp->snd_ssthresh = tcp_current_ssthresh(tp);
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
restart_cwnd = min(restart_cwnd, cwnd);
- while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
+ while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
cwnd >>= 1;
tp->snd_cwnd = max(cwnd, restart_cwnd);
tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -126,26 +127,25 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
static inline void tcp_event_data_sent(struct tcp_sock *tp,
struct sk_buff *skb, struct sock *sk)
{
- u32 now = tcp_time_stamp;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const u32 now = tcp_time_stamp;
- if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
- tcp_cwnd_restart(tp, __sk_dst_get(sk));
+ if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)
+ tcp_cwnd_restart(sk, __sk_dst_get(sk));
tp->lsndtime = now;
/* If it is a reply for ato after last received
* packet, enter pingpong mode.
*/
- if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
- tp->ack.pingpong = 1;
+ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+ icsk->icsk_ack.pingpong = 1;
}
static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tcp_dec_quickack_mode(tp, pkts);
- tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
+ tcp_dec_quickack_mode(sk, pkts);
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
/* Determine a window scaling and initial window to offer.
@@ -190,7 +190,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
}
/* Set initial window to value enough for senders,
- * following RFC1414. Senders, not following this RFC,
+ * following RFC2414. Senders, not following this RFC,
* will be satisfied with 2.
*/
if (mss > (1<<*rcv_wscale)) {
@@ -265,6 +265,7 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
if (skb != NULL) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -280,8 +281,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
#define SYSCTL_FLAG_SACK 0x4
/* If congestion control is doing timestamping */
- if (tp->ca_ops->rtt_sample)
- do_gettimeofday(&skb->stamp);
+ if (icsk->icsk_ca_ops->rtt_sample)
+ __net_timestamp(skb);
sysctl_flags = 0;
if (tcb->flags & TCPCB_FLAG_SYN) {
@@ -308,7 +309,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
}
if (tcp_packets_in_flight(tp) == 0)
- tcp_ca_event(tp, CA_EVENT_TX_START);
+ tcp_ca_event(sk, CA_EVENT_TX_START);
th = (struct tcphdr *) skb_push(skb, tcp_header_size);
skb->h.th = th;
@@ -366,7 +367,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
if (err <= 0)
return err;
- tcp_enter_cwr(tp);
+ tcp_enter_cwr(sk);
/* NET_XMIT_CN is special. It does not guarantee,
* that this packet is lost. It tells that device
@@ -403,11 +404,9 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
sk->sk_send_head = skb;
}
-static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
+static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (skb->len <= tp->mss_cache ||
+ if (skb->len <= mss_now ||
!(sk->sk_route_caps & NETIF_F_TSO)) {
/* Avoid the costly divide in the normal
* non-TSO case.
@@ -417,10 +416,10 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
} else {
unsigned int factor;
- factor = skb->len + (tp->mss_cache - 1);
- factor /= tp->mss_cache;
+ factor = skb->len + (mss_now - 1);
+ factor /= mss_now;
skb_shinfo(skb)->tso_segs = factor;
- skb_shinfo(skb)->tso_size = tp->mss_cache;
+ skb_shinfo(skb)->tso_size = mss_now;
}
}
@@ -429,13 +428,14 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
* packet to the list. This won't be called frequently, I hope.
* Remember, these are still headerless SKBs at this point.
*/
-static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
+int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
- int nsize;
+ int nsize, old_factor;
u16 flags;
+ BUG_ON(len > skb->len);
nsize = skb_headlen(skb) - len;
if (nsize < 0)
nsize = 0;
@@ -460,9 +460,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
flags = TCP_SKB_CB(skb)->flags;
TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
TCP_SKB_CB(buff)->flags = flags;
- TCP_SKB_CB(buff)->sacked =
- (TCP_SKB_CB(skb)->sacked &
- (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL));
+ TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
@@ -484,30 +482,51 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
* skbs, which it never sent before. --ANK
*/
TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
- buff->stamp = skb->stamp;
+ buff->tstamp = skb->tstamp;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
- tp->lost_out -= tcp_skb_pcount(skb);
- tp->left_out -= tcp_skb_pcount(skb);
- }
+ old_factor = tcp_skb_pcount(skb);
/* Fix up tso_factor for both original and new SKB. */
- tcp_set_skb_tso_segs(sk, skb);
- tcp_set_skb_tso_segs(sk, buff);
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(sk, buff, mss_now);
- if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
- tp->lost_out += tcp_skb_pcount(skb);
- tp->left_out += tcp_skb_pcount(skb);
- }
+ /* If this packet has been sent out already, we must
+ * adjust the various packet counters.
+ */
+ if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
+ int diff = old_factor - tcp_skb_pcount(skb) -
+ tcp_skb_pcount(buff);
+
+ tp->packets_out -= diff;
+
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+ tp->sacked_out -= diff;
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
+ tp->retrans_out -= diff;
+
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
+ tp->lost_out -= diff;
+ tp->left_out -= diff;
+ }
+
+ if (diff > 0) {
+ /* Adjust Reno SACK estimate. */
+ if (!tp->rx_opt.sack_ok) {
+ tp->sacked_out -= diff;
+ if ((int)tp->sacked_out < 0)
+ tp->sacked_out = 0;
+ tcp_sync_left_out(tp);
+ }
- if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
- tp->lost_out += tcp_skb_pcount(buff);
- tp->left_out += tcp_skb_pcount(buff);
+ tp->fackets_out -= diff;
+ if ((int)tp->fackets_out < 0)
+ tp->fackets_out = 0;
+ }
}
/* Link BUFF into the send queue. */
skb_header_release(buff);
- __skb_append(skb, buff);
+ __skb_append(skb, buff, &sk->sk_write_queue);
return 0;
}
@@ -569,7 +588,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
* factor and mss.
*/
if (tcp_skb_pcount(skb) > 1)
- tcp_set_skb_tso_segs(sk, skb);
+ tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
return 0;
}
@@ -698,7 +717,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
if (tp->packets_out > tp->snd_cwnd_used)
tp->snd_cwnd_used = tp->packets_out;
- if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
+ if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
tcp_cwnd_application_limited(sk);
}
}
@@ -734,12 +753,14 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk
/* This must be invoked the first time we consider transmitting
* SKB onto the wire.
*/
-static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
+static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
- if (!tso_segs) {
- tcp_set_skb_tso_segs(sk, skb);
+ if (!tso_segs ||
+ (tso_segs > 1 &&
+ skb_shinfo(skb)->tso_size != mss_now)) {
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
tso_segs = tcp_skb_pcount(skb);
}
return tso_segs;
@@ -817,7 +838,7 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
struct tcp_sock *tp = tcp_sk(sk);
unsigned int cwnd_quota;
- tcp_init_tso_segs(sk, skb);
+ tcp_init_tso_segs(sk, skb, cur_mss);
if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
return 0;
@@ -854,14 +875,15 @@ int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
* know that all the data is in scatter-gather pages, and that the
* packet has never been sent out before (and thus is not cloned).
*/
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
{
struct sk_buff *buff;
int nlen = skb->len - len;
u16 flags;
/* All of a TSO frame must be composed of paged data. */
- BUG_ON(skb->len != skb->data_len);
+ if (skb->len != skb->data_len)
+ return tcp_fragment(sk, skb, len, mss_now);
buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
if (unlikely(buff == NULL))
@@ -887,12 +909,12 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
skb_split(skb, buff, len);
/* Fix up tso_factor for both original and new SKB. */
- tcp_set_skb_tso_segs(sk, skb);
- tcp_set_skb_tso_segs(sk, buff);
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(sk, buff, mss_now);
/* Link BUFF into the send queue. */
skb_header_release(buff);
- __skb_append(skb, buff);
+ __skb_append(skb, buff, &sk->sk_write_queue);
return 0;
}
@@ -904,12 +926,13 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
*/
static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
u32 send_win, cong_win, limit, in_flight;
if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
return 0;
- if (tp->ca_state != TCP_CA_Open)
+ if (icsk->icsk_ca_state != TCP_CA_Open)
return 0;
in_flight = tcp_packets_in_flight(tp);
@@ -924,10 +947,6 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_
limit = min(send_win, cong_win);
- /* If sk_send_head can be sent fully now, just do it. */
- if (skb->len <= limit)
- return 0;
-
if (sysctl_tcp_tso_win_divisor) {
u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
@@ -972,19 +991,20 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
if (unlikely(sk->sk_state == TCP_CLOSE))
return 0;
- skb = sk->sk_send_head;
- if (unlikely(!skb))
- return 0;
-
- tso_segs = tcp_init_tso_segs(sk, skb);
- cwnd_quota = tcp_cwnd_test(tp, skb);
- if (unlikely(!cwnd_quota))
- goto out;
-
sent_pkts = 0;
- while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
+ while ((skb = sk->sk_send_head)) {
+ unsigned int limit;
+
+ tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
BUG_ON(!tso_segs);
+ cwnd_quota = tcp_cwnd_test(tp, skb);
+ if (!cwnd_quota)
+ break;
+
+ if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+ break;
+
if (tso_segs == 1) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
(tcp_skb_is_last(sk, skb) ?
@@ -995,9 +1015,10 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
break;
}
+ limit = mss_now;
if (tso_segs > 1) {
- u32 limit = tcp_window_allows(tp, skb,
- mss_now, cwnd_quota);
+ limit = tcp_window_allows(tp, skb,
+ mss_now, cwnd_quota);
if (skb->len < limit) {
unsigned int trim = skb->len % mss_now;
@@ -1005,15 +1026,12 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
if (trim)
limit = skb->len - trim;
}
- if (skb->len > limit) {
- if (tso_fragment(sk, skb, limit))
- break;
- }
- } else if (unlikely(skb->len > mss_now)) {
- if (unlikely(tcp_fragment(sk, skb, mss_now)))
- break;
}
+ if (skb->len > limit &&
+ unlikely(tso_fragment(sk, skb, limit, mss_now)))
+ break;
+
TCP_SKB_CB(skb)->when = tcp_time_stamp;
if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
@@ -1026,27 +1044,12 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
tcp_minshall_update(tp, mss_now, skb);
sent_pkts++;
-
- /* Do not optimize this to use tso_segs. If we chopped up
- * the packet above, tso_segs will no longer be valid.
- */
- cwnd_quota -= tcp_skb_pcount(skb);
-
- BUG_ON(cwnd_quota < 0);
- if (!cwnd_quota)
- break;
-
- skb = sk->sk_send_head;
- if (!skb)
- break;
- tso_segs = tcp_init_tso_segs(sk, skb);
}
if (likely(sent_pkts)) {
tcp_cwnd_validate(sk, tp);
return 0;
}
-out:
return !tp->packets_out && sk->sk_send_head;
}
@@ -1076,15 +1079,18 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
BUG_ON(!skb || skb->len < mss_now);
- tso_segs = tcp_init_tso_segs(sk, skb);
+ tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
if (likely(cwnd_quota)) {
+ unsigned int limit;
+
BUG_ON(!tso_segs);
+ limit = mss_now;
if (tso_segs > 1) {
- u32 limit = tcp_window_allows(tp, skb,
- mss_now, cwnd_quota);
+ limit = tcp_window_allows(tp, skb,
+ mss_now, cwnd_quota);
if (skb->len < limit) {
unsigned int trim = skb->len % mss_now;
@@ -1092,15 +1098,12 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
if (trim)
limit = skb->len - trim;
}
- if (skb->len > limit) {
- if (unlikely(tso_fragment(sk, skb, limit)))
- return;
- }
- } else if (unlikely(skb->len > mss_now)) {
- if (unlikely(tcp_fragment(sk, skb, mss_now)))
- return;
}
+ if (skb->len > limit &&
+ unlikely(tso_fragment(sk, skb, limit, mss_now)))
+ return;
+
/* Send it out now. */
TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -1166,6 +1169,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
*/
u32 __tcp_select_window(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* MSS for the peer's data. Previous verions used mss_clamp
* here. I don't know if the value based on our guesses
@@ -1173,7 +1177,7 @@ u32 __tcp_select_window(struct sock *sk)
* but may be worse for the performance because of rcv_mss
* fluctuations. --SAW 1998/11/1
*/
- int mss = tp->ack.rcv_mss;
+ int mss = icsk->icsk_ack.rcv_mss;
int free_space = tcp_space(sk);
int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
int window;
@@ -1182,7 +1186,7 @@ u32 __tcp_select_window(struct sock *sk)
mss = full_space;
if (free_space < full_space/2) {
- tp->ack.quick = 0;
+ icsk->icsk_ack.quick = 0;
if (tcp_memory_pressure)
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
@@ -1257,7 +1261,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
tcp_skb_pcount(next_skb) != 1);
/* Ok. We will be able to collapse the packet. */
- __skb_unlink(next_skb, next_skb->list);
+ __skb_unlink(next_skb, &sk->sk_write_queue);
memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
@@ -1305,6 +1309,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
*/
void tcp_simple_retransmit(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
unsigned int mss = tcp_current_mss(sk, 0);
@@ -1335,12 +1340,12 @@ void tcp_simple_retransmit(struct sock *sk)
* in network, but units changed and effective
* cwnd/ssthresh really reduced now.
*/
- if (tp->ca_state != TCP_CA_Loss) {
+ if (icsk->icsk_ca_state != TCP_CA_Loss) {
tp->high_seq = tp->snd_nxt;
- tp->snd_ssthresh = tcp_current_ssthresh(tp);
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
tp->prior_ssthresh = 0;
tp->undo_marker = 0;
- tcp_set_ca_state(tp, TCP_CA_Loss);
+ tcp_set_ca_state(sk, TCP_CA_Loss);
}
tcp_xmit_retransmit_queue(sk);
}
@@ -1365,12 +1370,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
BUG();
-
- if (sk->sk_route_caps & NETIF_F_TSO) {
- sk->sk_route_caps &= ~NETIF_F_TSO;
- sock_set_flag(sk, SOCK_NO_LARGESEND);
- }
-
if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
return -ENOMEM;
}
@@ -1385,16 +1384,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
return -EAGAIN;
if (skb->len > cur_mss) {
- int old_factor = tcp_skb_pcount(skb);
- int new_factor;
-
- if (tcp_fragment(sk, skb, cur_mss))
+ if (tcp_fragment(sk, skb, cur_mss, cur_mss))
return -ENOMEM; /* We'll try again later. */
-
- /* New SKB created, account for it. */
- new_factor = tcp_skb_pcount(skb);
- tp->packets_out -= old_factor - new_factor;
- tp->packets_out += tcp_skb_pcount(skb->next);
}
/* Collapse two adjacent packets if worthwhile and we can. */
@@ -1474,6 +1465,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
*/
void tcp_xmit_retransmit_queue(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int packet_cnt = tp->lost_out;
@@ -1497,14 +1489,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
if (tcp_retransmit_skb(sk, skb))
return;
- if (tp->ca_state != TCP_CA_Loss)
+ if (icsk->icsk_ca_state != TCP_CA_Loss)
NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
else
NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
if (skb ==
skb_peek(&sk->sk_write_queue))
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX);
}
packet_cnt -= tcp_skb_pcount(skb);
@@ -1517,7 +1511,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
/* OK, demanded retransmission is finished. */
/* Forward retransmissions are possible only during Recovery. */
- if (tp->ca_state != TCP_CA_Recovery)
+ if (icsk->icsk_ca_state != TCP_CA_Recovery)
return;
/* No forward retransmissions in Reno are possible. */
@@ -1557,7 +1551,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
break;
if (skb == skb_peek(&sk->sk_write_queue))
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX);
NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
}
@@ -1586,7 +1582,7 @@ void tcp_send_fin(struct sock *sk)
} else {
/* Socket is locked, keep trying until memory is available. */
for (;;) {
- skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
if (skb)
break;
yield();
@@ -1613,7 +1609,7 @@ void tcp_send_fin(struct sock *sk)
* was unread data in the receive queue. This behavior is recommended
* by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
*/
-void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -1793,8 +1789,8 @@ static inline void tcp_connect_init(struct sock *sk)
tp->rcv_wup = 0;
tp->copied_seq = 0;
- tp->rto = TCP_TIMEOUT_INIT;
- tp->retransmits = 0;
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+ inet_csk(sk)->icsk_retransmits = 0;
tcp_clear_retrans(tp);
}
@@ -1808,7 +1804,7 @@ int tcp_connect(struct sock *sk)
tcp_connect_init(sk);
- buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
+ buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
if (unlikely(buff == NULL))
return -ENOBUFS;
@@ -1837,7 +1833,8 @@ int tcp_connect(struct sock *sk)
TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
return 0;
}
@@ -1847,20 +1844,21 @@ int tcp_connect(struct sock *sk)
*/
void tcp_send_delayed_ack(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- int ato = tp->ack.ato;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int ato = icsk->icsk_ack.ato;
unsigned long timeout;
if (ato > TCP_DELACK_MIN) {
+ const struct tcp_sock *tp = tcp_sk(sk);
int max_ato = HZ/2;
- if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
+ if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
max_ato = TCP_DELACK_MAX;
/* Slow path, intersegment interval is "high". */
/* If some rtt estimate is known, use it to bound delayed ack.
- * Do not use tp->rto here, use results of rtt measurements
+ * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
* directly.
*/
if (tp->srtt) {
@@ -1877,21 +1875,22 @@ void tcp_send_delayed_ack(struct sock *sk)
timeout = jiffies + ato;
/* Use new timeout only if there wasn't a older one earlier. */
- if (tp->ack.pending&TCP_ACK_TIMER) {
+ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
/* If delack timer was blocked or is about to expire,
* send ACK now.
*/
- if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
+ if (icsk->icsk_ack.blocked ||
+ time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
tcp_send_ack(sk);
return;
}
- if (!time_before(timeout, tp->ack.timeout))
- timeout = tp->ack.timeout;
+ if (!time_before(timeout, icsk->icsk_ack.timeout))
+ timeout = icsk->icsk_ack.timeout;
}
- tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
- tp->ack.timeout = timeout;
- sk_reset_timer(sk, &tp->delack_timer, timeout);
+ icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+ icsk->icsk_ack.timeout = timeout;
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
/* This routine sends an ack and also updates the window. */
@@ -1908,9 +1907,10 @@ void tcp_send_ack(struct sock *sk)
*/
buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
if (buff == NULL) {
- tcp_schedule_ack(tp);
- tp->ack.ato = TCP_ATO_MIN;
- tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+ inet_csk_schedule_ack(sk);
+ inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX, TCP_RTO_MAX);
return;
}
@@ -1991,16 +1991,10 @@ int tcp_write_wakeup(struct sock *sk)
skb->len > mss) {
seg_size = min(seg_size, mss);
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
- if (tcp_fragment(sk, skb, seg_size))
+ if (tcp_fragment(sk, skb, seg_size, mss))
return -1;
- /* SWS override triggered forced fragmentation.
- * Disable TSO, the connection is too sick. */
- if (sk->sk_route_caps & NETIF_F_TSO) {
- sock_set_flag(sk, SOCK_NO_LARGESEND);
- sk->sk_route_caps &= ~NETIF_F_TSO;
- }
} else if (!tcp_skb_pcount(skb))
- tcp_set_skb_tso_segs(sk, skb);
+ tcp_set_skb_tso_segs(sk, skb, mss);
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -2024,6 +2018,7 @@ int tcp_write_wakeup(struct sock *sk)
*/
void tcp_send_probe0(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int err;
@@ -2031,28 +2026,31 @@ void tcp_send_probe0(struct sock *sk)
if (tp->packets_out || !sk->sk_send_head) {
/* Cancel probe timer, if it is not required. */
- tp->probes_out = 0;
- tp->backoff = 0;
+ icsk->icsk_probes_out = 0;
+ icsk->icsk_backoff = 0;
return;
}
if (err <= 0) {
- if (tp->backoff < sysctl_tcp_retries2)
- tp->backoff++;
- tp->probes_out++;
- tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
- min(tp->rto << tp->backoff, TCP_RTO_MAX));
+ if (icsk->icsk_backoff < sysctl_tcp_retries2)
+ icsk->icsk_backoff++;
+ icsk->icsk_probes_out++;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+ TCP_RTO_MAX);
} else {
/* If packet was not sent due to local congestion,
- * do not backoff and do not remember probes_out.
+ * do not backoff and do not remember icsk_probes_out.
* Let local senders to fight for local resources.
*
* Use accumulated backoff yet.
*/
- if (!tp->probes_out)
- tp->probes_out=1;
- tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
- min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
+ if (!icsk->icsk_probes_out)
+ icsk->icsk_probes_out = 1;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ min(icsk->icsk_rto << icsk->icsk_backoff,
+ TCP_RESOURCE_PROBE_INTERVAL),
+ TCP_RTO_MAX);
}
}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 70e108e15c7..327770bf552 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -16,9 +16,10 @@
#define TCP_SCALABLE_AI_CNT 50U
#define TCP_SCALABLE_MD_SCALE 3
-static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
u32 in_flight, int flag)
{
+ struct tcp_sock *tp = tcp_sk(sk);
if (in_flight < tp->snd_cwnd)
return;
@@ -35,8 +36,9 @@ static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
tp->snd_cwnd_stamp = tcp_time_stamp;
}
-static u32 tcp_scalable_ssthresh(struct tcp_sock *tp)
+static u32 tcp_scalable_ssthresh(struct sock *sk)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0084227438c..415ee47ac1c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -36,49 +36,13 @@ static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_keepalive_timer (unsigned long data);
-#ifdef TCP_DEBUG
-const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
-EXPORT_SYMBOL(tcp_timer_bug_msg);
-#endif
-
-/*
- * Using different timers for retransmit, delayed acks and probes
- * We may wish use just one timer maintaining a list of expire jiffies
- * to optimize.
- */
-
void tcp_init_xmit_timers(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- init_timer(&tp->retransmit_timer);
- tp->retransmit_timer.function=&tcp_write_timer;
- tp->retransmit_timer.data = (unsigned long) sk;
- tp->pending = 0;
-
- init_timer(&tp->delack_timer);
- tp->delack_timer.function=&tcp_delack_timer;
- tp->delack_timer.data = (unsigned long) sk;
- tp->ack.pending = 0;
-
- init_timer(&sk->sk_timer);
- sk->sk_timer.function = &tcp_keepalive_timer;
- sk->sk_timer.data = (unsigned long)sk;
+ inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
+ &tcp_keepalive_timer);
}
-void tcp_clear_xmit_timers(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tp->pending = 0;
- sk_stop_timer(sk, &tp->retransmit_timer);
-
- tp->ack.pending = 0;
- tp->ack.blocked = 0;
- sk_stop_timer(sk, &tp->delack_timer);
-
- sk_stop_timer(sk, &sk->sk_timer);
-}
+EXPORT_SYMBOL(tcp_init_xmit_timers);
static void tcp_write_err(struct sock *sk)
{
@@ -155,15 +119,15 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
int retry_until;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
- if (tp->retransmits)
+ if (icsk->icsk_retransmits)
dst_negative_advice(&sk->sk_dst_cache);
- retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
+ retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
} else {
- if (tp->retransmits >= sysctl_tcp_retries1) {
+ if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
hole detection. :-(
@@ -189,16 +153,16 @@ static int tcp_write_timeout(struct sock *sk)
retry_until = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- int alive = (tp->rto < TCP_RTO_MAX);
+ const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
retry_until = tcp_orphan_retries(sk, alive);
- if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
+ if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
return 1;
}
}
- if (tp->retransmits >= retry_until) {
+ if (icsk->icsk_retransmits >= retry_until) {
/* Has it gone just too far? */
tcp_write_err(sk);
return 1;
@@ -210,26 +174,27 @@ static void tcp_delack_timer(unsigned long data)
{
struct sock *sk = (struct sock*)data;
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
- tp->ack.blocked = 1;
+ icsk->icsk_ack.blocked = 1;
NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
- sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
goto out_unlock;
}
sk_stream_mem_reclaim(sk);
- if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))
+ if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
goto out;
- if (time_after(tp->ack.timeout, jiffies)) {
- sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
+ if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
goto out;
}
- tp->ack.pending &= ~TCP_ACK_TIMER;
+ icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
if (!skb_queue_empty(&tp->ucopy.prequeue)) {
struct sk_buff *skb;
@@ -242,16 +207,16 @@ static void tcp_delack_timer(unsigned long data)
tp->ucopy.memory = 0;
}
- if (tcp_ack_scheduled(tp)) {
- if (!tp->ack.pingpong) {
+ if (inet_csk_ack_scheduled(sk)) {
+ if (!icsk->icsk_ack.pingpong) {
/* Delayed ACK missed: inflate ATO. */
- tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
+ icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
} else {
/* Delayed ACK missed: leave pingpong mode and
* deflate ATO.
*/
- tp->ack.pingpong = 0;
- tp->ack.ato = TCP_ATO_MIN;
+ icsk->icsk_ack.pingpong = 0;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
}
tcp_send_ack(sk);
NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
@@ -268,11 +233,12 @@ out_unlock:
static void tcp_probe_timer(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int max_probes;
if (tp->packets_out || !sk->sk_send_head) {
- tp->probes_out = 0;
+ icsk->icsk_probes_out = 0;
return;
}
@@ -283,7 +249,7 @@ static void tcp_probe_timer(struct sock *sk)
* FIXME: We ought not to do it, Solaris 2.5 actually has fixing
* this behaviour in Solaris down as a bug fix. [AC]
*
- * Let me to explain. probes_out is zeroed by incoming ACKs
+ * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
* even if they advertise zero window. Hence, connection is killed only
* if we received no ACKs for normal connection timeout. It is not killed
* only because window stays zero for some time, window may be zero
@@ -294,15 +260,15 @@ static void tcp_probe_timer(struct sock *sk)
max_probes = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
+ const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
max_probes = tcp_orphan_retries(sk, alive);
- if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
+ if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
return;
}
- if (tp->probes_out > max_probes) {
+ if (icsk->icsk_probes_out > max_probes) {
tcp_write_err(sk);
} else {
/* Only send another probe if we didn't close things up. */
@@ -317,6 +283,7 @@ static void tcp_probe_timer(struct sock *sk)
static void tcp_retransmit_timer(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
if (!tp->packets_out)
goto out;
@@ -351,20 +318,21 @@ static void tcp_retransmit_timer(struct sock *sk)
if (tcp_write_timeout(sk))
goto out;
- if (tp->retransmits == 0) {
- if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
+ if (icsk->icsk_retransmits == 0) {
+ if (icsk->icsk_ca_state == TCP_CA_Disorder ||
+ icsk->icsk_ca_state == TCP_CA_Recovery) {
if (tp->rx_opt.sack_ok) {
- if (tp->ca_state == TCP_CA_Recovery)
+ if (icsk->icsk_ca_state == TCP_CA_Recovery)
NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
else
NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
} else {
- if (tp->ca_state == TCP_CA_Recovery)
+ if (icsk->icsk_ca_state == TCP_CA_Recovery)
NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
else
NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
}
- } else if (tp->ca_state == TCP_CA_Loss) {
+ } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
} else {
NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
@@ -381,10 +349,11 @@ static void tcp_retransmit_timer(struct sock *sk)
/* Retransmission failed because of local congestion,
* do not backoff.
*/
- if (!tp->retransmits)
- tp->retransmits=1;
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
- min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
+ if (!icsk->icsk_retransmits)
+ icsk->icsk_retransmits = 1;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
+ TCP_RTO_MAX);
goto out;
}
@@ -403,13 +372,13 @@ static void tcp_retransmit_timer(struct sock *sk)
* implemented ftp to mars will work nicely. We will have to fix
* the 120 second clamps though!
*/
- tp->backoff++;
- tp->retransmits++;
+ icsk->icsk_backoff++;
+ icsk->icsk_retransmits++;
out_reset_timer:
- tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
- if (tp->retransmits > sysctl_tcp_retries1)
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+ if (icsk->icsk_retransmits > sysctl_tcp_retries1)
__sk_dst_reset(sk);
out:;
@@ -418,32 +387,32 @@ out:;
static void tcp_write_timer(unsigned long data)
{
struct sock *sk = (struct sock*)data;
- struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
int event;
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later */
- sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
goto out_unlock;
}
- if (sk->sk_state == TCP_CLOSE || !tp->pending)
+ if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
goto out;
- if (time_after(tp->timeout, jiffies)) {
- sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
+ if (time_after(icsk->icsk_timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
goto out;
}
- event = tp->pending;
- tp->pending = 0;
+ event = icsk->icsk_pending;
+ icsk->icsk_pending = 0;
switch (event) {
- case TCP_TIME_RETRANS:
+ case ICSK_TIME_RETRANS:
tcp_retransmit_timer(sk);
break;
- case TCP_TIME_PROBE0:
+ case ICSK_TIME_PROBE0:
tcp_probe_timer(sk);
break;
}
@@ -462,96 +431,8 @@ out_unlock:
static void tcp_synack_timer(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct listen_sock *lopt = tp->accept_queue.listen_opt;
- int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
- int thresh = max_retries;
- unsigned long now = jiffies;
- struct request_sock **reqp, *req;
- int i, budget;
-
- if (lopt == NULL || lopt->qlen == 0)
- return;
-
- /* Normally all the openreqs are young and become mature
- * (i.e. converted to established socket) for first timeout.
- * If synack was not acknowledged for 3 seconds, it means
- * one of the following things: synack was lost, ack was lost,
- * rtt is high or nobody planned to ack (i.e. synflood).
- * When server is a bit loaded, queue is populated with old
- * open requests, reducing effective size of queue.
- * When server is well loaded, queue size reduces to zero
- * after several minutes of work. It is not synflood,
- * it is normal operation. The solution is pruning
- * too old entries overriding normal timeout, when
- * situation becomes dangerous.
- *
- * Essentially, we reserve half of room for young
- * embrions; and abort old ones without pity, if old
- * ones are about to clog our table.
- */
- if (lopt->qlen>>(lopt->max_qlen_log-1)) {
- int young = (lopt->qlen_young<<1);
-
- while (thresh > 2) {
- if (lopt->qlen < young)
- break;
- thresh--;
- young <<= 1;
- }
- }
-
- if (tp->defer_accept)
- max_retries = tp->defer_accept;
-
- budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
- i = lopt->clock_hand;
-
- do {
- reqp=&lopt->syn_table[i];
- while ((req = *reqp) != NULL) {
- if (time_after_eq(now, req->expires)) {
- if ((req->retrans < thresh ||
- (inet_rsk(req)->acked && req->retrans < max_retries))
- && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) {
- unsigned long timeo;
-
- if (req->retrans++ == 0)
- lopt->qlen_young--;
- timeo = min((TCP_TIMEOUT_INIT << req->retrans),
- TCP_RTO_MAX);
- req->expires = now + timeo;
- reqp = &req->dl_next;
- continue;
- }
-
- /* Drop this request */
- tcp_synq_unlink(tp, req, reqp);
- reqsk_queue_removed(&tp->accept_queue, req);
- reqsk_free(req);
- continue;
- }
- reqp = &req->dl_next;
- }
-
- i = (i+1)&(TCP_SYNQ_HSIZE-1);
-
- } while (--budget > 0);
-
- lopt->clock_hand = i;
-
- if (lopt->qlen)
- tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
-}
-
-void tcp_delete_keepalive_timer (struct sock *sk)
-{
- sk_stop_timer(sk, &sk->sk_timer);
-}
-
-void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
-{
- sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+ inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
+ TCP_TIMEOUT_INIT, TCP_RTO_MAX);
}
void tcp_set_keepalive(struct sock *sk, int val)
@@ -560,15 +441,16 @@ void tcp_set_keepalive(struct sock *sk, int val)
return;
if (val && !sock_flag(sk, SOCK_KEEPOPEN))
- tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
else if (!val)
- tcp_delete_keepalive_timer(sk);
+ inet_csk_delete_keepalive_timer(sk);
}
static void tcp_keepalive_timer (unsigned long data)
{
struct sock *sk = (struct sock *) data;
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u32 elapsed;
@@ -576,7 +458,7 @@ static void tcp_keepalive_timer (unsigned long data)
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
- tcp_reset_keepalive_timer (sk, HZ/20);
+ inet_csk_reset_keepalive_timer (sk, HZ/20);
goto out;
}
@@ -587,7 +469,7 @@ static void tcp_keepalive_timer (unsigned long data)
if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
if (tp->linger2 >= 0) {
- int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
+ const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
if (tmo > 0) {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
@@ -610,14 +492,14 @@ static void tcp_keepalive_timer (unsigned long data)
elapsed = tcp_time_stamp - tp->rcv_tstamp;
if (elapsed >= keepalive_time_when(tp)) {
- if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
- (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
+ if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) ||
+ (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) {
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_write_err(sk);
goto out;
}
if (tcp_write_wakeup(sk) <= 0) {
- tp->probes_out++;
+ icsk->icsk_probes_out++;
elapsed = keepalive_intvl_when(tp);
} else {
/* If keepalive was lost due to local congestion,
@@ -634,7 +516,7 @@ static void tcp_keepalive_timer (unsigned long data)
sk_stream_mem_reclaim(sk);
resched:
- tcp_reset_keepalive_timer (sk, elapsed);
+ inet_csk_reset_keepalive_timer (sk, elapsed);
goto out;
death:
@@ -644,8 +526,3 @@ out:
bh_unlock_sock(sk);
sock_put(sk);
}
-
-EXPORT_SYMBOL(tcp_clear_xmit_timers);
-EXPORT_SYMBOL(tcp_delete_keepalive_timer);
-EXPORT_SYMBOL(tcp_init_xmit_timers);
-EXPORT_SYMBOL(tcp_reset_keepalive_timer);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 9bd443db519..93c5f92070f 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -35,7 +35,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/skbuff.h>
-#include <linux/tcp_diag.h>
+#include <linux/inet_diag.h>
#include <net/tcp.h>
@@ -82,9 +82,10 @@ struct vegas {
* Instead we must wait until the completion of an RTT during
* which we actually receive ACKs.
*/
-static inline void vegas_enable(struct tcp_sock *tp)
+static inline void vegas_enable(struct sock *sk)
{
- struct vegas *vegas = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct vegas *vegas = inet_csk_ca(sk);
/* Begin taking Vegas samples next time we send something. */
vegas->doing_vegas_now = 1;
@@ -97,19 +98,19 @@ static inline void vegas_enable(struct tcp_sock *tp)
}
/* Stop taking Vegas samples for now. */
-static inline void vegas_disable(struct tcp_sock *tp)
+static inline void vegas_disable(struct sock *sk)
{
- struct vegas *vegas = tcp_ca(tp);
+ struct vegas *vegas = inet_csk_ca(sk);
vegas->doing_vegas_now = 0;
}
-static void tcp_vegas_init(struct tcp_sock *tp)
+static void tcp_vegas_init(struct sock *sk)
{
- struct vegas *vegas = tcp_ca(tp);
+ struct vegas *vegas = inet_csk_ca(sk);
vegas->baseRTT = 0x7fffffff;
- vegas_enable(tp);
+ vegas_enable(sk);
}
/* Do RTT sampling needed for Vegas.
@@ -120,9 +121,9 @@ static void tcp_vegas_init(struct tcp_sock *tp)
* o min-filter RTT samples from a much longer window (forever for now)
* to find the propagation delay (baseRTT)
*/
-static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
+static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt)
{
- struct vegas *vegas = tcp_ca(tp);
+ struct vegas *vegas = inet_csk_ca(sk);
u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
/* Filter to find propagation delay: */
@@ -136,13 +137,13 @@ static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
vegas->cntRTT++;
}
-static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
+static void tcp_vegas_state(struct sock *sk, u8 ca_state)
{
if (ca_state == TCP_CA_Open)
- vegas_enable(tp);
+ vegas_enable(sk);
else
- vegas_disable(tp);
+ vegas_disable(sk);
}
/*
@@ -154,20 +155,21 @@ static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
* packets, _then_ we can make Vegas calculations
* again.
*/
-static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
+static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
if (event == CA_EVENT_CWND_RESTART ||
event == CA_EVENT_TX_START)
- tcp_vegas_init(tp);
+ tcp_vegas_init(sk);
}
-static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
u32 seq_rtt, u32 in_flight, int flag)
{
- struct vegas *vegas = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct vegas *vegas = inet_csk_ca(sk);
if (!vegas->doing_vegas_now)
- return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
+ return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
/* The key players are v_beg_snd_una and v_beg_snd_nxt.
*
@@ -219,7 +221,7 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
* but that's not too awful, since we're taking the min,
* rather than averaging.
*/
- tcp_vegas_rtt_calc(tp, seq_rtt*1000);
+ tcp_vegas_rtt_calc(sk, seq_rtt * 1000);
/* We do the Vegas calculations only if we got enough RTT
* samples that we can be reasonably sure that we got
@@ -359,14 +361,14 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
}
/* Extract info for Tcp socket info provided via netlink. */
-static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
+static void tcp_vegas_get_info(struct sock *sk, u32 ext,
struct sk_buff *skb)
{
- const struct vegas *ca = tcp_ca(tp);
- if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
+ const struct vegas *ca = inet_csk_ca(sk);
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
struct tcpvegas_info *info;
- info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
+ info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
sizeof(*info)));
info->tcpv_enabled = ca->doing_vegas_now;
@@ -393,7 +395,7 @@ static struct tcp_congestion_ops tcp_vegas = {
static int __init tcp_vegas_register(void)
{
- BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
+ BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE);
tcp_register_congestion_control(&tcp_vegas);
return 0;
}
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index ef827242c94..0c340c3756c 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -8,7 +8,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/skbuff.h>
-#include <linux/tcp_diag.h>
+#include <linux/inet_diag.h>
#include <net/tcp.h>
/* TCP Westwood structure */
@@ -40,9 +40,9 @@ struct westwood {
* way as soon as possible. It will reasonably happen within the first
* RTT period of the connection lifetime.
*/
-static void tcp_westwood_init(struct tcp_sock *tp)
+static void tcp_westwood_init(struct sock *sk)
{
- struct westwood *w = tcp_ca(tp);
+ struct westwood *w = inet_csk_ca(sk);
w->bk = 0;
w->bw_ns_est = 0;
@@ -51,7 +51,7 @@ static void tcp_westwood_init(struct tcp_sock *tp)
w->cumul_ack = 0;
w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
w->rtt_win_sx = tcp_time_stamp;
- w->snd_una = tp->snd_una;
+ w->snd_una = tcp_sk(sk)->snd_una;
}
/*
@@ -74,11 +74,11 @@ static inline void westwood_filter(struct westwood *w, u32 delta)
* Called after processing group of packets.
* but all westwood needs is the last sample of srtt.
*/
-static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
+static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt)
{
- struct westwood *w = tcp_ca(tp);
+ struct westwood *w = inet_csk_ca(sk);
if (cnt > 0)
- w->rtt = tp->srtt >> 3;
+ w->rtt = tcp_sk(sk)->srtt >> 3;
}
/*
@@ -86,9 +86,9 @@ static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
* It updates RTT evaluation window if it is the right moment to do
* it. If so it calls filter for evaluating bandwidth.
*/
-static void westwood_update_window(struct tcp_sock *tp)
+static void westwood_update_window(struct sock *sk)
{
- struct westwood *w = tcp_ca(tp);
+ struct westwood *w = inet_csk_ca(sk);
s32 delta = tcp_time_stamp - w->rtt_win_sx;
/*
@@ -114,11 +114,12 @@ static void westwood_update_window(struct tcp_sock *tp)
* header prediction is successful. In such case in fact update is
* straight forward and doesn't need any particular care.
*/
-static inline void westwood_fast_bw(struct tcp_sock *tp)
+static inline void westwood_fast_bw(struct sock *sk)
{
- struct westwood *w = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct westwood *w = inet_csk_ca(sk);
- westwood_update_window(tp);
+ westwood_update_window(sk);
w->bk += tp->snd_una - w->snd_una;
w->snd_una = tp->snd_una;
@@ -130,9 +131,10 @@ static inline void westwood_fast_bw(struct tcp_sock *tp)
* This function evaluates cumul_ack for evaluating bk in case of
* delayed or partial acks.
*/
-static inline u32 westwood_acked_count(struct tcp_sock *tp)
+static inline u32 westwood_acked_count(struct sock *sk)
{
- struct westwood *w = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct westwood *w = inet_csk_ca(sk);
w->cumul_ack = tp->snd_una - w->snd_una;
@@ -160,9 +162,10 @@ static inline u32 westwood_acked_count(struct tcp_sock *tp)
return w->cumul_ack;
}
-static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
+static inline u32 westwood_bw_rttmin(const struct sock *sk)
{
- struct westwood *w = tcp_ca(tp);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct westwood *w = inet_csk_ca(sk);
return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
}
@@ -172,31 +175,32 @@ static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
* in packets we use mss_cache). Rttmin is guaranteed to be >= 2
* so avoids ever returning 0.
*/
-static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp)
+static u32 tcp_westwood_cwnd_min(struct sock *sk)
{
- return westwood_bw_rttmin(tp);
+ return westwood_bw_rttmin(sk);
}
-static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
+static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
{
- struct westwood *w = tcp_ca(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct westwood *w = inet_csk_ca(sk);
switch(event) {
case CA_EVENT_FAST_ACK:
- westwood_fast_bw(tp);
+ westwood_fast_bw(sk);
break;
case CA_EVENT_COMPLETE_CWR:
- tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp);
+ tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk);
break;
case CA_EVENT_FRTO:
- tp->snd_ssthresh = westwood_bw_rttmin(tp);
+ tp->snd_ssthresh = westwood_bw_rttmin(sk);
break;
case CA_EVENT_SLOW_ACK:
- westwood_update_window(tp);
- w->bk += westwood_acked_count(tp);
+ westwood_update_window(sk);
+ w->bk += westwood_acked_count(sk);
w->rtt_min = min(w->rtt, w->rtt_min);
break;
@@ -208,15 +212,15 @@ static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
/* Extract info for Tcp socket info provided via netlink. */
-static void tcp_westwood_info(struct tcp_sock *tp, u32 ext,
+static void tcp_westwood_info(struct sock *sk, u32 ext,
struct sk_buff *skb)
{
- const struct westwood *ca = tcp_ca(tp);
- if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
+ const struct westwood *ca = inet_csk_ca(sk);
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
struct rtattr *rta;
struct tcpvegas_info *info;
- rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info));
+ rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info));
info = RTA_DATA(rta);
info->tcpv_enabled = 1;
info->tcpv_rttcnt = 0;
@@ -242,7 +246,7 @@ static struct tcp_congestion_ops tcp_westwood = {
static int __init tcp_westwood_register(void)
{
- BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE);
+ BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE);
return tcp_register_congestion_control(&tcp_westwood);
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7c24e64b443..e0bd1013cb0 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -95,7 +95,8 @@
#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <net/snmp.h>
-#include <net/tcp.h>
+#include <net/ip.h>
+#include <net/tcp_states.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
@@ -112,7 +113,7 @@
* Snmp MIB for the UDP layer
*/
-DEFINE_SNMP_STAT(struct udp_mib, udp_statistics);
+DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly;
struct hlist_head udp_hash[UDP_HTABLE_SIZE];
DEFINE_RWLOCK(udp_hash_lock);
@@ -628,7 +629,7 @@ back_from_confirm:
/* ... which is an evident application bug. --ANK */
release_sock(sk);
- NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
err = -EINVAL;
goto out;
}
@@ -693,7 +694,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset,
if (unlikely(!up->pending)) {
release_sock(sk);
- NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
return -EINVAL;
}
@@ -1102,7 +1103,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
return 0;
- NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp v4 hw csum failure.\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n");
skb->ip_summed = CHECKSUM_NONE;
}
if (skb->ip_summed != CHECKSUM_UNNECESSARY)
@@ -1140,7 +1141,7 @@ int udp_rcv(struct sk_buff *skb)
if (ulen > len || ulen < sizeof(*uh))
goto short_packet;
- if (pskb_trim(skb, ulen))
+ if (pskb_trim_rcsum(skb, ulen))
goto short_packet;
if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0)
@@ -1181,14 +1182,13 @@ int udp_rcv(struct sk_buff *skb)
return(0);
short_packet:
- NETDEBUG(if (net_ratelimit())
- printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
- NIPQUAD(saddr),
- ntohs(uh->source),
- ulen,
- len,
- NIPQUAD(daddr),
- ntohs(uh->dest)));
+ LIMIT_NETDEBUG(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
+ NIPQUAD(saddr),
+ ntohs(uh->source),
+ ulen,
+ len,
+ NIPQUAD(daddr),
+ ntohs(uh->dest));
no_header:
UDP_INC_STATS_BH(UDP_MIB_INERRORS);
kfree_skb(skb);
@@ -1199,13 +1199,12 @@ csum_error:
* RFC1122: OK. Discards the bad packet silently (as far as
* the network is concerned, anyway) as per 4.1.3.4 (MUST).
*/
- NETDEBUG(if (net_ratelimit())
- printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
- NIPQUAD(saddr),
- ntohs(uh->source),
- NIPQUAD(daddr),
- ntohs(uh->dest),
- ulen));
+ LIMIT_NETDEBUG(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
+ NIPQUAD(saddr),
+ ntohs(uh->source),
+ NIPQUAD(daddr),
+ ntohs(uh->dest),
+ ulen);
drop:
UDP_INC_STATS_BH(UDP_MIB_INERRORS);
kfree_skb(skb);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 050611d7a96..d23e07fc81f 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -128,8 +128,10 @@ void __init xfrm4_state_init(void)
xfrm_state_register_afinfo(&xfrm4_state_afinfo);
}
+#if 0
void __exit xfrm4_state_fini(void)
{
xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
}
+#endif /* 0 */
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index b39e0494059..6460eec834b 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -8,7 +8,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \
protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \
- ip6_flowlabel.o ipv6_syms.o
+ ip6_flowlabel.o ipv6_syms.o netfilter.o
ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
xfrm6_output.o
@@ -23,3 +23,5 @@ obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
obj-y += exthdrs_core.o
+
+obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 77004b9456c..2c5f57299d6 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -75,7 +75,7 @@
#ifdef CONFIG_IPV6_PRIVACY
#include <linux/random.h>
#include <linux/crypto.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
#endif
#include <asm/uaccess.h>
@@ -123,8 +123,7 @@ DEFINE_RWLOCK(addrconf_lock);
static void addrconf_verify(unsigned long);
-static struct timer_list addr_chk_timer =
- TIMER_INITIALIZER(addrconf_verify, 0, 0);
+static DEFINE_TIMER(addr_chk_timer, addrconf_verify, 0, 0);
static DEFINE_SPINLOCK(addrconf_verify_lock);
static void addrconf_join_anycast(struct inet6_ifaddr *ifp);
@@ -1041,9 +1040,9 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr;
const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2);
u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr;
- u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
+ u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
int sk_ipv6only = ipv6_only_sock(sk);
- int sk2_ipv6only = tcp_v6_ipv6only(sk2);
+ int sk2_ipv6only = inet_v6_ipv6only(sk2);
int addr_type = ipv6_addr_type(sk_rcv_saddr6);
int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
@@ -1126,7 +1125,7 @@ void addrconf_leave_solict(struct inet6_dev *idev, struct in6_addr *addr)
__ipv6_dev_mc_dec(idev, &maddr);
}
-void addrconf_join_anycast(struct inet6_ifaddr *ifp)
+static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
{
struct in6_addr addr;
ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
@@ -1135,7 +1134,7 @@ void addrconf_join_anycast(struct inet6_ifaddr *ifp)
ipv6_dev_ac_inc(ifp->idev->dev, &addr);
}
-void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
+static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
{
struct in6_addr addr;
ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
@@ -1218,12 +1217,8 @@ static int __ipv6_regen_rndid(struct inet6_dev *idev)
struct net_device *dev;
struct scatterlist sg[2];
- sg[0].page = virt_to_page(idev->entropy);
- sg[0].offset = offset_in_page(idev->entropy);
- sg[0].length = 8;
- sg[1].page = virt_to_page(idev->work_eui64);
- sg[1].offset = offset_in_page(idev->work_eui64);
- sg[1].length = 8;
+ sg_set_buf(&sg[0], idev->entropy, 8);
+ sg_set_buf(&sg[1], idev->work_eui64, 8);
dev = idev->dev;
@@ -1807,7 +1802,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
}
for (dev = dev_base; dev != NULL; dev = dev->next) {
- struct in_device * in_dev = __in_dev_get(dev);
+ struct in_device * in_dev = __in_dev_get_rtnl(dev);
if (in_dev && (dev->flags & IFF_UP)) {
struct in_ifaddr * ifa;
@@ -2168,7 +2163,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
/* Step 5: netlink notification of this interface */
idev->tstamp = jiffies;
- inet6_ifinfo_notify(RTM_NEWLINK, idev);
+ inet6_ifinfo_notify(RTM_DELLINK, idev);
/* Shot the device (if unregistered) */
@@ -2858,16 +2853,16 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
skb = alloc_skb(size, GFP_ATOMIC);
if (!skb) {
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, ENOBUFS);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFADDR, ENOBUFS);
return;
}
if (inet6_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
kfree_skb(skb);
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, EINVAL);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFADDR, EINVAL);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFADDR;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFADDR, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_IFADDR;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFADDR, GFP_ATOMIC);
}
static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
@@ -2994,16 +2989,16 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
skb = alloc_skb(size, GFP_ATOMIC);
if (!skb) {
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, ENOBUFS);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFINFO, ENOBUFS);
return;
}
if (inet6_fill_ifinfo(skb, idev, current->pid, 0, event, 0) < 0) {
kfree_skb(skb);
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, EINVAL);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFINFO, EINVAL);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFINFO;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFINFO, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_IFINFO;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFINFO, GFP_ATOMIC);
}
static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
@@ -3054,16 +3049,16 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
skb = alloc_skb(size, GFP_ATOMIC);
if (!skb) {
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, ENOBUFS);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_PREFIX, ENOBUFS);
return;
}
if (inet6_fill_prefix(skb, idev, pinfo, current->pid, 0, event, 0) < 0) {
kfree_skb(skb);
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, EINVAL);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_PREFIX, EINVAL);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_PREFIX;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_PREFIX, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_PREFIX;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_PREFIX, GFP_ATOMIC);
}
static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES] = {
@@ -3521,6 +3516,8 @@ int __init addrconf_init(void)
if (err)
return err;
+ ip6_null_entry.rt6i_idev = in6_dev_get(&loopback_dev);
+
register_netdevice_notifier(&ipv6_dev_notf);
#ifdef CONFIG_IPV6_PRIVACY
@@ -3593,10 +3590,8 @@ void __exit addrconf_cleanup(void)
rtnl_unlock();
#ifdef CONFIG_IPV6_PRIVACY
- if (likely(md5_tfm != NULL)) {
- crypto_free_tfm(md5_tfm);
- md5_tfm = NULL;
- }
+ crypto_free_tfm(md5_tfm);
+ md5_tfm = NULL;
#endif
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 28d9bcab097..4f8795af2ed 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -44,6 +44,7 @@
#include <linux/netdevice.h>
#include <linux/icmpv6.h>
#include <linux/smp_lock.h>
+#include <linux/netfilter_ipv6.h>
#include <net/ip.h>
#include <net/ipv6.h>
@@ -66,45 +67,14 @@ MODULE_AUTHOR("Cast of dozens");
MODULE_DESCRIPTION("IPv6 protocol stack for Linux");
MODULE_LICENSE("GPL");
-/* IPv6 procfs goodies... */
-
-#ifdef CONFIG_PROC_FS
-extern int raw6_proc_init(void);
-extern void raw6_proc_exit(void);
-extern int tcp6_proc_init(void);
-extern void tcp6_proc_exit(void);
-extern int udp6_proc_init(void);
-extern void udp6_proc_exit(void);
-extern int ipv6_misc_proc_init(void);
-extern void ipv6_misc_proc_exit(void);
-extern int ac6_proc_init(void);
-extern void ac6_proc_exit(void);
-extern int if6_proc_init(void);
-extern void if6_proc_exit(void);
-#endif
-
int sysctl_ipv6_bindv6only;
-#ifdef INET_REFCNT_DEBUG
-atomic_t inet6_sock_nr;
-EXPORT_SYMBOL(inet6_sock_nr);
-#endif
-
/* The inetsw table contains everything that inet_create needs to
* build a new socket.
*/
static struct list_head inetsw6[SOCK_MAX];
static DEFINE_SPINLOCK(inetsw6_lock);
-static void inet6_sock_destruct(struct sock *sk)
-{
- inet_sock_destruct(sk);
-
-#ifdef INET_REFCNT_DEBUG
- atomic_dec(&inet6_sock_nr);
-#endif
-}
-
static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
{
const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo);
@@ -185,7 +155,7 @@ static int inet6_create(struct socket *sock, int protocol)
inet->hdrincl = 1;
}
- sk->sk_destruct = inet6_sock_destruct;
+ sk->sk_destruct = inet_sock_destruct;
sk->sk_family = PF_INET6;
sk->sk_protocol = protocol;
@@ -212,12 +182,17 @@ static int inet6_create(struct socket *sock, int protocol)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
+ /*
+ * Increment only the relevant sk_prot->socks debug field, this changes
+ * the previous behaviour of incrementing both the equivalent to
+ * answer->prot->socks (inet6_sock_nr) and inet_sock_nr.
+ *
+ * This allows better debug granularity as we'll know exactly how many
+ * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6
+ * transport protocol socks. -acme
+ */
+ sk_refcnt_debug_inc(sk);
-
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet6_sock_nr);
- atomic_inc(&inet_sock_nr);
-#endif
if (inet->num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
@@ -513,11 +488,6 @@ static struct net_proto_family inet6_family_ops = {
.owner = THIS_MODULE,
};
-#ifdef CONFIG_SYSCTL
-extern void ipv6_sysctl_register(void);
-extern void ipv6_sysctl_unregister(void);
-#endif
-
/* Same as inet6_dgram_ops, sans udp_poll. */
static struct proto_ops inet6_sockraw_ops = {
.family = PF_INET6,
@@ -684,8 +654,6 @@ static void cleanup_ipv6_mibs(void)
snmp6_mib_free((void **)udp_stats_in6);
}
-extern int ipv6_misc_proc_init(void);
-
static int __init inet6_init(void)
{
struct sk_buff *dummy_skb;
@@ -757,6 +725,9 @@ static int __init inet6_init(void)
err = igmp6_init(&inet6_family_ops);
if (err)
goto igmp_fail;
+ err = ipv6_netfilter_init();
+ if (err)
+ goto netfilter_fail;
/* Create /proc/foo6 entries. */
#ifdef CONFIG_PROC_FS
err = -ENOMEM;
@@ -813,6 +784,8 @@ proc_tcp6_fail:
raw6_proc_exit();
proc_raw6_fail:
#endif
+ ipv6_netfilter_fini();
+netfilter_fail:
igmp6_cleanup();
igmp_fail:
ndisc_cleanup();
@@ -852,6 +825,7 @@ static void __exit inet6_exit(void)
ip6_route_cleanup();
ipv6_packet_cleanup();
igmp6_cleanup();
+ ipv6_netfilter_fini();
ndisc_cleanup();
icmpv6_cleanup();
#ifdef CONFIG_SYSCTL
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 986fdfdccbc..f3629730eb1 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -131,10 +131,10 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len)
case NEXTHDR_HOP:
case NEXTHDR_DEST:
if (!zero_out_mutable_opts(exthdr.opth)) {
- LIMIT_NETDEBUG(printk(
+ LIMIT_NETDEBUG(
KERN_WARNING "overrun %sopts\n",
nexthdr == NEXTHDR_HOP ?
- "hop" : "dest"));
+ "hop" : "dest");
return -EINVAL;
}
break;
@@ -293,8 +293,7 @@ static int ah6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struc
skb_push(skb, skb->data - skb->nh.raw);
ahp->icv(ahp, skb, ah->auth_data);
if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
- LIMIT_NETDEBUG(
- printk(KERN_WARNING "ipsec ah authentication error\n"));
+ LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n");
x->stats.integrity_failed++;
goto free_out;
}
@@ -332,9 +331,9 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!x)
return;
- NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/"
- "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
- ntohl(ah->spi), NIP6(iph->daddr)));
+ NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/"
+ "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+ ntohl(ah->spi), NIP6(iph->daddr));
xfrm_state_put(x);
}
@@ -402,10 +401,8 @@ static int ah6_init_state(struct xfrm_state *x)
error:
if (ahp) {
- if (ahp->work_icv)
- kfree(ahp->work_icv);
- if (ahp->tfm)
- crypto_free_tfm(ahp->tfm);
+ kfree(ahp->work_icv);
+ crypto_free_tfm(ahp->tfm);
kfree(ahp);
}
return -EINVAL;
@@ -418,14 +415,10 @@ static void ah6_destroy(struct xfrm_state *x)
if (!ahp)
return;
- if (ahp->work_icv) {
- kfree(ahp->work_icv);
- ahp->work_icv = NULL;
- }
- if (ahp->tfm) {
- crypto_free_tfm(ahp->tfm);
- ahp->tfm = NULL;
- }
+ kfree(ahp->work_icv);
+ ahp->work_icv = NULL;
+ crypto_free_tfm(ahp->tfm);
+ ahp->tfm = NULL;
kfree(ahp);
}
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 5229365cd8b..cc518405b3e 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -29,6 +29,7 @@
#include <net/addrconf.h>
#include <net/transp_v6.h>
#include <net/ip6_route.h>
+#include <net/tcp_states.h>
#include <linux/errqueue.h>
#include <asm/uaccess.h>
@@ -174,10 +175,8 @@ ipv4_connected:
if (final_p)
ipv6_addr_copy(&fl.fl6_dst, final_p);
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
- dst_release(dst);
+ if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
goto out;
- }
/* source address lookup done in ip6_dst_lookup */
@@ -389,32 +388,101 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim);
}
+ if (np->rxopt.bits.rxtclass) {
+ int tclass = (ntohl(*(u32 *)skb->nh.ipv6h) >> 20) & 0xff;
+ put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass);
+ }
+
if (np->rxopt.bits.rxflow && (*(u32*)skb->nh.raw & IPV6_FLOWINFO_MASK)) {
u32 flowinfo = *(u32*)skb->nh.raw & IPV6_FLOWINFO_MASK;
put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo);
}
+
+ /* HbH is allowed only once */
if (np->rxopt.bits.hopopts && opt->hop) {
u8 *ptr = skb->nh.raw + opt->hop;
put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr);
}
- if (np->rxopt.bits.dstopts && opt->dst0) {
+
+ if (opt->lastopt &&
+ (np->rxopt.bits.dstopts || np->rxopt.bits.srcrt)) {
+ /*
+ * Silly enough, but we need to reparse in order to
+ * report extension headers (except for HbH)
+ * in order.
+ *
+ * Also note that IPV6_RECVRTHDRDSTOPTS is NOT
+ * (and WILL NOT be) defined because
+ * IPV6_RECVDSTOPTS is more generic. --yoshfuji
+ */
+ unsigned int off = sizeof(struct ipv6hdr);
+ u8 nexthdr = skb->nh.ipv6h->nexthdr;
+
+ while (off <= opt->lastopt) {
+ unsigned len;
+ u8 *ptr = skb->nh.raw + off;
+
+ switch(nexthdr) {
+ case IPPROTO_DSTOPTS:
+ nexthdr = ptr[0];
+ len = (ptr[1] + 1) << 3;
+ if (np->rxopt.bits.dstopts)
+ put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, len, ptr);
+ break;
+ case IPPROTO_ROUTING:
+ nexthdr = ptr[0];
+ len = (ptr[1] + 1) << 3;
+ if (np->rxopt.bits.srcrt)
+ put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, len, ptr);
+ break;
+ case IPPROTO_AH:
+ nexthdr = ptr[0];
+ len = (ptr[1] + 1) << 2;
+ break;
+ default:
+ nexthdr = ptr[0];
+ len = (ptr[1] + 1) << 3;
+ break;
+ }
+
+ off += len;
+ }
+ }
+
+ /* socket options in old style */
+ if (np->rxopt.bits.rxoinfo) {
+ struct in6_pktinfo src_info;
+
+ src_info.ipi6_ifindex = opt->iif;
+ ipv6_addr_copy(&src_info.ipi6_addr, &skb->nh.ipv6h->daddr);
+ put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info);
+ }
+ if (np->rxopt.bits.rxohlim) {
+ int hlim = skb->nh.ipv6h->hop_limit;
+ put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
+ }
+ if (np->rxopt.bits.ohopopts && opt->hop) {
+ u8 *ptr = skb->nh.raw + opt->hop;
+ put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr);
+ }
+ if (np->rxopt.bits.odstopts && opt->dst0) {
u8 *ptr = skb->nh.raw + opt->dst0;
- put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, (ptr[1]+1)<<3, ptr);
+ put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
}
- if (np->rxopt.bits.srcrt && opt->srcrt) {
+ if (np->rxopt.bits.osrcrt && opt->srcrt) {
struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(skb->nh.raw + opt->srcrt);
- put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, (rthdr->hdrlen+1) << 3, rthdr);
+ put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr);
}
- if (np->rxopt.bits.dstopts && opt->dst1) {
+ if (np->rxopt.bits.odstopts && opt->dst1) {
u8 *ptr = skb->nh.raw + opt->dst1;
- put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, (ptr[1]+1)<<3, ptr);
+ put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
}
return 0;
}
int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
struct ipv6_txoptions *opt,
- int *hlimit)
+ int *hlimit, int *tclass)
{
struct in6_pktinfo *src_info;
struct cmsghdr *cmsg;
@@ -437,6 +505,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
switch (cmsg->cmsg_type) {
case IPV6_PKTINFO:
+ case IPV6_2292PKTINFO:
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) {
err = -EINVAL;
goto exit_f;
@@ -491,6 +560,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
fl->fl6_flowlabel = IPV6_FLOWINFO_MASK & *(u32 *)CMSG_DATA(cmsg);
break;
+ case IPV6_2292HOPOPTS:
case IPV6_HOPOPTS:
if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
err = -EINVAL;
@@ -511,7 +581,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
opt->hopopt = hdr;
break;
- case IPV6_DSTOPTS:
+ case IPV6_2292DSTOPTS:
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
err = -EINVAL;
goto exit_f;
@@ -535,6 +605,33 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
opt->dst1opt = hdr;
break;
+ case IPV6_DSTOPTS:
+ case IPV6_RTHDRDSTOPTS:
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
+ len = ((hdr->hdrlen + 1) << 3);
+ if (cmsg->cmsg_len < CMSG_LEN(len)) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+ if (!capable(CAP_NET_RAW)) {
+ err = -EPERM;
+ goto exit_f;
+ }
+ if (cmsg->cmsg_type == IPV6_DSTOPTS) {
+ opt->opt_flen += len;
+ opt->dst1opt = hdr;
+ } else {
+ opt->opt_nflen += len;
+ opt->dst0opt = hdr;
+ }
+ break;
+
+ case IPV6_2292RTHDR:
case IPV6_RTHDR:
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) {
err = -EINVAL;
@@ -567,7 +664,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
opt->opt_nflen += len;
opt->srcrt = rthdr;
- if (opt->dst1opt) {
+ if (cmsg->cmsg_type == IPV6_2292RTHDR && opt->dst1opt) {
int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3);
opt->opt_nflen += dsthdrlen;
@@ -578,6 +675,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
break;
+ case IPV6_2292HOPLIMIT:
case IPV6_HOPLIMIT:
if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
err = -EINVAL;
@@ -587,9 +685,27 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
*hlimit = *(int *)CMSG_DATA(cmsg);
break;
+ case IPV6_TCLASS:
+ {
+ int tc;
+
+ err = -EINVAL;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
+ goto exit_f;
+ }
+
+ tc = *(int *)CMSG_DATA(cmsg);
+ if (tc < 0 || tc > 0xff)
+ goto exit_f;
+
+ err = 0;
+ *tclass = tc;
+
+ break;
+ }
default:
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "invalid cmsg type: %d\n", cmsg->cmsg_type));
+ LIMIT_NETDEBUG(KERN_DEBUG "invalid cmsg type: %d\n",
+ cmsg->cmsg_type);
err = -EINVAL;
break;
};
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 324db62515a..40d9a1935ab 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -31,6 +31,7 @@
#include <net/esp.h>
#include <asm/scatterlist.h>
#include <linux/crypto.h>
+#include <linux/kernel.h>
#include <linux/pfkeyv2.h>
#include <linux/random.h>
#include <net/icmp.h>
@@ -66,10 +67,10 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
alen = esp->auth.icv_trunc_len;
tfm = esp->conf.tfm;
- blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3;
- clen = (clen + 2 + blksize-1)&~(blksize-1);
+ blksize = ALIGN(crypto_tfm_alg_blocksize(tfm), 4);
+ clen = ALIGN(clen + 2, blksize);
if (esp->conf.padlen)
- clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1);
+ clen = ALIGN(clen, esp->conf.padlen);
if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0) {
goto error;
@@ -133,7 +134,7 @@ static int esp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, stru
struct ipv6_esp_hdr *esph;
struct esp_data *esp = x->data;
struct sk_buff *trailer;
- int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
+ int blksize = ALIGN(crypto_tfm_alg_blocksize(esp->conf.tfm), 4);
int alen = esp->auth.icv_trunc_len;
int elen = skb->len - sizeof(struct ipv6_esp_hdr) - esp->conf.ivlen - alen;
@@ -212,8 +213,7 @@ static int esp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, stru
padlen = nexthdr[0];
if (padlen+2 >= elen) {
- LIMIT_NETDEBUG(
- printk(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen));
+ LIMIT_NETDEBUG(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen);
ret = -EINVAL;
goto out;
}
@@ -236,16 +236,17 @@ out_nofree:
static u32 esp6_get_max_size(struct xfrm_state *x, int mtu)
{
struct esp_data *esp = x->data;
- u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
+ u32 blksize = ALIGN(crypto_tfm_alg_blocksize(esp->conf.tfm), 4);
if (x->props.mode) {
- mtu = (mtu + 2 + blksize-1)&~(blksize-1);
+ mtu = ALIGN(mtu + 2, blksize);
} else {
/* The worst case. */
- mtu += 2 + blksize;
+ u32 padsize = ((blksize - 1) & 7) + 1;
+ mtu = ALIGN(mtu + 2, padsize) + blksize - padsize;
}
if (esp->conf.padlen)
- mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1);
+ mtu = ALIGN(mtu, esp->conf.padlen);
return mtu + x->props.header_len + esp->auth.icv_full_len;
}
@@ -277,22 +278,14 @@ static void esp6_destroy(struct xfrm_state *x)
if (!esp)
return;
- if (esp->conf.tfm) {
- crypto_free_tfm(esp->conf.tfm);
- esp->conf.tfm = NULL;
- }
- if (esp->conf.ivec) {
- kfree(esp->conf.ivec);
- esp->conf.ivec = NULL;
- }
- if (esp->auth.tfm) {
- crypto_free_tfm(esp->auth.tfm);
- esp->auth.tfm = NULL;
- }
- if (esp->auth.work_icv) {
- kfree(esp->auth.work_icv);
- esp->auth.work_icv = NULL;
- }
+ crypto_free_tfm(esp->conf.tfm);
+ esp->conf.tfm = NULL;
+ kfree(esp->conf.ivec);
+ esp->conf.ivec = NULL;
+ crypto_free_tfm(esp->auth.tfm);
+ esp->auth.tfm = NULL;
+ kfree(esp->auth.work_icv);
+ esp->auth.work_icv = NULL;
kfree(esp);
}
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index e0839eafc3a..922549581ab 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -164,6 +164,7 @@ static int ipv6_destopt_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
return -1;
}
+ opt->lastopt = skb->h.raw - skb->nh.raw;
opt->dst1 = skb->h.raw - skb->nh.raw;
if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) {
@@ -243,6 +244,7 @@ static int ipv6_rthdr_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
looped_back:
if (hdr->segments_left == 0) {
+ opt->lastopt = skb->h.raw - skb->nh.raw;
opt->srcrt = skb->h.raw - skb->nh.raw;
skb->h.raw += (hdr->hdrlen + 1) << 3;
opt->dst0 = opt->dst1;
@@ -404,8 +406,7 @@ ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr)
memcpy(opt->srcrt, hdr, sizeof(*hdr));
irthdr = (struct rt0_hdr*)opt->srcrt;
- /* Obsolete field, MBZ, when originated by us */
- irthdr->bitmap = 0;
+ irthdr->reserved = 0;
opt->srcrt->segments_left = n;
for (i=0; i<n; i++)
memcpy(irthdr->addr+i, rthdr->addr+(n-1-i), 16);
@@ -424,8 +425,8 @@ static int ipv6_hop_ra(struct sk_buff *skb, int optoff)
IP6CB(skb)->ra = optoff;
return 1;
}
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", skb->nh.raw[optoff+1]));
+ LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n",
+ skb->nh.raw[optoff+1]);
kfree_skb(skb);
return 0;
}
@@ -437,8 +438,8 @@ static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
u32 pkt_len;
if (skb->nh.raw[optoff+1] != 4 || (optoff&3) != 2) {
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", skb->nh.raw[optoff+1]));
+ LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
+ skb->nh.raw[optoff+1]);
IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
goto drop;
}
@@ -459,11 +460,10 @@ static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
IP6_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
}
- if (pkt_len + sizeof(struct ipv6hdr) < skb->len) {
- __pskb_trim(skb, pkt_len + sizeof(struct ipv6hdr));
- if (skb->ip_summed == CHECKSUM_HW)
- skb->ip_summed = CHECKSUM_NONE;
- }
+
+ if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
+ goto drop;
+
return 1;
drop:
@@ -539,10 +539,15 @@ void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
u8 *proto,
struct in6_addr **daddr)
{
- if (opt->srcrt)
+ if (opt->srcrt) {
ipv6_push_rthdr(skb, proto, opt->srcrt, daddr);
- if (opt->dst0opt)
- ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt);
+ /*
+ * IPV6_RTHDRDSTOPTS is ignored
+ * unless IPV6_RTHDR is set (RFC3542).
+ */
+ if (opt->dst0opt)
+ ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt);
+ }
if (opt->hopopt)
ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt);
}
@@ -573,3 +578,97 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
}
return opt2;
}
+
+/*
+ * Lay down one extension header at the write cursor *p.
+ *
+ * With @inherit set, the existing kernel header @ohdr (if any) is copied;
+ * otherwise @newoptlen bytes are fetched from the user pointer @newopt
+ * (if any).  On success *hdr points at the stored header and *p is
+ * advanced by the CMSG_ALIGN()ed size.  When there is nothing to store,
+ * *hdr and *p are left as they were.
+ *
+ * Returns 0, -EFAULT if the user copy fails, or -EINVAL if the header's
+ * own length field claims more than @newoptlen bytes.
+ */
+static int ipv6_renew_option(void *ohdr,
+			     struct ipv6_opt_hdr __user *newopt, int newoptlen,
+			     int inherit,
+			     struct ipv6_opt_hdr **hdr,
+			     char **p)
+{
+	char *dst = *p;
+
+	if (inherit) {
+		struct ipv6_opt_hdr *old = ohdr;
+
+		if (!old)
+			return 0;
+
+		memcpy(dst, old, ipv6_optlen(old));
+		*hdr = (struct ipv6_opt_hdr *)dst;
+		*p = dst + CMSG_ALIGN(ipv6_optlen(*hdr));
+		return 0;
+	}
+
+	if (!newopt)
+		return 0;
+
+	if (copy_from_user(dst, newopt, newoptlen))
+		return -EFAULT;
+
+	*hdr = (struct ipv6_opt_hdr *)dst;
+	if (ipv6_optlen(*hdr) > newoptlen)
+		return -EINVAL;
+
+	*p = dst + CMSG_ALIGN(newoptlen);
+	return 0;
+}
+
+/*
+ * Build a fresh ipv6_txoptions block for a sticky-option update.
+ *
+ * Every extension header of @opt whose type differs from @newtype is
+ * copied over; the header of type @newtype is replaced by the @newoptlen
+ * bytes at the user pointer @newopt, or dropped when @newopt is NULL.
+ * @opt itself is not modified and may be NULL (no options set so far).
+ *
+ * Returns the new block, allocated with sock_kmalloc() and sized
+ * opt2->tot_len, NULL when no header at all remains, or an ERR_PTR()
+ * carrying -ENOBUFS, -EFAULT or -EINVAL.
+ */
+struct ipv6_txoptions *
+ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
+		   int newtype,
+		   struct ipv6_opt_hdr __user *newopt, int newoptlen)
+{
+	int tot_len = 0;
+	char *p;
+	struct ipv6_txoptions *opt2;
+	int err;
+
+	if (opt) {
+		if (newtype != IPV6_HOPOPTS && opt->hopopt)
+			tot_len += CMSG_ALIGN(ipv6_optlen(opt->hopopt));
+		if (newtype != IPV6_RTHDRDSTOPTS && opt->dst0opt)
+			tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst0opt));
+		if (newtype != IPV6_RTHDR && opt->srcrt)
+			tot_len += CMSG_ALIGN(ipv6_optlen(opt->srcrt));
+		if (newtype != IPV6_DSTOPTS && opt->dst1opt)
+			tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt));
+	}
+
+	if (newopt && newoptlen)
+		tot_len += CMSG_ALIGN(newoptlen);
+
+	if (!tot_len)
+		return NULL;
+
+	/* the headers are stored right behind the descriptor itself */
+	tot_len += sizeof(*opt2);
+	opt2 = sock_kmalloc(sk, tot_len, GFP_ATOMIC);
+	if (!opt2)
+		return ERR_PTR(-ENOBUFS);
+
+	memset(opt2, 0, tot_len);
+
+	opt2->tot_len = tot_len;
+	p = (char *)(opt2 + 1);
+
+	err = ipv6_renew_option(opt ? opt->hopopt : NULL, newopt, newoptlen,
+				newtype != IPV6_HOPOPTS,
+				&opt2->hopopt, &p);
+	if (err)
+		goto out;
+
+	err = ipv6_renew_option(opt ? opt->dst0opt : NULL, newopt, newoptlen,
+				newtype != IPV6_RTHDRDSTOPTS,
+				&opt2->dst0opt, &p);
+	if (err)
+		goto out;
+
+	/* pass the address of the srcrt slot, not its (still NULL) value */
+	err = ipv6_renew_option(opt ? opt->srcrt : NULL, newopt, newoptlen,
+				newtype != IPV6_RTHDR,
+				(struct ipv6_opt_hdr **)&opt2->srcrt, &p);
+	if (err)
+		goto out;
+
+	err = ipv6_renew_option(opt ? opt->dst1opt : NULL, newopt, newoptlen,
+				newtype != IPV6_DSTOPTS,
+				&opt2->dst1opt, &p);
+	if (err)
+		goto out;
+
+	opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) +
+			  (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) +
+			  (opt2->srcrt ? ipv6_optlen(opt2->srcrt) : 0);
+	opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0);
+
+	return opt2;
+out:
+	/* release the allocation itself; p is only a cursor inside it */
+	sock_kfree_s(sk, opt2, opt2->tot_len);
+	return ERR_PTR(err);
+}
+
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index ff3ec9822e3..23e540365a1 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -67,7 +67,7 @@
#include <asm/uaccess.h>
#include <asm/system.h>
-DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics);
+DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly;
/*
* The ICMP socket(s). This is the most convenient way to flow control
@@ -287,7 +287,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
int iif = 0;
int addr_type = 0;
int len;
- int hlimit;
+ int hlimit, tclass;
int err = 0;
if ((u8*)hdr < skb->head || (u8*)(hdr+1) > skb->tail)
@@ -332,8 +332,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
* for now we don't know that.
*/
if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) {
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n");
return;
}
@@ -341,8 +340,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
* Never answer to a ICMP packet.
*/
if (is_ineligible(skb)) {
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "icmpv6_send: no reply to icmp error\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: no reply to icmp error\n");
return;
}
@@ -376,7 +374,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
if (err)
goto out;
if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
- goto out_dst_release;
+ goto out;
if (ipv6_addr_is_multicast(&fl.fl6_dst))
hlimit = np->mcast_hops;
@@ -387,14 +385,17 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
if (hlimit < 0)
hlimit = ipv6_get_hoplimit(dst->dev);
+ tclass = np->cork.tclass;
+ if (tclass < 0)
+ tclass = 0;
+
msg.skb = skb;
msg.offset = skb->nh.raw - skb->data;
len = skb->len - msg.offset;
len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr));
if (len < 0) {
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "icmp: len problem\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "icmp: len problem\n");
goto out_dst_release;
}
@@ -403,7 +404,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
err = ip6_append_data(sk, icmpv6_getfrag, &msg,
len + sizeof(struct icmp6hdr),
sizeof(struct icmp6hdr),
- hlimit, NULL, &fl, (struct rt6_info*)dst,
+ hlimit, tclass, NULL, &fl, (struct rt6_info*)dst,
MSG_DONTWAIT);
if (err) {
ip6_flush_pending_frames(sk);
@@ -437,6 +438,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
struct dst_entry *dst;
int err = 0;
int hlimit;
+ int tclass;
saddr = &skb->nh.ipv6h->daddr;
@@ -467,7 +469,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
if (err)
goto out;
if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
- goto out_dst_release;
+ goto out;
if (ipv6_addr_is_multicast(&fl.fl6_dst))
hlimit = np->mcast_hops;
@@ -478,13 +480,17 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
if (hlimit < 0)
hlimit = ipv6_get_hoplimit(dst->dev);
+ tclass = np->cork.tclass;
+ if (tclass < 0)
+ tclass = 0;
+
idev = in6_dev_get(skb->dev);
msg.skb = skb;
msg.offset = 0;
err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr),
- sizeof(struct icmp6hdr), hlimit, NULL, &fl,
+ sizeof(struct icmp6hdr), hlimit, tclass, NULL, &fl,
(struct rt6_info*)dst, MSG_DONTWAIT);
if (err) {
@@ -499,7 +505,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
out_put:
if (likely(idev != NULL))
in6_dev_put(idev);
-out_dst_release:
dst_release(dst);
out:
icmpv6_xmit_unlock();
@@ -551,7 +556,8 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, u32 info)
read_lock(&raw_v6_lock);
if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) {
- while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr))) {
+ while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr,
+ IP6CB(skb)->iif))) {
rawv6_err(sk, skb, NULL, type, code, inner_offset, info);
sk = sk_next(sk);
}
@@ -583,17 +589,15 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
skb->csum)) {
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "ICMPv6 hw checksum failed\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n");
skb->ip_summed = CHECKSUM_NONE;
}
}
if (skb->ip_summed == CHECKSUM_NONE) {
if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
skb_checksum(skb, 0, skb->len, 0))) {
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n",
- NIP6(*saddr), NIP6(*daddr)));
+ LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n",
+ NIP6(*saddr), NIP6(*daddr));
goto discard_it;
}
}
@@ -669,8 +673,7 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
break;
default:
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "icmpv6: msg of unknown type\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n");
/* informational */
if (type & ICMPV6_INFOMSG_MASK)
@@ -697,10 +700,7 @@ int __init icmpv6_init(struct net_proto_family *ops)
struct sock *sk;
int err, i, j;
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
-
+ for_each_cpu(i) {
err = sock_create_kern(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6,
&per_cpu(__icmpv6_socket, i));
if (err < 0) {
@@ -746,9 +746,7 @@ void icmpv6_cleanup(void)
{
int i;
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
+ for_each_cpu(i) {
sock_release(per_cpu(__icmpv6_socket, i));
}
inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
new file mode 100644
index 00000000000..01d5f46d4e4
--- /dev/null
+++ b/net/ipv6/inet6_hashtables.c
@@ -0,0 +1,81 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic INET6 transport hashtables
+ *
+ * Authors: Lotsa people, from code originally in tcp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+
+#include <linux/module.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
+
+/*
+ * Pick the best listening PF_INET6 socket for local port @hnum.
+ *
+ * Walks the listening hash chain for @hnum under the lhash read lock.
+ * A socket on the right port and family scores 1, plus 1 if it is bound
+ * to an address equal to @daddr, plus 1 if it is bound to device @dif;
+ * a socket bound to a different address or device is skipped.  A score
+ * of 3 ends the search at once, otherwise the first socket with the
+ * highest score wins.
+ *
+ * Returns the socket with a reference taken via sock_hold(), or NULL.
+ */
+struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo,
+				   const struct in6_addr *daddr,
+				   const unsigned short hnum, const int dif)
+{
+	struct sock *sk;
+	const struct hlist_node *node;
+	struct sock *best = NULL;
+	int best_score = 0;
+
+	read_lock(&hashinfo->lhash_lock);
+	sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) {
+		const struct ipv6_pinfo *np;
+		int score = 1;
+
+		if (inet_sk(sk)->num != hnum || sk->sk_family != PF_INET6)
+			continue;
+
+		np = inet6_sk(sk);
+		if (!ipv6_addr_any(&np->rcv_saddr)) {
+			if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
+				continue;
+			score++;
+		}
+		if (sk->sk_bound_dev_if) {
+			if (sk->sk_bound_dev_if != dif)
+				continue;
+			score++;
+		}
+
+		/* exact match on both address and device: stop looking */
+		if (score == 3) {
+			best = sk;
+			break;
+		}
+		if (score > best_score) {
+			best_score = score;
+			best = sk;
+		}
+	}
+	if (best)
+		sock_hold(best);
+	read_unlock(&hashinfo->lhash_lock);
+	return best;
+}
+
+EXPORT_SYMBOL_GPL(inet6_lookup_listener);
+
+/*
+ * Look up a socket via __inet6_lookup() with bottom halves disabled
+ * for the duration of the call.  @dport is converted with ntohs()
+ * before it is handed on; @sport is passed through unchanged.
+ * Returns whatever __inet6_lookup() returns.
+ */
+struct sock *inet6_lookup(struct inet_hashinfo *hashinfo,
+			  const struct in6_addr *saddr, const u16 sport,
+			  const struct in6_addr *daddr, const u16 dport,
+			  const int dif)
+{
+	struct sock *found;
+
+	local_bh_disable();
+	found = __inet6_lookup(hashinfo, saddr, sport, daddr,
+			       ntohs(dport), dif);
+	local_bh_enable();
+
+	return found;
+}
+
+EXPORT_SYMBOL_GPL(inet6_lookup);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 1b354aa9793..4fcc5a7acf6 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -49,7 +49,7 @@
struct rt6_statistics rt6_stats;
-static kmem_cache_t * fib6_node_kmem;
+static kmem_cache_t * fib6_node_kmem __read_mostly;
enum fib_walk_state_t
{
@@ -92,7 +92,7 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn);
static __u32 rt_sernum;
-static struct timer_list ip6_fib_timer = TIMER_INITIALIZER(fib6_run_gc, 0, 0);
+static DEFINE_TIMER(ip6_fib_timer, fib6_run_gc, 0, 0);
struct fib6_walker_t fib6_walker_list = {
.prev = &fib6_walker_list,
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index b6c73da5ff3..bbbe80cdaf7 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -50,7 +50,7 @@ static atomic_t fl_size = ATOMIC_INIT(0);
static struct ip6_flowlabel *fl_ht[FL_HASH_MASK+1];
static void ip6_fl_gc(unsigned long dummy);
-static struct timer_list ip6_fl_gc_timer = TIMER_INITIALIZER(ip6_fl_gc, 0, 0);
+static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc, 0, 0);
/* FL hash table lock: it protects only of GC */
@@ -225,16 +225,20 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space,
struct ip6_flowlabel * fl,
struct ipv6_txoptions * fopt)
{
- struct ipv6_txoptions * fl_opt = fl->opt;
+ struct ipv6_txoptions * fl_opt = fl ? fl->opt : NULL;
- if (fopt == NULL || fopt->opt_flen == 0)
- return fl_opt;
+ if (fopt == NULL || fopt->opt_flen == 0) {
+ if (!fl_opt || !fl_opt->dst0opt || fl_opt->srcrt)
+ return fl_opt;
+ }
if (fl_opt != NULL) {
opt_space->hopopt = fl_opt->hopopt;
- opt_space->dst0opt = fl_opt->dst0opt;
+ opt_space->dst0opt = fl_opt->srcrt ? fl_opt->dst0opt : NULL;
opt_space->srcrt = fl_opt->srcrt;
opt_space->opt_nflen = fl_opt->opt_nflen;
+ if (fl_opt->dst0opt && !fl_opt->srcrt)
+ opt_space->opt_nflen -= ipv6_optlen(fl_opt->dst0opt);
} else {
if (fopt->opt_nflen == 0)
return fopt;
@@ -310,7 +314,7 @@ fl_create(struct in6_flowlabel_req *freq, char __user *optval, int optlen, int *
msg.msg_control = (void*)(fl->opt+1);
flowi.oif = 0;
- err = datagram_send_ctl(&msg, &flowi, fl->opt, &junk);
+ err = datagram_send_ctl(&msg, &flowi, fl->opt, &junk, &junk);
if (err)
goto done;
err = -EINVAL;
@@ -479,7 +483,7 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
goto done;
}
fl1 = sfl->fl;
- atomic_inc(&fl->users);
+ atomic_inc(&fl1->users);
break;
}
}
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 866f10726c5..6e348042693 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -56,7 +56,7 @@ static inline int ip6_rcv_finish( struct sk_buff *skb)
return dst_input(skb);
}
-int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct ipv6hdr *hdr;
u32 pkt_len;
@@ -166,8 +166,8 @@ resubmit:
nexthdr = skb->nh.raw[nhoff];
raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]);
- if (raw_sk)
- ipv6_raw_deliver(skb, nexthdr);
+ if (raw_sk && !ipv6_raw_deliver(skb, nexthdr))
+ raw_sk = NULL;
hash = nexthdr & (MAX_INET_PROTOS - 1);
if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) {
@@ -198,12 +198,13 @@ resubmit:
if (!raw_sk) {
if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
IP6_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
- icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhoff);
+ icmpv6_send(skb, ICMPV6_PARAMPROB,
+ ICMPV6_UNK_NEXTHDR, nhoff,
+ skb->dev);
}
- } else {
+ } else
IP6_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
- kfree_skb(skb);
- }
+ kfree_skb(skb);
}
rcu_read_unlock();
return 0;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ae652ca14bc..614296a920c 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -147,57 +147,13 @@ static int ip6_output2(struct sk_buff *skb)
int ip6_output(struct sk_buff *skb)
{
- if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
+ if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
+ dst_allfrag(skb->dst))
return ip6_fragment(skb, ip6_output2);
else
return ip6_output2(skb);
}
-#ifdef CONFIG_NETFILTER
-int ip6_route_me_harder(struct sk_buff *skb)
-{
- struct ipv6hdr *iph = skb->nh.ipv6h;
- struct dst_entry *dst;
- struct flowi fl = {
- .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
- .nl_u =
- { .ip6_u =
- { .daddr = iph->daddr,
- .saddr = iph->saddr, } },
- .proto = iph->nexthdr,
- };
-
- dst = ip6_route_output(skb->sk, &fl);
-
- if (dst->error) {
- IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
- dst_release(dst);
- return -EINVAL;
- }
-
- /* Drop old route. */
- dst_release(skb->dst);
-
- skb->dst = dst;
- return 0;
-}
-#endif
-
-static inline int ip6_maybe_reroute(struct sk_buff *skb)
-{
-#ifdef CONFIG_NETFILTER
- if (skb->nfcache & NFC_ALTERED){
- if (ip6_route_me_harder(skb) != 0){
- kfree_skb(skb);
- return -EINVAL;
- }
- }
-#endif /* CONFIG_NETFILTER */
- return dst_output(skb);
-}
-
/*
* xmit an sk_buff (used by TCP)
*/
@@ -211,7 +167,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
struct ipv6hdr *hdr;
u8 proto = fl->proto;
int seg_len = skb->len;
- int hlimit;
+ int hlimit, tclass;
u32 mtu;
if (opt) {
@@ -247,7 +203,6 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
* Fill in the IPv6 header
*/
- *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
hlimit = -1;
if (np)
hlimit = np->hop_limit;
@@ -256,6 +211,14 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
if (hlimit < 0)
hlimit = ipv6_get_hoplimit(dst->dev);
+ tclass = -1;
+ if (np)
+ tclass = np->tclass;
+ if (tclass < 0)
+ tclass = 0;
+
+ *(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
+
hdr->payload_len = htons(seg_len);
hdr->nexthdr = proto;
hdr->hop_limit = hlimit;
@@ -266,7 +229,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
mtu = dst_mtu(dst);
if ((skb->len <= mtu) || ipfragok) {
IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
- return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
+ return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
+ dst_output);
}
if (net_ratelimit())
@@ -321,7 +285,9 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
read_lock(&ip6_ra_lock);
for (ra = ip6_ra_chain; ra; ra = ra->next) {
struct sock *sk = ra->sk;
- if (sk && ra->sel == sel) {
+ if (sk && ra->sel == sel &&
+ (!sk->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == skb->dev->ifindex)) {
if (last) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
@@ -667,7 +633,7 @@ slow_path:
*/
if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
- NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
+ NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
err = -ENOMEM;
goto fail;
@@ -701,7 +667,7 @@ slow_path:
*/
fh->nexthdr = nexthdr;
fh->reserved = 0;
- if (frag_id) {
+ if (!frag_id) {
ipv6_select_ident(skb, fh);
frag_id = fh->identification;
} else
@@ -803,11 +769,71 @@ out_err_release:
*dst = NULL;
return err;
}
+inline int ip6_ufo_append_data(struct sock *sk,
+ int getfrag(void *from, char *to, int offset, int len,
+ int odd, struct sk_buff *skb),
+ void *from, int length, int hh_len, int fragheaderlen,
+ int transhdrlen, int mtu,unsigned int flags)
-int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
- void *from, int length, int transhdrlen,
- int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
- unsigned int flags)
+{
+ struct sk_buff *skb;
+ int err;
+
+ /* There is support for UDP large send offload by network
+ * device, so create one single skb packet containing complete
+ * udp datagram.
+ *
+ * The skb at the tail of sk->sk_write_queue is reused if there
+ * is one; otherwise a new skb is allocated with room for the
+ * hardware header (hh_len), the IPv6 headers (fragheaderlen)
+ * and the transport header (transhdrlen), plus 20 spare bytes.
+ * The payload (length - transhdrlen bytes obtained through
+ * getfrag) is attached afterwards by skb_append_datato_frags().
+ *
+ * Returns 0 on success, a negative errno otherwise.
+ */
+ if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
+ skb = sock_alloc_send_skb(sk,
+ hh_len + fragheaderlen + transhdrlen + 20,
+ (flags & MSG_DONTWAIT), &err);
+ if (skb == NULL)
+ return -ENOMEM; /* NOTE(review): the code stored in err is discarded */
+
+ /* reserve space for Hardware header */
+ skb_reserve(skb, hh_len);
+
+ /* create space for UDP/IP header */
+ skb_put(skb,fragheaderlen + transhdrlen);
+
+ /* initialize network header pointer */
+ skb->nh.raw = skb->data;
+
+ /* initialize protocol header pointer */
+ skb->h.raw = skb->data + fragheaderlen;
+
+ skb->ip_summed = CHECKSUM_HW;
+ skb->csum = 0;
+ sk->sk_sndmsg_off = 0;
+ }
+
+ err = skb_append_datato_frags(sk,skb, getfrag, from,
+ (length - transhdrlen));
+ if (!err) {
+ struct frag_hdr fhdr;
+
+ /* specify the length of each IP datagram fragment: the mtu
+ * minus the IPv6 headers and the fragment header; the
+ * fragment identification is chosen here as well.
+ * NOTE(review): the skb is queued below even when it was
+ * peeked from this same queue above - confirm that path
+ * cannot queue it twice.
+ */
+ skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
+ sizeof(struct frag_hdr);
+ ipv6_select_ident(skb, &fhdr);
+ skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
+ __skb_queue_tail(&sk->sk_write_queue, skb);
+
+ return 0;
+ }
+ /* There is not enough support to do UDP LSO,
+ * so follow normal path.
+ * NOTE(review): the skb is freed here even when it was peeked
+ * from sk->sk_write_queue rather than allocated above - confirm
+ * that it is not left on the queue.
+ */
+ kfree_skb(skb);
+
+ return err;
+}
+
+int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
+ int offset, int len, int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
+ struct rt6_info *rt, unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -845,6 +871,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
np->cork.rt = rt;
inet->cork.fl = *fl;
np->cork.hop_limit = hlimit;
+ np->cork.tclass = tclass;
inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
if (dst_allfrag(rt->u.dst.path))
inet->cork.flags |= IPCORK_ALLFRAG;
@@ -893,6 +920,15 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
*/
inet->cork.length += length;
+ if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
+ (rt->u.dst.dev->features & NETIF_F_UFO)) {
+
+ if(ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
+ fragheaderlen, transhdrlen, mtu, flags))
+ goto error;
+
+ return 0;
+ }
if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
goto alloc_new_skb;
@@ -1126,7 +1162,8 @@ int ip6_push_pending_frames(struct sock *sk)
skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
- *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);
+ *(u32*)hdr = fl->fl6_flowlabel |
+ htonl(0x60000000 | ((int)np->cork.tclass << 20));
if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 09613729404..cf94372d1af 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -673,11 +673,12 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
if ((dst = ip6_tnl_dst_check(t)) != NULL)
dst_hold(dst);
- else
+ else {
dst = ip6_route_output(NULL, &fl);
- if (dst->error || xfrm_lookup(&dst, &fl, NULL, 0) < 0)
- goto tx_err_link_failure;
+ if (dst->error || xfrm_lookup(&dst, &fl, NULL, 0) < 0)
+ goto tx_err_link_failure;
+ }
tdev = dst->dev;
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 423feb46ccc..85bfbc69b2c 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -341,8 +341,7 @@ static void ipcomp6_free_tfms(struct crypto_tfm **tfms)
for_each_cpu(cpu) {
struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu);
- if (tfm)
- crypto_free_tfm(tfm);
+ crypto_free_tfm(tfm);
}
free_percpu(tfms);
}
@@ -354,7 +353,7 @@ static struct crypto_tfm **ipcomp6_alloc_tfms(const char *alg_name)
int cpu;
/* This can be any valid CPU ID so we don't need locking. */
- cpu = smp_processor_id();
+ cpu = raw_smp_processor_id();
list_for_each_entry(pos, &ipcomp6_tfms_list, list) {
struct crypto_tfm *tfm;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index f3ef4c38d31..8567873d0dd 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -55,7 +55,7 @@
#include <asm/uaccess.h>
-DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics);
+DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics) __read_mostly;
static struct packet_type ipv6_packet_type = {
.type = __constant_htons(ETH_P_IPV6),
@@ -109,13 +109,6 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *))
return 0;
}
-extern int ip6_mc_source(int add, int omode, struct sock *sk,
- struct group_source_req *pgsr);
-extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf);
-extern int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
- struct group_filter __user *optval, int __user *optlen);
-
-
int ipv6_setsockopt(struct sock *sk, int level, int optname,
char __user *optval, int optlen)
{
@@ -163,6 +156,13 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
fl6_free_socklist(sk);
ipv6_sock_mc_close(sk);
+ /*
+ * Sock is moving from IPv6 to IPv4 (sk_prot), so
+ * remove it from the refcnt debug socks count in the
+ * original family...
+ */
+ sk_refcnt_debug_dec(sk);
+
if (sk->sk_protocol == IPPROTO_TCP) {
struct tcp_sock *tp = tcp_sk(sk);
@@ -192,9 +192,11 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
kfree_skb(pktopt);
sk->sk_destruct = inet_sock_destruct;
-#ifdef INET_REFCNT_DEBUG
- atomic_dec(&inet6_sock_nr);
-#endif
+ /*
+ * ... and add it to the refcnt debug socks count
+ * in the new family. -acme
+ */
+ sk_refcnt_debug_inc(sk);
module_put(THIS_MODULE);
retv = 0;
break;
@@ -208,39 +210,139 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
retv = 0;
break;
- case IPV6_PKTINFO:
+ case IPV6_RECVPKTINFO:
np->rxopt.bits.rxinfo = valbool;
retv = 0;
break;
+
+ case IPV6_2292PKTINFO:
+ np->rxopt.bits.rxoinfo = valbool;
+ retv = 0;
+ break;
- case IPV6_HOPLIMIT:
+ case IPV6_RECVHOPLIMIT:
np->rxopt.bits.rxhlim = valbool;
retv = 0;
break;
- case IPV6_RTHDR:
+ case IPV6_2292HOPLIMIT:
+ np->rxopt.bits.rxohlim = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_RECVRTHDR:
if (val < 0 || val > 2)
goto e_inval;
np->rxopt.bits.srcrt = val;
retv = 0;
break;
- case IPV6_HOPOPTS:
+ case IPV6_2292RTHDR:
+ if (val < 0 || val > 2)
+ goto e_inval;
+ np->rxopt.bits.osrcrt = val;
+ retv = 0;
+ break;
+
+ case IPV6_RECVHOPOPTS:
np->rxopt.bits.hopopts = valbool;
retv = 0;
break;
- case IPV6_DSTOPTS:
+ case IPV6_2292HOPOPTS:
+ np->rxopt.bits.ohopopts = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_RECVDSTOPTS:
np->rxopt.bits.dstopts = valbool;
retv = 0;
break;
+ case IPV6_2292DSTOPTS:
+ np->rxopt.bits.odstopts = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_TCLASS:
+ if (val < 0 || val > 0xff)
+ goto e_inval;
+ np->tclass = val;
+ retv = 0;
+ break;
+
+ case IPV6_RECVTCLASS:
+ np->rxopt.bits.rxtclass = valbool;
+ retv = 0;
+ break;
+
case IPV6_FLOWINFO:
np->rxopt.bits.rxflow = valbool;
retv = 0;
break;
- case IPV6_PKTOPTIONS:
+ case IPV6_HOPOPTS:
+ case IPV6_RTHDRDSTOPTS:
+ case IPV6_RTHDR:
+ case IPV6_DSTOPTS:
+ {
+ struct ipv6_txoptions *opt;
+ if (optlen == 0)
+ optval = 0;
+
+ /* hop-by-hop / destination options are privileged option */
+ retv = -EPERM;
+ if (optname != IPV6_RTHDR && !capable(CAP_NET_RAW))
+ break;
+
+ retv = -EINVAL;
+ if (optlen & 0x7 || optlen > 8 * 255)
+ break;
+
+ opt = ipv6_renew_options(sk, np->opt, optname,
+ (struct ipv6_opt_hdr __user *)optval,
+ optlen);
+ if (IS_ERR(opt)) {
+ retv = PTR_ERR(opt);
+ break;
+ }
+
+ /* routing header option needs extra check */
+ if (optname == IPV6_RTHDR && opt->srcrt) {
+ struct ipv6_rt_hdr *rthdr = opt->srcrt;
+ if (rthdr->type)
+ goto sticky_done;
+ if ((rthdr->hdrlen & 1) ||
+ (rthdr->hdrlen >> 1) != rthdr->segments_left)
+ goto sticky_done;
+ }
+
+ retv = 0;
+ if (sk->sk_type == SOCK_STREAM) {
+ if (opt) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (!((1 << sk->sk_state) &
+ (TCPF_LISTEN | TCPF_CLOSE))
+ && inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
+ tp->ext_header_len = opt->opt_flen + opt->opt_nflen;
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+ }
+ }
+ opt = xchg(&np->opt, opt);
+ sk_dst_reset(sk);
+ } else {
+ write_lock(&sk->sk_dst_lock);
+ opt = xchg(&np->opt, opt);
+ write_unlock(&sk->sk_dst_lock);
+ sk_dst_reset(sk);
+ }
+sticky_done:
+ if (opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
+ break;
+ }
+
+ case IPV6_2292PKTOPTIONS:
{
struct ipv6_txoptions *opt = NULL;
struct msghdr msg;
@@ -274,7 +376,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
msg.msg_controllen = optlen;
msg.msg_control = (void*)(opt+1);
- retv = datagram_send_ctl(&msg, &fl, opt, &junk);
+ retv = datagram_send_ctl(&msg, &fl, opt, &junk, &junk);
if (retv)
goto done;
update:
@@ -437,7 +539,6 @@ done:
}
case MCAST_MSFILTER:
{
- extern int sysctl_optmem_max;
extern int sysctl_mld_max_msf;
struct group_filter *gsf;
@@ -504,6 +605,9 @@ done:
break;
case IPV6_IPSEC_POLICY:
case IPV6_XFRM_POLICY:
+ retv = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ break;
retv = xfrm_user_policy(sk, optname, optval, optlen);
break;
@@ -525,6 +629,17 @@ e_inval:
return -EINVAL;
}
+/*
+ * Copy one sticky extension header to user space.
+ *
+ * At most @len bytes of @hdr are written to @optval, so a short user
+ * buffer receives a truncated header instead of being overrun.
+ *
+ * Returns the number of bytes copied (0 when @hdr is NULL, i.e. the
+ * option is not set) or -EFAULT if the user copy fails.
+ */
+int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_opt_hdr *hdr,
+			   char __user *optval, int len)
+{
+	if (!hdr)
+		return 0;
+	len = min_t(int, len, ipv6_optlen(hdr));
+	/* copy the clamped length, not the full header length */
+	if (copy_to_user(optval, hdr, len))
+		return -EFAULT;
+	return len;
+}
+
int ipv6_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -563,7 +678,7 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname,
return err;
}
- case IPV6_PKTOPTIONS:
+ case IPV6_2292PKTOPTIONS:
{
struct msghdr msg;
struct sk_buff *skb;
@@ -597,6 +712,16 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname,
int hlim = np->mcast_hops;
put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim);
}
+ if (np->rxopt.bits.rxoinfo) {
+ struct in6_pktinfo src_info;
+ src_info.ipi6_ifindex = np->mcast_oif;
+ ipv6_addr_copy(&src_info.ipi6_addr, &np->daddr);
+ put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info);
+ }
+ if (np->rxopt.bits.rxohlim) {
+ int hlim = np->mcast_hops;
+ put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
+ }
}
len -= msg.msg_controllen;
return put_user(len, optlen);
@@ -621,26 +746,67 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname,
val = np->ipv6only;
break;
- case IPV6_PKTINFO:
+ case IPV6_RECVPKTINFO:
val = np->rxopt.bits.rxinfo;
break;
- case IPV6_HOPLIMIT:
+ case IPV6_2292PKTINFO:
+ val = np->rxopt.bits.rxoinfo;
+ break;
+
+ case IPV6_RECVHOPLIMIT:
val = np->rxopt.bits.rxhlim;
break;
- case IPV6_RTHDR:
+ case IPV6_2292HOPLIMIT:
+ val = np->rxopt.bits.rxohlim;
+ break;
+
+ case IPV6_RECVRTHDR:
val = np->rxopt.bits.srcrt;
break;
+ case IPV6_2292RTHDR:
+ val = np->rxopt.bits.osrcrt;
+ break;
+
case IPV6_HOPOPTS:
+ case IPV6_RTHDRDSTOPTS:
+ case IPV6_RTHDR:
+ case IPV6_DSTOPTS:
+ {
+
+ lock_sock(sk);
+ len = ipv6_getsockopt_sticky(sk, np->opt->hopopt,
+ optval, len);
+ release_sock(sk);
+ return put_user(len, optlen);
+ }
+
+ case IPV6_RECVHOPOPTS:
val = np->rxopt.bits.hopopts;
break;
- case IPV6_DSTOPTS:
+ case IPV6_2292HOPOPTS:
+ val = np->rxopt.bits.ohopopts;
+ break;
+
+ case IPV6_RECVDSTOPTS:
val = np->rxopt.bits.dstopts;
break;
+ case IPV6_2292DSTOPTS:
+ val = np->rxopt.bits.odstopts;
+ break;
+
+ case IPV6_TCLASS:
+ val = np->tclass;
+ break;
+
+ case IPV6_RECVTCLASS:
+ val = np->rxopt.bits.rxtclass;
+ break;
+
case IPV6_FLOWINFO:
val = np->rxopt.bits.rxflow;
break;
diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c
index 5ade5a5d199..37a4a99c9fe 100644
--- a/net/ipv6/ipv6_syms.c
+++ b/net/ipv6/ipv6_syms.c
@@ -15,9 +15,6 @@ EXPORT_SYMBOL(ndisc_mc_map);
EXPORT_SYMBOL(register_inet6addr_notifier);
EXPORT_SYMBOL(unregister_inet6addr_notifier);
EXPORT_SYMBOL(ip6_route_output);
-#ifdef CONFIG_NETFILTER
-EXPORT_SYMBOL(ip6_route_me_harder);
-#endif
EXPORT_SYMBOL(addrconf_lock);
EXPORT_SYMBOL(ipv6_setsockopt);
EXPORT_SYMBOL(ipv6_getsockopt);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 29fed6e58d0..f15e04ad026 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -164,7 +164,7 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
#define MLDV2_MASK(value, nb) ((nb)>=32 ? (value) : ((1<<(nb))-1) & (value))
#define MLDV2_EXP(thresh, nbmant, nbexp, value) \
((value) < (thresh) ? (value) : \
- ((MLDV2_MASK(value, nbmant) | (1<<(nbmant+nbexp))) << \
+ ((MLDV2_MASK(value, nbmant) | (1<<(nbmant))) << \
(MLDV2_MASK((value) >> (nbmant), nbexp) + (nbexp))))
#define MLDV2_QQIC(value) MLDV2_EXP(0x80, 4, 3, value)
@@ -545,8 +545,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max));
goto done;
}
- } else
+ } else {
newpsl = NULL;
+ (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0);
+ }
psl = pmc->sflist;
if (psl) {
(void) ip6_mc_del_src(idev, group, pmc->sfmode,
@@ -1087,7 +1089,7 @@ static void mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
int igmp6_event_query(struct sk_buff *skb)
{
- struct mld2_query *mlh2 = (struct mld2_query *) skb->h.raw;
+ struct mld2_query *mlh2 = NULL;
struct ifmcaddr6 *ma;
struct in6_addr *group;
unsigned long max_delay;
@@ -1140,6 +1142,13 @@ int igmp6_event_query(struct sk_buff *skb)
/* clear deleted report items */
mld_clear_delrec(idev);
} else if (len >= 28) {
+ int srcs_offset = sizeof(struct mld2_query) -
+ sizeof(struct icmp6hdr);
+ if (!pskb_may_pull(skb, srcs_offset)) {
+ in6_dev_put(idev);
+ return -EINVAL;
+ }
+ mlh2 = (struct mld2_query *) skb->h.raw;
max_delay = (MLDV2_MRC(ntohs(mlh2->mrc))*HZ)/1000;
if (!max_delay)
max_delay = 1;
@@ -1156,7 +1165,15 @@ int igmp6_event_query(struct sk_buff *skb)
return 0;
}
/* mark sources to include, if group & source-specific */
- mark = mlh2->nsrcs != 0;
+ if (mlh2->nsrcs != 0) {
+ if (!pskb_may_pull(skb, srcs_offset +
+ mlh2->nsrcs * sizeof(struct in6_addr))) {
+ in6_dev_put(idev);
+ return -EINVAL;
+ }
+ mlh2 = (struct mld2_query *) skb->h.raw;
+ mark = 1;
+ }
} else {
in6_dev_put(idev);
return -EINVAL;
@@ -1393,7 +1410,7 @@ static void mld_sendpack(struct sk_buff *skb)
static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel)
{
- return sizeof(struct mld2_grec) + 4*mld_scount(pmc,type,gdel,sdel);
+ return sizeof(struct mld2_grec) + 16 * mld_scount(pmc,type,gdel,sdel);
}
static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc,
@@ -1968,7 +1985,7 @@ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc)
}
pmc->mca_sources = NULL;
pmc->mca_sfmode = MCAST_EXCLUDE;
- pmc->mca_sfcount[MCAST_EXCLUDE] = 0;
+ pmc->mca_sfcount[MCAST_INCLUDE] = 0;
pmc->mca_sfcount[MCAST_EXCLUDE] = 1;
}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7ae72d4c9bd..305d9ee6d7d 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -447,10 +447,8 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
return;
err = xfrm_lookup(&dst, &fl, NULL, 0);
- if (err < 0) {
- dst_release(dst);
+ if (err < 0)
return;
- }
if (inc_opt) {
if (dev->addr_len)
@@ -539,10 +537,8 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
return;
err = xfrm_lookup(&dst, &fl, NULL, 0);
- if (err < 0) {
- dst_release(dst);
+ if (err < 0)
return;
- }
len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr);
send_llinfo = dev->addr_len && !ipv6_addr_any(saddr);
@@ -616,10 +612,8 @@ void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr,
return;
err = xfrm_lookup(&dst, &fl, NULL, 0);
- if (err < 0) {
- dst_release(dst);
+ if (err < 0)
return;
- }
len = sizeof(struct icmp6hdr);
if (dev->addr_len)
@@ -812,7 +806,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
if (ipv6_chk_acast_addr(dev, &msg->target) ||
(idev->cnf.forwarding &&
pneigh_lookup(&nd_tbl, &msg->target, dev, 0))) {
- if (skb->stamp.tv_sec != LOCALLY_ENQUEUED &&
+ if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
skb->pkt_type != PACKET_HOST &&
inc != 0 &&
idev->nd_parms->proxy_delay != 0) {
@@ -1353,10 +1347,8 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
return;
err = xfrm_lookup(&dst, &fl, NULL, 0);
- if (err) {
- dst_release(dst);
+ if (err)
return;
- }
rt = (struct rt6_info *) dst;
@@ -1458,7 +1450,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
static void pndisc_redo(struct sk_buff *skb)
{
- ndisc_rcv(skb);
+ ndisc_recv_ns(skb);
kfree_skb(skb);
}
@@ -1487,6 +1479,8 @@ int ndisc_rcv(struct sk_buff *skb)
return 0;
}
+ memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+
switch (msg->icmph.icmp6_type) {
case NDISC_NEIGHBOUR_SOLICITATION:
ndisc_recv_ns(skb);
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
new file mode 100644
index 00000000000..f8626ebf90f
--- /dev/null
+++ b/net/ipv6/netfilter.c
@@ -0,0 +1,104 @@
+#include <linux/config.h>
+#include <linux/init.h>
+
+#ifdef CONFIG_NETFILTER
+
+#include <linux/kernel.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/dst.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+
+int ip6_route_me_harder(struct sk_buff *skb)
+{
+ struct ipv6hdr *iph = skb->nh.ipv6h;
+ struct dst_entry *dst;
+ struct flowi fl = {
+ .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
+ .nl_u =
+ { .ip6_u =
+ { .daddr = iph->daddr,
+ .saddr = iph->saddr, } },
+ .proto = iph->nexthdr,
+ };
+
+ dst = ip6_route_output(skb->sk, &fl);
+
+ if (dst->error) {
+ IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
+ LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n");
+ dst_release(dst);
+ return -EINVAL;
+ }
+
+ /* Drop old route. */
+ dst_release(skb->dst);
+
+ skb->dst = dst;
+ return 0;
+}
+EXPORT_SYMBOL(ip6_route_me_harder);
+
+/*
+ * Extra routing may be needed on local out, as the QUEUE target never
+ * returns control to the table.
+ */
+
+struct ip6_rt_info {
+ struct in6_addr daddr;
+ struct in6_addr saddr;
+};
+
+static void save(const struct sk_buff *skb, struct nf_info *info)
+{
+ struct ip6_rt_info *rt_info = nf_info_reroute(info);
+
+ if (info->hook == NF_IP6_LOCAL_OUT) {
+ struct ipv6hdr *iph = skb->nh.ipv6h;
+
+ rt_info->daddr = iph->daddr;
+ rt_info->saddr = iph->saddr;
+ }
+}
+
+static int reroute(struct sk_buff **pskb, const struct nf_info *info)
+{
+ struct ip6_rt_info *rt_info = nf_info_reroute(info);
+
+ if (info->hook == NF_IP6_LOCAL_OUT) {
+ struct ipv6hdr *iph = (*pskb)->nh.ipv6h;
+ if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
+ !ipv6_addr_equal(&iph->saddr, &rt_info->saddr))
+ return ip6_route_me_harder(*pskb);
+ }
+ return 0;
+}
+
+static struct nf_queue_rerouter ip6_reroute = {
+ .rer_size = sizeof(struct ip6_rt_info),
+ .save = &save,
+ .reroute = &reroute,
+};
+
+int __init ipv6_netfilter_init(void)
+{
+ return nf_register_queue_rerouter(PF_INET6, &ip6_reroute);
+}
+
+void ipv6_netfilter_fini(void)
+{
+ nf_unregister_queue_rerouter(PF_INET6);
+}
+
+#else /* CONFIG_NETFILTER */
+int __init ipv6_netfilter_init(void)
+{
+ return 0;
+}
+
+void ipv6_netfilter_fini(void)
+{
+}
+#endif /* CONFIG_NETFILTER */
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 77ec704c9ee..bb7ccfe33f2 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -10,13 +10,16 @@ menu "IPv6: Netfilter Configuration (EXPERIMENTAL)"
# dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK
#fi
config IP6_NF_QUEUE
- tristate "Userspace queueing via NETLINK"
+ tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)"
---help---
This option adds a queue handler to the kernel for IPv6
- packets which lets us to receive the filtered packets
- with QUEUE target using libiptc as we can do with
- the IPv4 now.
+ packets which enables users to receive the filtered packets
+ with QUEUE target using libipq.
+
+ This option enables the old IPv6-only "ip6_queue" implementation
+ which has been obsoleted by the new "nfnetlink_queue" code (see
+ CONFIG_NETFILTER_NETLINK_QUEUE).
(C) Fernando Anton 2001
IPv64 Project - Work based in IPv64 draft by Arturo Azcorra.
@@ -196,6 +199,27 @@ config IP6_NF_TARGET_LOG
To compile it as a module, choose M here. If unsure, say N.
+config IP6_NF_TARGET_REJECT
+ tristate "REJECT target support"
+ depends on IP6_NF_FILTER
+ help
+ The REJECT target allows a filtering rule to specify that an ICMPv6
+ error should be issued in response to an incoming packet, rather
+ than silently being dropped.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP6_NF_TARGET_NFQUEUE
+ tristate "NFQUEUE Target Support"
+ depends on IP_NF_IPTABLES
+ help
+ This Target replaced the old obsolete QUEUE target.
+
+ As opposed to QUEUE, it supports 65535 different queues,
+ not just one.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
# if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then
# dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER
# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
@@ -226,6 +250,22 @@ config IP6_NF_TARGET_MARK
To compile it as a module, choose M here. If unsure, say N.
+config IP6_NF_TARGET_HL
+ tristate 'HL (hoplimit) target support'
+ depends on IP6_NF_MANGLE
+ help
+ This option adds a `HL' target, which enables the user to decrement
+ the hoplimit value of the IPv6 header or set it to a given (lower)
+ value.
+
+ While it is safe to decrement the hoplimit value, this option also
+ enables functionality to increment and set the hoplimit value of the
+ IPv6 header to arbitrary values. This is EXTREMELY DANGEROUS since
+ you can easily create immortal packets that loop forever on the
+ network.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
#dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES
config IP6_NF_RAW
tristate 'raw table support (required for TRACE)'
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 2e51714953b..2b2c370e8b1 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -20,7 +20,10 @@ obj-$(CONFIG_IP6_NF_MATCH_PHYSDEV) += ip6t_physdev.o
obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o
obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o
obj-$(CONFIG_IP6_NF_TARGET_MARK) += ip6t_MARK.o
+obj-$(CONFIG_IP6_NF_TARGET_HL) += ip6t_HL.o
+obj-$(CONFIG_IP6_NF_TARGET_NFQUEUE) += ip6t_NFQUEUE.o
obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o
obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o
obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o
+obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o
diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
index 5493180f0d4..5027bbe6415 100644
--- a/net/ipv6/netfilter/ip6_queue.c
+++ b/net/ipv6/netfilter/ip6_queue.c
@@ -47,16 +47,10 @@
#define NET_IPQ_QMAX 2088
#define NET_IPQ_QMAX_NAME "ip6_queue_maxlen"
-struct ipq_rt_info {
- struct in6_addr daddr;
- struct in6_addr saddr;
-};
-
struct ipq_queue_entry {
struct list_head list;
struct nf_info *info;
struct sk_buff *skb;
- struct ipq_rt_info rt_info;
};
typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
@@ -211,6 +205,12 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
break;
case IPQ_COPY_PACKET:
+ if (entry->skb->ip_summed == CHECKSUM_HW &&
+ (*errp = skb_checksum_help(entry->skb,
+ entry->info->outdev == NULL))) {
+ read_unlock_bh(&queue_lock);
+ return NULL;
+ }
if (copy_range == 0 || copy_range > entry->skb->len)
data_len = entry->skb->len;
else
@@ -238,8 +238,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
pmsg->packet_id = (unsigned long )entry;
pmsg->data_len = data_len;
- pmsg->timestamp_sec = entry->skb->stamp.tv_sec;
- pmsg->timestamp_usec = entry->skb->stamp.tv_usec;
+ pmsg->timestamp_sec = entry->skb->tstamp.off_sec;
+ pmsg->timestamp_usec = entry->skb->tstamp.off_usec;
pmsg->mark = entry->skb->nfmark;
pmsg->hook = entry->info->hook;
pmsg->hw_protocol = entry->skb->protocol;
@@ -278,7 +278,8 @@ nlmsg_failure:
}
static int
-ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
+ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
+ unsigned int queuenum, void *data)
{
int status = -EINVAL;
struct sk_buff *nskb;
@@ -296,13 +297,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
entry->info = info;
entry->skb = skb;
- if (entry->info->hook == NF_IP_LOCAL_OUT) {
- struct ipv6hdr *iph = skb->nh.ipv6h;
-
- entry->rt_info.daddr = iph->daddr;
- entry->rt_info.saddr = iph->saddr;
- }
-
nskb = ipq_build_packet_message(entry, &status);
if (nskb == NULL)
goto err_out_free;
@@ -378,22 +372,11 @@ ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
}
skb_put(e->skb, diff);
}
- if (!skb_ip_make_writable(&e->skb, v->data_len))
+ if (!skb_make_writable(&e->skb, v->data_len))
return -ENOMEM;
memcpy(e->skb->data, v->payload, v->data_len);
- e->skb->nfcache |= NFC_ALTERED;
-
- /*
- * Extra routing may needed on local out, as the QUEUE target never
- * returns control to the table.
- * Not a nice way to cmp, but works
- */
- if (e->info->hook == NF_IP_LOCAL_OUT) {
- struct ipv6hdr *iph = e->skb->nh.ipv6h;
- if (!ipv6_addr_equal(&iph->daddr, &e->rt_info.daddr) ||
- !ipv6_addr_equal(&iph->saddr, &e->rt_info.saddr))
- return ip6_route_me_harder(e->skb);
- }
+ e->skb->ip_summed = CHECKSUM_NONE;
+
return 0;
}
@@ -669,6 +652,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length)
return len;
}
+static struct nf_queue_handler nfqh = {
+ .name = "ip6_queue",
+ .outfn = &ipq_enqueue_packet,
+};
+
static int
init_or_cleanup(int init)
{
@@ -679,7 +667,8 @@ init_or_cleanup(int init)
goto cleanup;
netlink_register_notifier(&ipq_nl_notifier);
- ipqnl = netlink_kernel_create(NETLINK_IP6_FW, ipq_rcv_sk);
+ ipqnl = netlink_kernel_create(NETLINK_IP6_FW, 0, ipq_rcv_sk,
+ THIS_MODULE);
if (ipqnl == NULL) {
printk(KERN_ERR "ip6_queue: failed to create netlink socket\n");
goto cleanup_netlink_notifier;
@@ -696,7 +685,7 @@ init_or_cleanup(int init)
register_netdevice_notifier(&ipq_dev_notifier);
ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
- status = nf_register_queue_handler(PF_INET6, ipq_enqueue_packet, NULL);
+ status = nf_register_queue_handler(PF_INET6, &nfqh);
if (status < 0) {
printk(KERN_ERR "ip6_queue: failed to register queue handler\n");
goto cleanup_sysctl;
@@ -704,7 +693,7 @@ init_or_cleanup(int init)
return status;
cleanup:
- nf_unregister_queue_handler(PF_INET6);
+ nf_unregister_queue_handlers(&nfqh);
synchronize_net();
ipq_flush(NF_DROP);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 73034511c8d..7d492226c16 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -2,7 +2,7 @@
* Packet matching code.
*
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
- * Copyright (C) 2000-2002 Netfilter core team <coreteam@netfilter.org>
+ * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -23,11 +23,11 @@
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmpv6.h>
-#include <net/ip.h>
#include <net/ipv6.h>
#include <asm/uaccess.h>
#include <asm/semaphore.h>
#include <linux/proc_fs.h>
+#include <linux/cpumask.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
@@ -79,13 +79,12 @@ static DECLARE_MUTEX(ip6t_mutex);
#define inline
#endif
-/* Locking is simple: we assume at worst case there will be one packet
- in user context and one from bottom halves (or soft irq if Alexey's
- softnet patch was applied).
-
+/*
We keep a set of rules for each CPU, so we can avoid write-locking
- them; doing a readlock_bh() stops packets coming through if we're
- in user context.
+ them in the softirq when updating the counters and therefore
+ only need to read-lock in the softirq; doing a write_lock_bh() in user
+ context stops packets coming through and allows user context to read
+ the counters or update the rules.
To be cache friendly on SMP, we arrange them like so:
[ n-entries ]
@@ -355,7 +354,7 @@ ip6t_do_table(struct sk_buff **pskb,
struct ip6t_table *table,
void *userdata)
{
- static const char nulldevname[IFNAMSIZ];
+ static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
int offset = 0;
unsigned int protoff = 0;
int hotdrop = 0;
@@ -368,7 +367,6 @@ ip6t_do_table(struct sk_buff **pskb,
/* Initialization */
indev = in ? in->name : nulldevname;
outdev = out ? out->name : nulldevname;
-
/* We handle fragments by dealing with the first fragment as
* if it was a normal packet. All other fragments are treated
* normally, except that they will NEVER match rules that ask
@@ -401,7 +399,6 @@ ip6t_do_table(struct sk_buff **pskb,
do {
IP_NF_ASSERT(e);
IP_NF_ASSERT(back);
- (*pskb)->nfcache |= e->nfcache;
if (ip6_packet_match(*pskb, indev, outdev, &e->ipv6,
&protoff, &offset)) {
struct ip6t_entry_target *t;
@@ -434,8 +431,8 @@ ip6t_do_table(struct sk_buff **pskb,
back->comefrom);
continue;
}
- if (table_base + v
- != (void *)e + e->next_offset) {
+ if (table_base + v != (void *)e + e->next_offset
+ && !(e->ipv6.flags & IP6T_F_GOTO)) {
/* Save old back ptr in next entry */
struct ip6t_entry *next
= (void *)e + e->next_offset;
@@ -497,75 +494,145 @@ ip6t_do_table(struct sk_buff **pskb,
#endif
}
-/* If it succeeds, returns element and locks mutex */
-static inline void *
-find_inlist_lock_noload(struct list_head *head,
- const char *name,
- int *error,
- struct semaphore *mutex)
+/*
+ * These are weird, but module loading must not be done with mutex
+ * held (since they will register), and we have to have a single
+ * function to use try_then_request_module().
+ */
+
+/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
+static inline struct ip6t_table *find_table_lock(const char *name)
{
- void *ret;
+ struct ip6t_table *t;
-#if 1
- duprintf("find_inlist: searching for `%s' in %s.\n",
- name, head == &ip6t_target ? "ip6t_target"
- : head == &ip6t_match ? "ip6t_match"
- : head == &ip6t_tables ? "ip6t_tables" : "UNKNOWN");
-#endif
+ if (down_interruptible(&ip6t_mutex) != 0)
+ return ERR_PTR(-EINTR);
- *error = down_interruptible(mutex);
- if (*error != 0)
- return NULL;
+ list_for_each_entry(t, &ip6t_tables, list)
+ if (strcmp(t->name, name) == 0 && try_module_get(t->me))
+ return t;
+ up(&ip6t_mutex);
+ return NULL;
+}
- ret = list_named_find(head, name);
- if (!ret) {
- *error = -ENOENT;
- up(mutex);
+/* Find match, grabs ref. Returns ERR_PTR() on error. */
+static inline struct ip6t_match *find_match(const char *name, u8 revision)
+{
+ struct ip6t_match *m;
+ int err = 0;
+
+ if (down_interruptible(&ip6t_mutex) != 0)
+ return ERR_PTR(-EINTR);
+
+ list_for_each_entry(m, &ip6t_match, list) {
+ if (strcmp(m->name, name) == 0) {
+ if (m->revision == revision) {
+ if (try_module_get(m->me)) {
+ up(&ip6t_mutex);
+ return m;
+ }
+ } else
+ err = -EPROTOTYPE; /* Found something. */
+ }
}
- return ret;
+ up(&ip6t_mutex);
+ return ERR_PTR(err);
}
-#ifndef CONFIG_KMOD
-#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m))
-#else
-static void *
-find_inlist_lock(struct list_head *head,
- const char *name,
- const char *prefix,
- int *error,
- struct semaphore *mutex)
+/* Find target, grabs ref. Returns ERR_PTR() on error. */
+static inline struct ip6t_target *find_target(const char *name, u8 revision)
{
- void *ret;
+ struct ip6t_target *t;
+ int err = 0;
- ret = find_inlist_lock_noload(head, name, error, mutex);
- if (!ret) {
- duprintf("find_inlist: loading `%s%s'.\n", prefix, name);
- request_module("%s%s", prefix, name);
- ret = find_inlist_lock_noload(head, name, error, mutex);
+ if (down_interruptible(&ip6t_mutex) != 0)
+ return ERR_PTR(-EINTR);
+
+ list_for_each_entry(t, &ip6t_target, list) {
+ if (strcmp(t->name, name) == 0) {
+ if (t->revision == revision) {
+ if (try_module_get(t->me)) {
+ up(&ip6t_mutex);
+ return t;
+ }
+ } else
+ err = -EPROTOTYPE; /* Found something. */
+ }
}
+ up(&ip6t_mutex);
+ return ERR_PTR(err);
+}
- return ret;
+struct ip6t_target *ip6t_find_target(const char *name, u8 revision)
+{
+ struct ip6t_target *target;
+
+ target = try_then_request_module(find_target(name, revision),
+ "ip6t_%s", name);
+ if (IS_ERR(target) || !target)
+ return NULL;
+ return target;
}
-#endif
-static inline struct ip6t_table *
-ip6t_find_table_lock(const char *name, int *error, struct semaphore *mutex)
+static int match_revfn(const char *name, u8 revision, int *bestp)
{
- return find_inlist_lock(&ip6t_tables, name, "ip6table_", error, mutex);
+ struct ip6t_match *m;
+ int have_rev = 0;
+
+ list_for_each_entry(m, &ip6t_match, list) {
+ if (strcmp(m->name, name) == 0) {
+ if (m->revision > *bestp)
+ *bestp = m->revision;
+ if (m->revision == revision)
+ have_rev = 1;
+ }
+ }
+ return have_rev;
}
-static inline struct ip6t_match *
-find_match_lock(const char *name, int *error, struct semaphore *mutex)
+static int target_revfn(const char *name, u8 revision, int *bestp)
{
- return find_inlist_lock(&ip6t_match, name, "ip6t_", error, mutex);
+ struct ip6t_target *t;
+ int have_rev = 0;
+
+ list_for_each_entry(t, &ip6t_target, list) {
+ if (strcmp(t->name, name) == 0) {
+ if (t->revision > *bestp)
+ *bestp = t->revision;
+ if (t->revision == revision)
+ have_rev = 1;
+ }
+ }
+ return have_rev;
}
-static struct ip6t_target *
-ip6t_find_target_lock(const char *name, int *error, struct semaphore *mutex)
+/* Returns true or false (if no such extension at all) */
+static inline int find_revision(const char *name, u8 revision,
+ int (*revfn)(const char *, u8, int *),
+ int *err)
{
- return find_inlist_lock(&ip6t_target, name, "ip6t_", error, mutex);
+ int have_rev, best = -1;
+
+ if (down_interruptible(&ip6t_mutex) != 0) {
+ *err = -EINTR;
+ return 1;
+ }
+ have_rev = revfn(name, revision, &best);
+ up(&ip6t_mutex);
+
+ /* Nothing at all? Return 0 to try loading module. */
+ if (best == -1) {
+ *err = -ENOENT;
+ return 0;
+ }
+
+ *err = best;
+ if (!have_rev)
+ *err = -EPROTONOSUPPORT;
+ return 1;
}
+
/* All zeroes == unconditional rule. */
static inline int
unconditional(const struct ip6t_ip6 *ipv6)
@@ -725,20 +792,16 @@ check_match(struct ip6t_entry_match *m,
unsigned int hookmask,
unsigned int *i)
{
- int ret;
struct ip6t_match *match;
- match = find_match_lock(m->u.user.name, &ret, &ip6t_mutex);
- if (!match) {
- // duprintf("check_match: `%s' not found\n", m->u.name);
- return ret;
- }
- if (!try_module_get(match->me)) {
- up(&ip6t_mutex);
- return -ENOENT;
+ match = try_then_request_module(find_match(m->u.user.name,
+ m->u.user.revision),
+ "ip6t_%s", m->u.user.name);
+ if (IS_ERR(match) || !match) {
+ duprintf("check_match: `%s' not found\n", m->u.user.name);
+ return match ? PTR_ERR(match) : -ENOENT;
}
m->u.kernel.match = match;
- up(&ip6t_mutex);
if (m->u.kernel.match->checkentry
&& !m->u.kernel.match->checkentry(name, ipv6, m->data,
@@ -776,22 +839,16 @@ check_entry(struct ip6t_entry *e, const char *name, unsigned int size,
goto cleanup_matches;
t = ip6t_get_target(e);
- target = ip6t_find_target_lock(t->u.user.name, &ret, &ip6t_mutex);
- if (!target) {
+ target = try_then_request_module(find_target(t->u.user.name,
+ t->u.user.revision),
+ "ip6t_%s", t->u.user.name);
+ if (IS_ERR(target) || !target) {
duprintf("check_entry: `%s' not found\n", t->u.user.name);
- goto cleanup_matches;
- }
- if (!try_module_get(target->me)) {
- up(&ip6t_mutex);
- ret = -ENOENT;
+ ret = target ? PTR_ERR(target) : -ENOENT;
goto cleanup_matches;
}
t->u.kernel.target = target;
- up(&ip6t_mutex);
- if (!t->u.kernel.target) {
- ret = -EBUSY;
- goto cleanup_matches;
- }
+
if (t->u.kernel.target == &ip6t_standard_target) {
if (!standard_check(t, size)) {
ret = -EINVAL;
@@ -951,8 +1008,10 @@ translate_table(const char *name,
}
/* And one copy for every other CPU */
- for (i = 1; i < num_possible_cpus(); i++) {
- memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
+ for_each_cpu(i) {
+ if (i == 0)
+ continue;
+ memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
newinfo->entries,
SMP_ALIGN(newinfo->size));
}
@@ -973,7 +1032,7 @@ replace_table(struct ip6t_table *table,
struct ip6t_entry *table_base;
unsigned int i;
- for (i = 0; i < num_possible_cpus(); i++) {
+ for_each_cpu(i) {
table_base =
(void *)newinfo->entries
+ TABLE_OFFSET(newinfo, i);
@@ -1020,7 +1079,7 @@ get_counters(const struct ip6t_table_info *t,
unsigned int cpu;
unsigned int i;
- for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+ for_each_cpu(cpu) {
i = 0;
IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
t->size,
@@ -1116,8 +1175,8 @@ get_entries(const struct ip6t_get_entries *entries,
int ret;
struct ip6t_table *t;
- t = ip6t_find_table_lock(entries->name, &ret, &ip6t_mutex);
- if (t) {
+ t = find_table_lock(entries->name);
+ if (t && !IS_ERR(t)) {
duprintf("t->private->number = %u\n",
t->private->number);
if (entries->size == t->private->size)
@@ -1129,10 +1188,10 @@ get_entries(const struct ip6t_get_entries *entries,
entries->size);
ret = -EINVAL;
}
+ module_put(t->me);
up(&ip6t_mutex);
} else
- duprintf("get_entries: Can't find %s!\n",
- entries->name);
+ ret = t ? PTR_ERR(t) : -ENOENT;
return ret;
}
@@ -1154,7 +1213,8 @@ do_replace(void __user *user, unsigned int len)
return -ENOMEM;
newinfo = vmalloc(sizeof(struct ip6t_table_info)
- + SMP_ALIGN(tmp.size) * num_possible_cpus());
+ + SMP_ALIGN(tmp.size) *
+ (highest_possible_processor_id()+1));
if (!newinfo)
return -ENOMEM;
@@ -1179,22 +1239,19 @@ do_replace(void __user *user, unsigned int len)
duprintf("ip_tables: Translated table\n");
- t = ip6t_find_table_lock(tmp.name, &ret, &ip6t_mutex);
- if (!t)
+ t = try_then_request_module(find_table_lock(tmp.name),
+ "ip6table_%s", tmp.name);
+ if (!t || IS_ERR(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
goto free_newinfo_counters_untrans;
+ }
/* You lied! */
if (tmp.valid_hooks != t->valid_hooks) {
duprintf("Valid hook crap: %08X vs %08X\n",
tmp.valid_hooks, t->valid_hooks);
ret = -EINVAL;
- goto free_newinfo_counters_untrans_unlock;
- }
-
- /* Get a reference in advance, we're not allowed fail later */
- if (!try_module_get(t->me)) {
- ret = -EBUSY;
- goto free_newinfo_counters_untrans_unlock;
+ goto put_module;
}
oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
@@ -1216,7 +1273,6 @@ do_replace(void __user *user, unsigned int len)
/* Decrease module usage counts and free resource */
IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
vfree(oldinfo);
- /* Silent error: too late now. */
if (copy_to_user(tmp.counters, counters,
sizeof(struct ip6t_counters) * tmp.num_counters) != 0)
ret = -EFAULT;
@@ -1226,7 +1282,6 @@ do_replace(void __user *user, unsigned int len)
put_module:
module_put(t->me);
- free_newinfo_counters_untrans_unlock:
up(&ip6t_mutex);
free_newinfo_counters_untrans:
IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
@@ -1265,7 +1320,7 @@ do_add_counters(void __user *user, unsigned int len)
unsigned int i;
struct ip6t_counters_info tmp, *paddc;
struct ip6t_table *t;
- int ret;
+ int ret = 0;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1282,9 +1337,11 @@ do_add_counters(void __user *user, unsigned int len)
goto free;
}
- t = ip6t_find_table_lock(tmp.name, &ret, &ip6t_mutex);
- if (!t)
+ t = find_table_lock(tmp.name);
+ if (!t || IS_ERR(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
goto free;
+ }
write_lock_bh(&t->lock);
if (t->private->number != paddc->num_counters) {
@@ -1301,6 +1358,7 @@ do_add_counters(void __user *user, unsigned int len)
unlock_up_free:
write_unlock_bh(&t->lock);
up(&ip6t_mutex);
+ module_put(t->me);
free:
vfree(paddc);
@@ -1357,8 +1415,10 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
break;
}
name[IP6T_TABLE_MAXNAMELEN-1] = '\0';
- t = ip6t_find_table_lock(name, &ret, &ip6t_mutex);
- if (t) {
+
+ t = try_then_request_module(find_table_lock(name),
+ "ip6table_%s", name);
+ if (t && !IS_ERR(t)) {
struct ip6t_getinfo info;
info.valid_hooks = t->valid_hooks;
@@ -1374,9 +1434,10 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EFAULT;
else
ret = 0;
-
up(&ip6t_mutex);
- }
+ module_put(t->me);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
}
break;
@@ -1397,6 +1458,31 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
break;
}
+ case IP6T_SO_GET_REVISION_MATCH:
+ case IP6T_SO_GET_REVISION_TARGET: {
+ struct ip6t_get_revision rev;
+ int (*revfn)(const char *, u8, int *);
+
+ if (*len != sizeof(rev)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+
+ if (cmd == IP6T_SO_GET_REVISION_TARGET)
+ revfn = target_revfn;
+ else
+ revfn = match_revfn;
+
+ try_then_request_module(find_revision(rev.name, rev.revision,
+ revfn, &ret),
+ "ip6t_%s", rev.name);
+ break;
+ }
+
default:
duprintf("do_ip6t_get_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
@@ -1414,12 +1500,7 @@ ip6t_register_target(struct ip6t_target *target)
ret = down_interruptible(&ip6t_mutex);
if (ret != 0)
return ret;
-
- if (!list_named_insert(&ip6t_target, target)) {
- duprintf("ip6t_register_target: `%s' already in list!\n",
- target->name);
- ret = -EINVAL;
- }
+ list_add(&target->list, &ip6t_target);
up(&ip6t_mutex);
return ret;
}
@@ -1441,11 +1522,7 @@ ip6t_register_match(struct ip6t_match *match)
if (ret != 0)
return ret;
- if (!list_named_insert(&ip6t_match, match)) {
- duprintf("ip6t_register_match: `%s' already in list!\n",
- match->name);
- ret = -EINVAL;
- }
+ list_add(&match->list, &ip6t_match);
up(&ip6t_mutex);
return ret;
@@ -1468,7 +1545,8 @@ int ip6t_register_table(struct ip6t_table *table,
= { 0, 0, 0, { 0 }, { 0 }, { } };
newinfo = vmalloc(sizeof(struct ip6t_table_info)
- + SMP_ALIGN(repl->size) * num_possible_cpus());
+ + SMP_ALIGN(repl->size) *
+ (highest_possible_processor_id()+1));
if (!newinfo)
return -ENOMEM;
@@ -1956,6 +2034,57 @@ static void __exit fini(void)
#endif
}
+/*
+ * find specified header up to transport protocol header.
+ * If the target header is found, its offset is stored in *offset
+ * and 0 is returned; otherwise, -1 is returned.
+ *
+ * Notes: - non-1st Fragment Header isn't skipped.
+ * - ESP header isn't skipped.
+ * - The target header may be truncated.
+ */
+int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, u8 target)
+{
+ unsigned int start = (u8*)(skb->nh.ipv6h + 1) - skb->data;
+ u8 nexthdr = skb->nh.ipv6h->nexthdr;
+ unsigned int len = skb->len - start;
+
+ while (nexthdr != target) {
+ struct ipv6_opt_hdr _hdr, *hp;
+ unsigned int hdrlen;
+
+ if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE)
+ return -1;
+ hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return -1;
+ if (nexthdr == NEXTHDR_FRAGMENT) {
+ unsigned short _frag_off, *fp;
+ fp = skb_header_pointer(skb,
+ start+offsetof(struct frag_hdr,
+ frag_off),
+ sizeof(_frag_off),
+ &_frag_off);
+ if (fp == NULL)
+ return -1;
+
+ if (ntohs(*fp) & ~0x7)
+ return -1;
+ hdrlen = 8;
+ } else if (nexthdr == NEXTHDR_AUTH)
+ hdrlen = (hp->hdrlen + 2) << 2;
+ else
+ hdrlen = ipv6_optlen(hp);
+
+ nexthdr = hp->nexthdr;
+ len -= hdrlen;
+ start += hdrlen;
+ }
+
+ *offset = start;
+ return 0;
+}
+
EXPORT_SYMBOL(ip6t_register_table);
EXPORT_SYMBOL(ip6t_unregister_table);
EXPORT_SYMBOL(ip6t_do_table);
@@ -1964,6 +2093,7 @@ EXPORT_SYMBOL(ip6t_unregister_match);
EXPORT_SYMBOL(ip6t_register_target);
EXPORT_SYMBOL(ip6t_unregister_target);
EXPORT_SYMBOL(ip6t_ext_hdr);
+EXPORT_SYMBOL(ipv6_find_hdr);
module_init(init);
module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c
new file mode 100644
index 00000000000..8f5549b7272
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_HL.c
@@ -0,0 +1,118 @@
+/*
+ * Hop Limit modification target for ip6tables
+ * Maciej Soltysiak <solt@dns.toxicfilms.tv>
+ * Based on HW's TTL module
+ *
+ * This software is distributed under the terms of GNU GPL
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_HL.h>
+
+MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>");
+MODULE_DESCRIPTION("IP tables Hop Limit modification module");
+MODULE_LICENSE("GPL");
+
+static unsigned int ip6t_hl_target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo, void *userinfo)
+{
+ struct ipv6hdr *ip6h;
+ const struct ip6t_HL_info *info = targinfo;
+ u_int16_t diffs[2];
+ int new_hl;
+
+ if (!skb_make_writable(pskb, (*pskb)->len))
+ return NF_DROP;
+
+ ip6h = (*pskb)->nh.ipv6h;
+
+ switch (info->mode) {
+ case IP6T_HL_SET:
+ new_hl = info->hop_limit;
+ break;
+ case IP6T_HL_INC:
+ new_hl = ip6h->hop_limit + info->hop_limit;
+ if (new_hl > 255)
+ new_hl = 255;
+ break;
+ case IP6T_HL_DEC:
+ new_hl = ip6h->hop_limit - info->hop_limit;
+ if (new_hl < 0)
+ new_hl = 0;
+ break;
+ default:
+ new_hl = ip6h->hop_limit;
+ break;
+ }
+
+ if (new_hl != ip6h->hop_limit) {
+ diffs[0] = htons(((unsigned)ip6h->hop_limit) << 8) ^ 0xFFFF;
+ ip6h->hop_limit = new_hl;
+ diffs[1] = htons(((unsigned)ip6h->hop_limit) << 8);
+ }
+
+ return IP6T_CONTINUE;
+}
+
+static int ip6t_hl_checkentry(const char *tablename,
+ const struct ip6t_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ struct ip6t_HL_info *info = targinfo;
+
+ if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_HL_info))) {
+ printk(KERN_WARNING "ip6t_HL: targinfosize %u != %Zu\n",
+ targinfosize,
+ IP6T_ALIGN(sizeof(struct ip6t_HL_info)));
+ return 0;
+ }
+
+ if (strcmp(tablename, "mangle")) {
+ printk(KERN_WARNING "ip6t_HL: can only be called from "
+ "\"mangle\" table, not \"%s\"\n", tablename);
+ return 0;
+ }
+
+ if (info->mode > IP6T_HL_MAXMODE) {
+ printk(KERN_WARNING "ip6t_HL: invalid or unknown Mode %u\n",
+ info->mode);
+ return 0;
+ }
+
+ if ((info->mode != IP6T_HL_SET) && (info->hop_limit == 0)) {
+ printk(KERN_WARNING "ip6t_HL: increment/decrement doesn't "
+ "make sense with value 0\n");
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ip6t_target ip6t_HL = {
+ .name = "HL",
+ .target = ip6t_hl_target,
+ .checkentry = ip6t_hl_checkentry,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ return ip6t_register_target(&ip6t_HL);
+}
+
+static void __exit fini(void)
+{
+ ip6t_unregister_target(&ip6t_HL);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index a692e26a4fa..0cd1d1bd903 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -26,10 +26,6 @@ MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
MODULE_DESCRIPTION("IP6 tables LOG target module");
MODULE_LICENSE("GPL");
-static unsigned int nflog = 1;
-module_param(nflog, int, 0400);
-MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
-
struct in_device;
#include <net/route.h>
#include <linux/netfilter_ipv6/ip6t_LOG.h>
@@ -44,7 +40,7 @@ struct in_device;
static DEFINE_SPINLOCK(log_lock);
/* One level of recursion won't kill us */
-static void dump_packet(const struct ip6t_log_info *info,
+static void dump_packet(const struct nf_loginfo *info,
const struct sk_buff *skb, unsigned int ip6hoff,
int recurse)
{
@@ -53,6 +49,12 @@ static void dump_packet(const struct ip6t_log_info *info,
struct ipv6hdr _ip6h, *ih;
unsigned int ptr;
unsigned int hdrlen = 0;
+ unsigned int logflags;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_MASK;
ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
if (ih == NULL) {
@@ -84,7 +86,7 @@ static void dump_packet(const struct ip6t_log_info *info,
}
/* Max length: 48 "OPT (...) " */
- if (info->logflags & IP6T_LOG_IPOPT)
+ if (logflags & IP6T_LOG_IPOPT)
printk("OPT ( ");
switch (currenthdr) {
@@ -119,7 +121,7 @@ static void dump_packet(const struct ip6t_log_info *info,
case IPPROTO_ROUTING:
case IPPROTO_HOPOPTS:
if (fragment) {
- if (info->logflags & IP6T_LOG_IPOPT)
+ if (logflags & IP6T_LOG_IPOPT)
printk(")");
return;
}
@@ -127,7 +129,7 @@ static void dump_packet(const struct ip6t_log_info *info,
break;
/* Max Length */
case IPPROTO_AH:
- if (info->logflags & IP6T_LOG_IPOPT) {
+ if (logflags & IP6T_LOG_IPOPT) {
struct ip_auth_hdr _ahdr, *ah;
/* Max length: 3 "AH " */
@@ -158,7 +160,7 @@ static void dump_packet(const struct ip6t_log_info *info,
hdrlen = (hp->hdrlen+2)<<2;
break;
case IPPROTO_ESP:
- if (info->logflags & IP6T_LOG_IPOPT) {
+ if (logflags & IP6T_LOG_IPOPT) {
struct ip_esp_hdr _esph, *eh;
/* Max length: 4 "ESP " */
@@ -190,7 +192,7 @@ static void dump_packet(const struct ip6t_log_info *info,
printk("Unknown Ext Hdr %u", currenthdr);
return;
}
- if (info->logflags & IP6T_LOG_IPOPT)
+ if (logflags & IP6T_LOG_IPOPT)
printk(") ");
currenthdr = hp->nexthdr;
@@ -218,7 +220,7 @@ static void dump_packet(const struct ip6t_log_info *info,
printk("SPT=%u DPT=%u ",
ntohs(th->source), ntohs(th->dest));
/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
- if (info->logflags & IP6T_LOG_TCPSEQ)
+ if (logflags & IP6T_LOG_TCPSEQ)
printk("SEQ=%u ACK=%u ",
ntohl(th->seq), ntohl(th->ack_seq));
/* Max length: 13 "WINDOW=65535 " */
@@ -245,7 +247,7 @@ static void dump_packet(const struct ip6t_log_info *info,
/* Max length: 11 "URGP=65535 " */
printk("URGP=%u ", ntohs(th->urg_ptr));
- if ((info->logflags & IP6T_LOG_TCPOPT)
+ if ((logflags & IP6T_LOG_TCPOPT)
&& th->doff * 4 > sizeof(struct tcphdr)) {
u_int8_t _opt[60 - sizeof(struct tcphdr)], *op;
unsigned int i;
@@ -349,7 +351,7 @@ static void dump_packet(const struct ip6t_log_info *info,
}
/* Max length: 15 "UID=4294967295 " */
- if ((info->logflags & IP6T_LOG_UID) && recurse && skb->sk) {
+ if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) {
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->sk->sk_socket && skb->sk->sk_socket->file)
printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
@@ -357,19 +359,31 @@ static void dump_packet(const struct ip6t_log_info *info,
}
}
+/*
+ * Fallback settings that ip6t_log_packet() switches to when it is called
+ * with a NULL loginfo: level 0 and every flag in NF_LOG_MASK.
+ */
+static struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 0,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
static void
-ip6t_log_packet(unsigned int hooknum,
+ip6t_log_packet(unsigned int pf,
+ unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
- const struct ip6t_log_info *loginfo,
- const char *level_string,
+ const struct nf_loginfo *loginfo,
const char *prefix)
{
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
spin_lock_bh(&log_lock);
- printk(level_string);
- printk("%sIN=%s OUT=%s ",
- prefix == NULL ? loginfo->prefix : prefix,
+ printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+ prefix,
in ? in->name : "",
out ? out->name : "");
if (in && !out) {
@@ -416,29 +430,17 @@ ip6t_log_target(struct sk_buff **pskb,
void *userinfo)
{
const struct ip6t_log_info *loginfo = targinfo;
- char level_string[4] = "< >";
+ struct nf_loginfo li;
+
+ li.type = NF_LOG_TYPE_LOG;
+ li.u.log.level = loginfo->level;
+ li.u.log.logflags = loginfo->logflags;
- level_string[1] = '0' + (loginfo->level % 8);
- ip6t_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL);
+ nf_log_packet(PF_INET6, hooknum, *pskb, in, out, &li, loginfo->prefix);
return IP6T_CONTINUE;
}
-static void
-ip6t_logfn(unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const char *prefix)
-{
- struct ip6t_log_info loginfo = {
- .level = 0,
- .logflags = IP6T_LOG_MASK,
- .prefix = ""
- };
-
- ip6t_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix);
-}
static int ip6t_log_checkentry(const char *tablename,
const struct ip6t_entry *e,
@@ -475,20 +477,29 @@ static struct ip6t_target ip6t_log_reg = {
.me = THIS_MODULE,
};
+/*
+ * Logger descriptor passed to nf_log_register() for PF_INET6 in init();
+ * its log function is ip6t_log_packet().
+ */
+static struct nf_logger ip6t_logger = {
+ .name = "ip6t_LOG",
+ .logfn = &ip6t_log_packet,
+ .me = THIS_MODULE,
+};
+
static int __init init(void)
{
if (ip6t_register_target(&ip6t_log_reg))
return -EINVAL;
- if (nflog)
- nf_log_register(PF_INET6, &ip6t_logfn);
+ if (nf_log_register(PF_INET6, &ip6t_logger) < 0) {
+ printk(KERN_WARNING "ip6t_LOG: not logging via system console "
+ "since somebody else already registered for PF_INET6\n");
+ /* we cannot make module load fail here, since otherwise
+ * ip6tables userspace would abort */
+ }
return 0;
}
static void __exit fini(void)
{
- if (nflog)
- nf_log_unregister(PF_INET6, &ip6t_logfn);
+ nf_log_unregister_logger(&ip6t_logger);
ip6t_unregister_target(&ip6t_log_reg);
}
diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c
index d09ceb05013..0c7584f9217 100644
--- a/net/ipv6/netfilter/ip6t_MARK.c
+++ b/net/ipv6/netfilter/ip6t_MARK.c
@@ -28,10 +28,9 @@ target(struct sk_buff **pskb,
{
const struct ip6t_mark_target_info *markinfo = targinfo;
- if((*pskb)->nfmark != markinfo->mark) {
+ if((*pskb)->nfmark != markinfo->mark)
(*pskb)->nfmark = markinfo->mark;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
+
return IP6T_CONTINUE;
}
@@ -57,8 +56,12 @@ checkentry(const char *tablename,
return 1;
}
-static struct ip6t_target ip6t_mark_reg
-= { { NULL, NULL }, "MARK", target, checkentry, NULL, THIS_MODULE };
+static struct ip6t_target ip6t_mark_reg = {
+ .name = "MARK",
+ .target = target,
+ .checkentry = checkentry,
+ .me = THIS_MODULE
+};
static int __init init(void)
{
diff --git a/net/ipv6/netfilter/ip6t_NFQUEUE.c b/net/ipv6/netfilter/ip6t_NFQUEUE.c
new file mode 100644
index 00000000000..c6e3730e740
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_NFQUEUE.c
@@ -0,0 +1,70 @@
+/* ip6tables module for using new netfilter netlink queue
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv4/ipt_NFQUEUE.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("ip6tables NFQUEUE target");
+MODULE_LICENSE("GPL");
+
+/*
+ * NFQUEUE target hook.
+ *
+ * Returns the verdict that NF_QUEUE_NR() builds from the queue number
+ * stored in the rule's target data; the packet itself is not touched.
+ */
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_NFQ_info *nfq = targinfo;
+	unsigned int verdict = NF_QUEUE_NR(nfq->queuenum);
+
+	return verdict;
+}
+
+/*
+ * Validate an NFQUEUE rule: the only requirement is that the target data
+ * has the aligned size of struct ipt_NFQ_info.  Returns 1 if so, otherwise
+ * prints a warning and returns 0.
+ */
+static int
+checkentry(const char *tablename,
+	   const struct ip6t_entry *e,
+	   void *targinfo,
+	   unsigned int targinfosize,
+	   unsigned int hook_mask)
+{
+	const size_t want = IP6T_ALIGN(sizeof(struct ipt_NFQ_info));
+
+	if (targinfosize == want)
+		return 1;
+
+	printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n",
+	       targinfosize, want);
+	return 0;
+}
+
+/* Target descriptor registered under the name "NFQUEUE". */
+static struct ip6t_target ipt_NFQ_reg = {
+	.name		= "NFQUEUE",
+	.target		= target,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the NFQUEUE target and hand back the result. */
+static int __init init(void)
+{
+	int ret = ip6t_register_target(&ipt_NFQ_reg);
+
+	return ret;
+}
+
+/* Module unload: unregister the NFQUEUE target. */
+static void __exit fini(void)
+{
+	ip6t_unregister_target(&ipt_NFQ_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
new file mode 100644
index 00000000000..b03e87adca9
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -0,0 +1,281 @@
+/*
+ * IP6 tables REJECT target module
+ * Linux INET6 implementation
+ *
+ * Copyright (C)2003 USAGI/WIDE Project
+ *
+ * Authors:
+ * Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
+ *
+ * Based on net/ipv4/netfilter/ipt_REJECT.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmpv6.h>
+#include <linux/netdevice.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+#include <net/icmp.h>
+#include <net/ip6_checksum.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#include <net/flow.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_REJECT.h>
+
+MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>");
+MODULE_DESCRIPTION("IP6 tables REJECT target module");
+MODULE_LICENSE("GPL");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/*
+ * Send a TCP RST in reply to @oldskb.
+ *
+ * Nothing is sent when either address of the original packet is not
+ * unicast, when no complete TCP header can be located, when the original
+ * segment already has RST set, when its TCP checksum does not verify, or
+ * when no usable route back to the sender is available.
+ */
+static void send_reset(struct sk_buff *oldskb)
+{
+	struct sk_buff *nskb;
+	struct tcphdr otcph, *tcph;
+	unsigned int otcplen, hh_len;
+	int tcphoff, needs_ack;
+	struct ipv6hdr *oip6h = oldskb->nh.ipv6h, *ip6h;
+	struct dst_entry *dst = NULL;
+	u8 proto;
+	struct flowi fl;
+
+	if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) ||
+	    (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) {
+		DEBUGP("ip6t_REJECT: addr is not unicast.\n");
+		return;
+	}
+
+	/* Skip the extension headers to find the upper-layer protocol. */
+	proto = oip6h->nexthdr;
+	tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto);
+
+	if ((tcphoff < 0) || (tcphoff > oldskb->len)) {
+		DEBUGP("ip6t_REJECT: Can't get TCP header.\n");
+		return;
+	}
+
+	otcplen = oldskb->len - tcphoff;
+
+	/* IP header checks: fragment, too short. */
+	if ((proto != IPPROTO_TCP) || (otcplen < sizeof(struct tcphdr))) {
+		DEBUGP("ip6t_REJECT: proto(%d) != IPPROTO_TCP, or too short. otcplen = %d\n",
+		       proto, otcplen);
+		return;
+	}
+
+	/* Cannot fail: otcplen was just checked to cover a full header. */
+	if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr)))
+		BUG();
+
+	/* No RST for RST. */
+	if (otcph.rst) {
+		DEBUGP("ip6t_REJECT: RST is set\n");
+		return;
+	}
+
+	/* Check checksum. */
+	if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP,
+			    skb_checksum(oldskb, tcphoff, otcplen, 0))) {
+		DEBUGP("ip6t_REJECT: TCP checksum is invalid\n");
+		return;
+	}
+
+	/* Route the reply: addresses and ports of the original, swapped. */
+	memset(&fl, 0, sizeof(fl));
+	fl.proto = IPPROTO_TCP;
+	ipv6_addr_copy(&fl.fl6_src, &oip6h->daddr);
+	ipv6_addr_copy(&fl.fl6_dst, &oip6h->saddr);
+	fl.fl_ip_sport = otcph.dest;
+	fl.fl_ip_dport = otcph.source;
+	dst = ip6_route_output(NULL, &fl);
+	if (dst == NULL)
+		return;
+	if (dst->error) {
+		/* A reference is held on the error route as well; drop it
+		 * instead of leaking it. */
+		dst_release(dst);
+		return;
+	}
+	/* NOTE(review): assumes xfrm_lookup() drops the dst reference itself
+	 * when it fails -- confirm against net/xfrm/xfrm_policy.c. */
+	if (xfrm_lookup(&dst, &fl, NULL, 0))
+		return;
+
+	hh_len = (dst->dev->hard_header_len + 15)&~15;
+	nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr)
+			 + sizeof(struct tcphdr) + dst->trailer_len,
+			 GFP_ATOMIC);
+
+	if (!nskb) {
+		if (net_ratelimit())
+			printk("ip6t_REJECT: Can't alloc skb\n");
+		dst_release(dst);
+		return;
+	}
+
+	nskb->dst = dst;
+
+	skb_reserve(nskb, hh_len + dst->header_len);
+
+	ip6h = nskb->nh.ipv6h = (struct ipv6hdr *)
+					skb_put(nskb, sizeof(struct ipv6hdr));
+	/* Start from a zeroed header: priority and flow label are never
+	 * assigned below and must not carry stale buffer contents. */
+	memset(ip6h, 0, sizeof(*ip6h));
+	ip6h->version = 6;
+	ip6h->hop_limit = dst_metric(dst, RTAX_HOPLIMIT);
+	ip6h->nexthdr = IPPROTO_TCP;
+	ip6h->payload_len = htons(sizeof(struct tcphdr));
+	ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr);
+	ipv6_addr_copy(&ip6h->daddr, &oip6h->saddr);
+
+	tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
+	/* Likewise zero the TCP header so bits that are not assigned below
+	 * go out as 0. */
+	memset(tcph, 0, sizeof(*tcph));
+	/* Truncate to length (no data) */
+	tcph->doff = sizeof(struct tcphdr)/4;
+	tcph->source = otcph.dest;
+	tcph->dest = otcph.source;
+
+	if (otcph.ack) {
+		/* Original carried an ACK: reuse its ack_seq as our seq. */
+		needs_ack = 0;
+		tcph->seq = otcph.ack_seq;
+		tcph->ack_seq = 0;
+	} else {
+		/* Otherwise acknowledge everything the original occupied in
+		 * sequence space: SYN, FIN and payload bytes. */
+		needs_ack = 1;
+		tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin
+				      + otcplen - (otcph.doff<<2));
+		tcph->seq = 0;
+	}
+
+	/* Reset flags */
+	((u_int8_t *)tcph)[13] = 0;
+	tcph->rst = 1;
+	tcph->ack = needs_ack;
+	tcph->window = 0;
+	tcph->urg_ptr = 0;
+	tcph->check = 0;
+
+	/* Adjust TCP checksum */
+	tcph->check = csum_ipv6_magic(&nskb->nh.ipv6h->saddr,
+				      &nskb->nh.ipv6h->daddr,
+				      sizeof(struct tcphdr), IPPROTO_TCP,
+				      csum_partial((char *)tcph,
+						   sizeof(struct tcphdr), 0));
+
+	NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
+		dst_output);
+}
+
+/*
+ * Send an ICMPv6 destination-unreachable message with the given @code in
+ * reply to @skb_in.  If the packet was seen at the LOCAL_OUT hook and has
+ * no device attached yet, the loopback device is assigned first.
+ */
+static inline void
+send_unreach(struct sk_buff *skb_in, unsigned char code, unsigned int hooknum)
+{
+	int at_local_out = (hooknum == NF_IP6_LOCAL_OUT);
+
+	if (at_local_out && !skb_in->dev)
+		skb_in->dev = &loopback_dev;
+
+	icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0, NULL);
+}
+
+/*
+ * REJECT target hook: answer the packet according to the rule's "with"
+ * setting (an ICMPv6 unreachable code, a TCP reset, or nothing) and then
+ * always return NF_DROP.
+ */
+static unsigned int reject6_target(struct sk_buff **pskb,
+			   const struct net_device *in,
+			   const struct net_device *out,
+			   unsigned int hooknum,
+			   const void *targinfo,
+			   void *userinfo)
+{
+	const struct ip6t_reject_info *reject = targinfo;
+	unsigned char icmp_code = 0;
+	int want_unreach = 0;
+
+	DEBUGP(KERN_DEBUG "%s: medium point\n", __FUNCTION__);
+	/* WARNING: This code causes reentry within ip6tables.
+	   This means that the ip6tables jump stack is now crap.  We
+	   must return an absolute verdict. --RR */
+	switch (reject->with) {
+	case IP6T_ICMP6_NO_ROUTE:
+		icmp_code = ICMPV6_NOROUTE;
+		want_unreach = 1;
+		break;
+	case IP6T_ICMP6_ADM_PROHIBITED:
+		icmp_code = ICMPV6_ADM_PROHIBITED;
+		want_unreach = 1;
+		break;
+	case IP6T_ICMP6_NOT_NEIGHBOUR:
+		icmp_code = ICMPV6_NOT_NEIGHBOUR;
+		want_unreach = 1;
+		break;
+	case IP6T_ICMP6_ADDR_UNREACH:
+		icmp_code = ICMPV6_ADDR_UNREACH;
+		want_unreach = 1;
+		break;
+	case IP6T_ICMP6_PORT_UNREACH:
+		icmp_code = ICMPV6_PORT_UNREACH;
+		want_unreach = 1;
+		break;
+	case IP6T_ICMP6_ECHOREPLY:
+		/* Do nothing */
+		break;
+	case IP6T_TCP_RESET:
+		send_reset(*pskb);
+		break;
+	default:
+		if (net_ratelimit())
+			printk(KERN_WARNING "ip6t_REJECT: case %u not handled yet\n", reject->with);
+		break;
+	}
+
+	/* All ICMPv6 cases share this single send. */
+	if (want_unreach)
+		send_unreach(*pskb, icmp_code, hooknum);
+
+	return NF_DROP;
+}
+
+/*
+ * Validate a REJECT rule.
+ *
+ * Returns 1 when the rule is acceptable, 0 otherwise.  A rule is refused
+ * when the target data has an unexpected size, the table is not "filter",
+ * a hook other than LOCAL_IN, FORWARD or LOCAL_OUT is in the mask, the
+ * ECHOREPLY variant is requested, or TCP_RESET is requested for a rule
+ * that does not positively match protocol TCP.
+ */
+static int check(const char *tablename,
+		 const struct ip6t_entry *e,
+		 void *targinfo,
+		 unsigned int targinfosize,
+		 unsigned int hook_mask)
+{
+	const struct ip6t_reject_info *rejinfo = targinfo;
+	const unsigned int allowed_hooks = (1 << NF_IP6_LOCAL_IN)
+					 | (1 << NF_IP6_FORWARD)
+					 | (1 << NF_IP6_LOCAL_OUT);
+
+	if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_reject_info))) {
+		DEBUGP("ip6t_REJECT: targinfosize %u != 0\n", targinfosize);
+		return 0;
+	}
+
+	/* Only allow these for packet filtering. */
+	if (strcmp(tablename, "filter") != 0) {
+		DEBUGP("ip6t_REJECT: bad table `%s'.\n", tablename);
+		return 0;
+	}
+
+	if ((hook_mask & ~allowed_hooks) != 0) {
+		DEBUGP("ip6t_REJECT: bad hook mask %X\n", hook_mask);
+		return 0;
+	}
+
+	if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) {
+		printk("ip6t_REJECT: ECHOREPLY is not supported.\n");
+		return 0;
+	}
+
+	/* Must specify that it's a TCP packet */
+	if (rejinfo->with == IP6T_TCP_RESET &&
+	    (e->ipv6.proto != IPPROTO_TCP ||
+	     (e->ipv6.invflags & IP6T_INV_PROTO))) {
+		DEBUGP("ip6t_REJECT: TCP_RESET illegal for non-tcp\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+/* Target descriptor registered under the name "REJECT". */
+static struct ip6t_target ip6t_reject_reg = {
+	.name		= "REJECT",
+	.target		= reject6_target,
+	.checkentry	= check,
+	.me		= THIS_MODULE,
+};
+
+/* Module load: any registration failure is reported as -EINVAL. */
+static int __init init(void)
+{
+	return ip6t_register_target(&ip6t_reject_reg) ? -EINVAL : 0;
+}
+
+/* Module unload: unregister the REJECT target. */
+static void __exit fini(void)
+{
+	ip6t_unregister_target(&ip6t_reject_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index d5b94f142bb..dde37793d20 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -48,92 +48,21 @@ match(const struct sk_buff *skb,
unsigned int protoff,
int *hotdrop)
{
- struct ip_auth_hdr *ah = NULL, _ah;
+ struct ip_auth_hdr *ah, _ah;
const struct ip6t_ah *ahinfo = matchinfo;
- unsigned int temp;
- int len;
- u8 nexthdr;
unsigned int ptr;
unsigned int hdrlen = 0;
- /*DEBUGP("IPv6 AH entered\n");*/
- /* if (opt->auth == 0) return 0;
- * It does not filled on output */
-
- /* type of the 1st exthdr */
- nexthdr = skb->nh.ipv6h->nexthdr;
- /* pointer to the 1st exthdr */
- ptr = sizeof(struct ipv6hdr);
- /* available length */
- len = skb->len - ptr;
- temp = 0;
-
- while (ip6t_ext_hdr(nexthdr)) {
- struct ipv6_opt_hdr _hdr, *hp;
-
- DEBUGP("ipv6_ah header iteration \n");
-
- /* Is there enough space for the next ext header? */
- if (len < sizeof(struct ipv6_opt_hdr))
- return 0;
- /* No more exthdr -> evaluate */
- if (nexthdr == NEXTHDR_NONE)
- break;
- /* ESP -> evaluate */
- if (nexthdr == NEXTHDR_ESP)
- break;
-
- hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
- BUG_ON(hp == NULL);
-
- /* Calculate the header length */
- if (nexthdr == NEXTHDR_FRAGMENT)
- hdrlen = 8;
- else if (nexthdr == NEXTHDR_AUTH)
- hdrlen = (hp->hdrlen+2)<<2;
- else
- hdrlen = ipv6_optlen(hp);
-
- /* AH -> evaluate */
- if (nexthdr == NEXTHDR_AUTH) {
- temp |= MASK_AH;
- break;
- }
-
-
- /* set the flag */
- switch (nexthdr) {
- case NEXTHDR_HOP:
- case NEXTHDR_ROUTING:
- case NEXTHDR_FRAGMENT:
- case NEXTHDR_AUTH:
- case NEXTHDR_DEST:
- break;
- default:
- DEBUGP("ipv6_ah match: unknown nextheader %u\n",nexthdr);
- return 0;
- }
-
- nexthdr = hp->nexthdr;
- len -= hdrlen;
- ptr += hdrlen;
- if (ptr > skb->len) {
- DEBUGP("ipv6_ah: new pointer too large! \n");
- break;
- }
- }
-
- /* AH header not found */
- if (temp != MASK_AH)
+ if (ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH) < 0)
return 0;
- if (len < sizeof(struct ip_auth_hdr)){
+ ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah);
+ if (ah == NULL) {
*hotdrop = 1;
return 0;
}
- ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah);
- BUG_ON(ah == NULL);
+ hdrlen = (ah->hdrlen + 2) << 2;
DEBUGP("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen);
DEBUGP("RES %04X ", ah->reserved);
diff --git a/net/ipv6/netfilter/ip6t_dst.c b/net/ipv6/netfilter/ip6t_dst.c
index 540925e4a7a..c450a635e54 100644
--- a/net/ipv6/netfilter/ip6t_dst.c
+++ b/net/ipv6/netfilter/ip6t_dst.c
@@ -63,8 +63,6 @@ match(const struct sk_buff *skb,
struct ipv6_opt_hdr _optsh, *oh;
const struct ip6t_opts *optinfo = matchinfo;
unsigned int temp;
- unsigned int len;
- u8 nexthdr;
unsigned int ptr;
unsigned int hdrlen = 0;
unsigned int ret = 0;
@@ -72,97 +70,25 @@ match(const struct sk_buff *skb,
u8 _optlen, *lp = NULL;
unsigned int optlen;
- /* type of the 1st exthdr */
- nexthdr = skb->nh.ipv6h->nexthdr;
- /* pointer to the 1st exthdr */
- ptr = sizeof(struct ipv6hdr);
- /* available length */
- len = skb->len - ptr;
- temp = 0;
-
- while (ip6t_ext_hdr(nexthdr)) {
- struct ipv6_opt_hdr _hdr, *hp;
-
- DEBUGP("ipv6_opts header iteration \n");
-
- /* Is there enough space for the next ext header? */
- if (len < (int)sizeof(struct ipv6_opt_hdr))
- return 0;
- /* No more exthdr -> evaluate */
- if (nexthdr == NEXTHDR_NONE) {
- break;
- }
- /* ESP -> evaluate */
- if (nexthdr == NEXTHDR_ESP) {
- break;
- }
-
- hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
- BUG_ON(hp == NULL);
-
- /* Calculate the header length */
- if (nexthdr == NEXTHDR_FRAGMENT) {
- hdrlen = 8;
- } else if (nexthdr == NEXTHDR_AUTH)
- hdrlen = (hp->hdrlen+2)<<2;
- else
- hdrlen = ipv6_optlen(hp);
-
- /* OPTS -> evaluate */
#if HOPBYHOP
- if (nexthdr == NEXTHDR_HOP) {
- temp |= MASK_HOPOPTS;
+ if (ipv6_find_hdr(skb, &ptr, NEXTHDR_HOP) < 0)
#else
- if (nexthdr == NEXTHDR_DEST) {
- temp |= MASK_DSTOPTS;
+ if (ipv6_find_hdr(skb, &ptr, NEXTHDR_DEST) < 0)
#endif
- break;
- }
-
+ return 0;
- /* set the flag */
- switch (nexthdr){
- case NEXTHDR_HOP:
- case NEXTHDR_ROUTING:
- case NEXTHDR_FRAGMENT:
- case NEXTHDR_AUTH:
- case NEXTHDR_DEST:
- break;
- default:
- DEBUGP("ipv6_opts match: unknown nextheader %u\n",nexthdr);
- return 0;
- break;
- }
-
- nexthdr = hp->nexthdr;
- len -= hdrlen;
- ptr += hdrlen;
- if ( ptr > skb->len ) {
- DEBUGP("ipv6_opts: new pointer is too large! \n");
- break;
- }
- }
-
- /* OPTIONS header not found */
-#if HOPBYHOP
- if ( temp != MASK_HOPOPTS ) return 0;
-#else
- if ( temp != MASK_DSTOPTS ) return 0;
-#endif
-
- if (len < (int)sizeof(struct ipv6_opt_hdr)){
+ oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh);
+ if (oh == NULL){
*hotdrop = 1;
return 0;
}
- if (len < hdrlen){
+ hdrlen = ipv6_optlen(oh);
+ if (skb->len - ptr < hdrlen){
/* Packet smaller than it's length field */
return 0;
}
- oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh);
- BUG_ON(oh == NULL);
-
DEBUGP("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen);
DEBUGP("len %02X %04X %02X ",
diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c
index e39dd236fd8..24bc0cde43a 100644
--- a/net/ipv6/netfilter/ip6t_esp.c
+++ b/net/ipv6/netfilter/ip6t_esp.c
@@ -48,87 +48,22 @@ match(const struct sk_buff *skb,
unsigned int protoff,
int *hotdrop)
{
- struct ip_esp_hdr _esp, *eh = NULL;
+ struct ip_esp_hdr _esp, *eh;
const struct ip6t_esp *espinfo = matchinfo;
- unsigned int temp;
- int len;
- u8 nexthdr;
unsigned int ptr;
/* Make sure this isn't an evil packet */
/*DEBUGP("ipv6_esp entered \n");*/
- /* type of the 1st exthdr */
- nexthdr = skb->nh.ipv6h->nexthdr;
- /* pointer to the 1st exthdr */
- ptr = sizeof(struct ipv6hdr);
- /* available length */
- len = skb->len - ptr;
- temp = 0;
-
- while (ip6t_ext_hdr(nexthdr)) {
- struct ipv6_opt_hdr _hdr, *hp;
- int hdrlen;
-
- DEBUGP("ipv6_esp header iteration \n");
-
- /* Is there enough space for the next ext header? */
- if (len < sizeof(struct ipv6_opt_hdr))
- return 0;
- /* No more exthdr -> evaluate */
- if (nexthdr == NEXTHDR_NONE)
- break;
- /* ESP -> evaluate */
- if (nexthdr == NEXTHDR_ESP) {
- temp |= MASK_ESP;
- break;
- }
-
- hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
- BUG_ON(hp == NULL);
-
- /* Calculate the header length */
- if (nexthdr == NEXTHDR_FRAGMENT)
- hdrlen = 8;
- else if (nexthdr == NEXTHDR_AUTH)
- hdrlen = (hp->hdrlen+2)<<2;
- else
- hdrlen = ipv6_optlen(hp);
-
- /* set the flag */
- switch (nexthdr) {
- case NEXTHDR_HOP:
- case NEXTHDR_ROUTING:
- case NEXTHDR_FRAGMENT:
- case NEXTHDR_AUTH:
- case NEXTHDR_DEST:
- break;
- default:
- DEBUGP("ipv6_esp match: unknown nextheader %u\n",nexthdr);
- return 0;
- }
-
- nexthdr = hp->nexthdr;
- len -= hdrlen;
- ptr += hdrlen;
- if (ptr > skb->len) {
- DEBUGP("ipv6_esp: new pointer too large! \n");
- break;
- }
- }
-
- /* ESP header not found */
- if (temp != MASK_ESP)
+ if (ipv6_find_hdr(skb, &ptr, NEXTHDR_ESP) < 0)
return 0;
- if (len < sizeof(struct ip_esp_hdr)) {
+ eh = skb_header_pointer(skb, ptr, sizeof(_esp), &_esp);
+ if (eh == NULL) {
*hotdrop = 1;
return 0;
}
- eh = skb_header_pointer(skb, ptr, sizeof(_esp), &_esp);
- BUG_ON(eh == NULL);
-
DEBUGP("IPv6 ESP SPI %u %08X\n", ntohl(eh->spi), ntohl(eh->spi));
return (eh != NULL)
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
index 4bfa30a9bc8..085d5f8eea2 100644
--- a/net/ipv6/netfilter/ip6t_frag.c
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -48,90 +48,18 @@ match(const struct sk_buff *skb,
unsigned int protoff,
int *hotdrop)
{
- struct frag_hdr _frag, *fh = NULL;
+ struct frag_hdr _frag, *fh;
const struct ip6t_frag *fraginfo = matchinfo;
- unsigned int temp;
- int len;
- u8 nexthdr;
unsigned int ptr;
- unsigned int hdrlen = 0;
-
- /* type of the 1st exthdr */
- nexthdr = skb->nh.ipv6h->nexthdr;
- /* pointer to the 1st exthdr */
- ptr = sizeof(struct ipv6hdr);
- /* available length */
- len = skb->len - ptr;
- temp = 0;
-
- while (ip6t_ext_hdr(nexthdr)) {
- struct ipv6_opt_hdr _hdr, *hp;
-
- DEBUGP("ipv6_frag header iteration \n");
-
- /* Is there enough space for the next ext header? */
- if (len < (int)sizeof(struct ipv6_opt_hdr))
- return 0;
- /* No more exthdr -> evaluate */
- if (nexthdr == NEXTHDR_NONE) {
- break;
- }
- /* ESP -> evaluate */
- if (nexthdr == NEXTHDR_ESP) {
- break;
- }
-
- hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
- BUG_ON(hp == NULL);
-
- /* Calculate the header length */
- if (nexthdr == NEXTHDR_FRAGMENT) {
- hdrlen = 8;
- } else if (nexthdr == NEXTHDR_AUTH)
- hdrlen = (hp->hdrlen+2)<<2;
- else
- hdrlen = ipv6_optlen(hp);
-
- /* FRAG -> evaluate */
- if (nexthdr == NEXTHDR_FRAGMENT) {
- temp |= MASK_FRAGMENT;
- break;
- }
-
-
- /* set the flag */
- switch (nexthdr){
- case NEXTHDR_HOP:
- case NEXTHDR_ROUTING:
- case NEXTHDR_FRAGMENT:
- case NEXTHDR_AUTH:
- case NEXTHDR_DEST:
- break;
- default:
- DEBUGP("ipv6_frag match: unknown nextheader %u\n",nexthdr);
- return 0;
- break;
- }
-
- nexthdr = hp->nexthdr;
- len -= hdrlen;
- ptr += hdrlen;
- if ( ptr > skb->len ) {
- DEBUGP("ipv6_frag: new pointer too large! \n");
- break;
- }
- }
-
- /* FRAG header not found */
- if ( temp != MASK_FRAGMENT ) return 0;
-
- if (len < sizeof(struct frag_hdr)){
- *hotdrop = 1;
- return 0;
- }
- fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag);
- BUG_ON(fh == NULL);
+ if (ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT) < 0)
+ return 0;
+
+ fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag);
+ if (fh == NULL){
+ *hotdrop = 1;
+ return 0;
+ }
DEBUGP("INFO %04X ", fh->frag_off);
DEBUGP("OFFSET %04X ", ntohs(fh->frag_off) & ~0x7);
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
index 27f3650d127..1d09485111d 100644
--- a/net/ipv6/netfilter/ip6t_hbh.c
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -63,8 +63,6 @@ match(const struct sk_buff *skb,
struct ipv6_opt_hdr _optsh, *oh;
const struct ip6t_opts *optinfo = matchinfo;
unsigned int temp;
- unsigned int len;
- u8 nexthdr;
unsigned int ptr;
unsigned int hdrlen = 0;
unsigned int ret = 0;
@@ -72,97 +70,25 @@ match(const struct sk_buff *skb,
u8 _optlen, *lp = NULL;
unsigned int optlen;
- /* type of the 1st exthdr */
- nexthdr = skb->nh.ipv6h->nexthdr;
- /* pointer to the 1st exthdr */
- ptr = sizeof(struct ipv6hdr);
- /* available length */
- len = skb->len - ptr;
- temp = 0;
-
- while (ip6t_ext_hdr(nexthdr)) {
- struct ipv6_opt_hdr _hdr, *hp;
-
- DEBUGP("ipv6_opts header iteration \n");
-
- /* Is there enough space for the next ext header? */
- if (len < (int)sizeof(struct ipv6_opt_hdr))
- return 0;
- /* No more exthdr -> evaluate */
- if (nexthdr == NEXTHDR_NONE) {
- break;
- }
- /* ESP -> evaluate */
- if (nexthdr == NEXTHDR_ESP) {
- break;
- }
-
- hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
- BUG_ON(hp == NULL);
-
- /* Calculate the header length */
- if (nexthdr == NEXTHDR_FRAGMENT) {
- hdrlen = 8;
- } else if (nexthdr == NEXTHDR_AUTH)
- hdrlen = (hp->hdrlen+2)<<2;
- else
- hdrlen = ipv6_optlen(hp);
-
- /* OPTS -> evaluate */
#if HOPBYHOP
- if (nexthdr == NEXTHDR_HOP) {
- temp |= MASK_HOPOPTS;
+ if (ipv6_find_hdr(skb, &ptr, NEXTHDR_HOP) < 0)
#else
- if (nexthdr == NEXTHDR_DEST) {
- temp |= MASK_DSTOPTS;
+ if (ipv6_find_hdr(skb, &ptr, NEXTHDR_DEST) < 0)
#endif
- break;
- }
-
+ return 0;
- /* set the flag */
- switch (nexthdr){
- case NEXTHDR_HOP:
- case NEXTHDR_ROUTING:
- case NEXTHDR_FRAGMENT:
- case NEXTHDR_AUTH:
- case NEXTHDR_DEST:
- break;
- default:
- DEBUGP("ipv6_opts match: unknown nextheader %u\n",nexthdr);
- return 0;
- break;
- }
-
- nexthdr = hp->nexthdr;
- len -= hdrlen;
- ptr += hdrlen;
- if ( ptr > skb->len ) {
- DEBUGP("ipv6_opts: new pointer is too large! \n");
- break;
- }
- }
-
- /* OPTIONS header not found */
-#if HOPBYHOP
- if ( temp != MASK_HOPOPTS ) return 0;
-#else
- if ( temp != MASK_DSTOPTS ) return 0;
-#endif
-
- if (len < (int)sizeof(struct ipv6_opt_hdr)){
+ oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh);
+ if (oh == NULL){
*hotdrop = 1;
return 0;
}
- if (len < hdrlen){
+ hdrlen = ipv6_optlen(oh);
+ if (skb->len - ptr < hdrlen){
/* Packet smaller than it's length field */
return 0;
}
- oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh);
- BUG_ON(oh == NULL);
-
DEBUGP("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen);
DEBUGP("len %02X %04X %02X ",
diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c
index ab0e32d3de4..4de4cdad4b7 100644
--- a/net/ipv6/netfilter/ip6t_owner.c
+++ b/net/ipv6/netfilter/ip6t_owner.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/file.h>
+#include <linux/rcupdate.h>
#include <net/sock.h>
#include <linux/netfilter_ipv6/ip6t_owner.h>
@@ -20,71 +21,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
MODULE_DESCRIPTION("IP6 tables owner matching module");
MODULE_LICENSE("GPL");
-static int
-match_pid(const struct sk_buff *skb, pid_t pid)
-{
- struct task_struct *p;
- struct files_struct *files;
- int i;
-
- read_lock(&tasklist_lock);
- p = find_task_by_pid(pid);
- if (!p)
- goto out;
- task_lock(p);
- files = p->files;
- if(files) {
- spin_lock(&files->file_lock);
- for (i=0; i < files->max_fds; i++) {
- if (fcheck_files(files, i) == skb->sk->sk_socket->file) {
- spin_unlock(&files->file_lock);
- task_unlock(p);
- read_unlock(&tasklist_lock);
- return 1;
- }
- }
- spin_unlock(&files->file_lock);
- }
- task_unlock(p);
-out:
- read_unlock(&tasklist_lock);
- return 0;
-}
-
-static int
-match_sid(const struct sk_buff *skb, pid_t sid)
-{
- struct task_struct *g, *p;
- struct file *file = skb->sk->sk_socket->file;
- int i, found=0;
-
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- struct files_struct *files;
- if (p->signal->session != sid)
- continue;
-
- task_lock(p);
- files = p->files;
- if (files) {
- spin_lock(&files->file_lock);
- for (i=0; i < files->max_fds; i++) {
- if (fcheck_files(files, i) == file) {
- found = 1;
- break;
- }
- }
- spin_unlock(&files->file_lock);
- }
- task_unlock(p);
- if (found)
- goto out;
- } while_each_thread(g, p);
-out:
- read_unlock(&tasklist_lock);
-
- return found;
-}
static int
match(const struct sk_buff *skb,
@@ -112,18 +48,6 @@ match(const struct sk_buff *skb,
return 0;
}
- if(info->match & IP6T_OWNER_PID) {
- if (!match_pid(skb, info->pid) ^
- !!(info->invert & IP6T_OWNER_PID))
- return 0;
- }
-
- if(info->match & IP6T_OWNER_SID) {
- if (!match_sid(skb, info->sid) ^
- !!(info->invert & IP6T_OWNER_SID))
- return 0;
- }
-
return 1;
}
@@ -134,6 +58,8 @@ checkentry(const char *tablename,
unsigned int matchsize,
unsigned int hook_mask)
{
+ const struct ip6t_owner_info *info = matchinfo;
+
if (hook_mask
& ~((1 << NF_IP6_LOCAL_OUT) | (1 << NF_IP6_POST_ROUTING))) {
printk("ip6t_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
@@ -142,14 +68,13 @@ checkentry(const char *tablename,
if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_owner_info)))
return 0;
-#ifdef CONFIG_SMP
- /* files->file_lock can not be used in a BH */
- if (((struct ip6t_owner_info *)matchinfo)->match
- & (IP6T_OWNER_PID|IP6T_OWNER_SID)) {
- printk("ip6t_owner: pid and sid matching is broken on SMP.\n");
+
+ if (info->match & (IP6T_OWNER_PID|IP6T_OWNER_SID)) {
+ printk("ipt_owner: pid and sid matching "
+ "not supported anymore\n");
return 0;
}
-#endif
+
return 1;
}
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index a9526b773d2..beb2fd5cebb 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -50,98 +50,29 @@ match(const struct sk_buff *skb,
unsigned int protoff,
int *hotdrop)
{
- struct ipv6_rt_hdr _route, *rh = NULL;
+ struct ipv6_rt_hdr _route, *rh;
const struct ip6t_rt *rtinfo = matchinfo;
unsigned int temp;
- unsigned int len;
- u8 nexthdr;
unsigned int ptr;
unsigned int hdrlen = 0;
unsigned int ret = 0;
struct in6_addr *ap, _addr;
- /* type of the 1st exthdr */
- nexthdr = skb->nh.ipv6h->nexthdr;
- /* pointer to the 1st exthdr */
- ptr = sizeof(struct ipv6hdr);
- /* available length */
- len = skb->len - ptr;
- temp = 0;
+ if (ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING) < 0)
+ return 0;
- while (ip6t_ext_hdr(nexthdr)) {
- struct ipv6_opt_hdr _hdr, *hp;
-
- DEBUGP("ipv6_rt header iteration \n");
-
- /* Is there enough space for the next ext header? */
- if (len < (int)sizeof(struct ipv6_opt_hdr))
- return 0;
- /* No more exthdr -> evaluate */
- if (nexthdr == NEXTHDR_NONE) {
- break;
- }
- /* ESP -> evaluate */
- if (nexthdr == NEXTHDR_ESP) {
- break;
- }
-
- hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
- BUG_ON(hp == NULL);
-
- /* Calculate the header length */
- if (nexthdr == NEXTHDR_FRAGMENT) {
- hdrlen = 8;
- } else if (nexthdr == NEXTHDR_AUTH)
- hdrlen = (hp->hdrlen+2)<<2;
- else
- hdrlen = ipv6_optlen(hp);
-
- /* ROUTING -> evaluate */
- if (nexthdr == NEXTHDR_ROUTING) {
- temp |= MASK_ROUTING;
- break;
- }
-
-
- /* set the flag */
- switch (nexthdr){
- case NEXTHDR_HOP:
- case NEXTHDR_ROUTING:
- case NEXTHDR_FRAGMENT:
- case NEXTHDR_AUTH:
- case NEXTHDR_DEST:
- break;
- default:
- DEBUGP("ipv6_rt match: unknown nextheader %u\n",nexthdr);
- return 0;
- break;
- }
-
- nexthdr = hp->nexthdr;
- len -= hdrlen;
- ptr += hdrlen;
- if ( ptr > skb->len ) {
- DEBUGP("ipv6_rt: new pointer is too large! \n");
- break;
- }
- }
-
- /* ROUTING header not found */
- if ( temp != MASK_ROUTING ) return 0;
-
- if (len < (int)sizeof(struct ipv6_rt_hdr)){
+ rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route);
+ if (rh == NULL){
*hotdrop = 1;
return 0;
}
- if (len < hdrlen){
+ hdrlen = ipv6_optlen(rh);
+ if (skb->len - ptr < hdrlen){
/* Pcket smaller than its length field */
return 0;
}
- rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route);
- BUG_ON(rh == NULL);
-
DEBUGP("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen);
DEBUGP("TYPE %04X ", rh->type);
DEBUGP("SGS_LEFT %u %02X\n", rh->segments_left, rh->segments_left);
@@ -161,8 +92,8 @@ match(const struct sk_buff *skb,
((rtinfo->hdrlen == hdrlen) ^
!!(rtinfo->invflags & IP6T_RT_INV_LEN))));
DEBUGP("res %02X %02X %02X ",
- (rtinfo->flags & IP6T_RT_RES), ((struct rt0_hdr *)rh)->bitmap,
- !((rtinfo->flags & IP6T_RT_RES) && (((struct rt0_hdr *)rh)->bitmap)));
+ (rtinfo->flags & IP6T_RT_RES), ((struct rt0_hdr *)rh)->reserved,
+ !((rtinfo->flags & IP6T_RT_RES) && (((struct rt0_hdr *)rh)->reserved)));
ret = (rh != NULL)
&&
@@ -179,12 +110,12 @@ match(const struct sk_buff *skb,
!!(rtinfo->invflags & IP6T_RT_INV_TYP)));
if (ret && (rtinfo->flags & IP6T_RT_RES)) {
- u_int32_t *bp, _bitmap;
- bp = skb_header_pointer(skb,
- ptr + offsetof(struct rt0_hdr, bitmap),
- sizeof(_bitmap), &_bitmap);
+ u_int32_t *rp, _reserved;
+ rp = skb_header_pointer(skb,
+ ptr + offsetof(struct rt0_hdr, reserved),
+ sizeof(_reserved), &_reserved);
- ret = (*bp == 0);
+ ret = (*rp == 0);
}
DEBUGP("#%d ",rtinfo->addrnr);
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 334a5967831..50a13e75d70 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -140,9 +140,7 @@ fold_field(void *mib[], int offt)
unsigned long res = 0;
int i;
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
+ for_each_cpu(i) {
res += *(((unsigned long *)per_cpu_ptr(mib[0], i)) + offt);
res += *(((unsigned long *)per_cpu_ptr(mib[1], i)) + offt);
}
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index e2b848ec985..a1265a320b1 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -49,6 +49,7 @@
#include <net/transp_v6.h>
#include <net/udp.h>
#include <net/inet_common.h>
+#include <net/tcp_states.h>
#include <net/rawv6.h>
#include <net/xfrm.h>
@@ -81,7 +82,8 @@ static void raw_v6_unhash(struct sock *sk)
/* Grumble... icmp and ip_input want to get at this... */
struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num,
- struct in6_addr *loc_addr, struct in6_addr *rmt_addr)
+ struct in6_addr *loc_addr, struct in6_addr *rmt_addr,
+ int dif)
{
struct hlist_node *node;
int is_multicast = ipv6_addr_is_multicast(loc_addr);
@@ -94,6 +96,9 @@ struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num,
!ipv6_addr_equal(&np->daddr, rmt_addr))
continue;
+ if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+ continue;
+
if (!ipv6_addr_any(&np->rcv_saddr)) {
if (ipv6_addr_equal(&np->rcv_saddr, loc_addr))
goto found;
@@ -137,11 +142,12 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb)
*
* Caller owns SKB so we must make clones.
*/
-void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
+int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
{
struct in6_addr *saddr;
struct in6_addr *daddr;
struct sock *sk;
+ int delivered = 0;
__u8 hash;
saddr = &skb->nh.ipv6h->saddr;
@@ -160,9 +166,10 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
if (sk == NULL)
goto out;
- sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr);
+ sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, IP6CB(skb)->iif);
while (sk) {
+ delivered = 1;
if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) {
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
@@ -170,10 +177,12 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
if (clone)
rawv6_rcv(sk, clone);
}
- sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr);
+ sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr,
+ IP6CB(skb)->iif);
}
out:
read_unlock(&raw_v6_lock);
+ return delivered;
}
/* This cleans up af_inet6 a bit. -DaveM */
@@ -328,12 +337,13 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
if (skb->ip_summed == CHECKSUM_HW) {
+ skb_postpull_rcsum(skb, skb->nh.raw,
+ skb->h.raw - skb->nh.raw);
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (csum_ipv6_magic(&skb->nh.ipv6h->saddr,
&skb->nh.ipv6h->daddr,
skb->len, inet->num, skb->csum)) {
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "raw v6 hw csum failure.\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n");
skb->ip_summed = CHECKSUM_NONE;
}
}
@@ -617,7 +627,7 @@ static void rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
if (type && code) {
get_user(fl->fl_icmp_type, type);
- __get_user(fl->fl_icmp_code, code);
+ get_user(fl->fl_icmp_code, code);
probed = 1;
}
break;
@@ -645,6 +655,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
struct flowi fl;
int addr_len = msg->msg_namelen;
int hlimit = -1;
+ int tclass = -1;
u16 proto;
int err;
@@ -730,7 +741,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
memset(opt, 0, sizeof(struct ipv6_txoptions));
opt->tot_len = sizeof(struct ipv6_txoptions);
- err = datagram_send_ctl(msg, &fl, opt, &hlimit);
+ err = datagram_send_ctl(msg, &fl, opt, &hlimit, &tclass);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
@@ -745,8 +756,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
}
if (opt == NULL)
opt = np->opt;
- if (flowlabel)
- opt = fl6_merge_options(&opt_space, flowlabel, opt);
+ opt = fl6_merge_options(&opt_space, flowlabel, opt);
fl.proto = proto;
rawv6_probe_proto_opt(&fl, msg);
@@ -772,10 +782,8 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
if (final_p)
ipv6_addr_copy(&fl.fl6_dst, final_p);
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
- dst_release(dst);
+ if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
goto out;
- }
if (hlimit < 0) {
if (ipv6_addr_is_multicast(&fl.fl6_dst))
@@ -788,6 +796,12 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
hlimit = ipv6_get_hoplimit(dst->dev);
}
+ if (tclass < 0) {
+ tclass = np->cork.tclass;
+ if (tclass < 0)
+ tclass = 0;
+ }
+
if (msg->msg_flags&MSG_CONFIRM)
goto do_confirm;
@@ -796,8 +810,9 @@ back_from_confirm:
err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl, (struct rt6_info*)dst, msg->msg_flags);
} else {
lock_sock(sk);
- err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
- hlimit, opt, &fl, (struct rt6_info*)dst, msg->msg_flags);
+ err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov,
+ len, 0, hlimit, tclass, opt, &fl, (struct rt6_info*)dst,
+ msg->msg_flags);
if (err)
ip6_flush_pending_frames(sk);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 59e7c631787..e4fe9ee484d 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -479,12 +479,9 @@ static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
/* Point into the IP datagram 'data' part. */
if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data))
goto err;
- if (end-offset < skb->len) {
- if (pskb_trim(skb, end - offset))
- goto err;
- if (skb->ip_summed != CHECKSUM_UNNECESSARY)
- skb->ip_summed = CHECKSUM_NONE;
- }
+
+ if (pskb_trim_rcsum(skb, end - offset))
+ goto err;
/* Find out which fragments are in front and at the back of us
* in the chain of fragments so far. We must know where to put
@@ -562,7 +559,7 @@ static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
if (skb->dev)
fq->iif = skb->dev->ifindex;
skb->dev = NULL;
- fq->stamp = skb->stamp;
+ skb_get_timestamp(skb, &fq->stamp);
fq->meat += skb->len;
atomic_add(skb->truesize, &ip6_frag_mem);
@@ -664,7 +661,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
head->next = NULL;
head->dev = dev;
- head->stamp = fq->stamp;
+ skb_set_timestamp(head, &fq->stamp);
head->nh.ipv6h->payload_len = htons(payload_len);
*skb_in = head;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 878789b3122..227e99ed510 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -483,7 +483,7 @@ restart:
goto out;
}
- rt = rt6_device_match(rt, skb->dev->ifindex, 0);
+ rt = rt6_device_match(rt, skb->dev->ifindex, strict);
BACKTRACK();
if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
@@ -1372,7 +1372,7 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
* Drop the packet on the floor
*/
-int ip6_pkt_discard(struct sk_buff *skb)
+static int ip6_pkt_discard(struct sk_buff *skb)
{
IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
@@ -1380,7 +1380,7 @@ int ip6_pkt_discard(struct sk_buff *skb)
return 0;
}
-int ip6_pkt_discard_out(struct sk_buff *skb)
+static int ip6_pkt_discard_out(struct sk_buff *skb)
{
skb->dev = skb->dst->dev;
return ip6_pkt_discard(skb);
@@ -1850,16 +1850,16 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
skb = alloc_skb(size, gfp_any());
if (!skb) {
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
return;
}
if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
kfree_skb(skb);
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_ROUTE;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, gfp_any());
+ NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
}
/*
@@ -1960,8 +1960,6 @@ static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
return arg.len;
}
-extern struct rt6_statistics rt6_stats;
-
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index e553e5b80d6..c3123c9e1a8 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -770,7 +770,7 @@ static int ipip6_tunnel_init(struct net_device *dev)
return 0;
}
-int __init ipip6_fb_tunnel_init(struct net_device *dev)
+static int __init ipip6_fb_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = dev->priv;
struct iphdr *iph = &tunnel->parms.iph;
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 3a18e0e6ffe..8eff9fa1e98 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -14,9 +14,6 @@
#include <net/ipv6.h>
#include <net/addrconf.h>
-extern ctl_table ipv6_route_table[];
-extern ctl_table ipv6_icmp_table[];
-
#ifdef CONFIG_SYSCTL
static ctl_table ipv6_table[] = {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f6e288dc116..d693cb988b7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -47,6 +47,7 @@
#include <net/tcp.h>
#include <net/ndisc.h>
+#include <net/inet6_hashtables.h>
#include <net/ipv6.h>
#include <net/transp_v6.h>
#include <net/addrconf.h>
@@ -75,34 +76,11 @@ static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok);
static struct tcp_func ipv6_mapped;
static struct tcp_func ipv6_specific;
-/* I have no idea if this is a good hash for v6 or not. -DaveM */
-static __inline__ int tcp_v6_hashfn(struct in6_addr *laddr, u16 lport,
- struct in6_addr *faddr, u16 fport)
+static inline int tcp_v6_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb)
{
- int hashent = (lport ^ fport);
-
- hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]);
- hashent ^= hashent>>16;
- hashent ^= hashent>>8;
- return (hashent & (tcp_ehash_size - 1));
-}
-
-static __inline__ int tcp_v6_sk_hashfn(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct in6_addr *laddr = &np->rcv_saddr;
- struct in6_addr *faddr = &np->daddr;
- __u16 lport = inet->num;
- __u16 fport = inet->dport;
- return tcp_v6_hashfn(laddr, lport, faddr, fport);
-}
-
-static inline int tcp_v6_bind_conflict(struct sock *sk,
- struct tcp_bind_bucket *tb)
-{
- struct sock *sk2;
- struct hlist_node *node;
+ const struct sock *sk2;
+ const struct hlist_node *node;
/* We must walk the whole port owner list in this case. -DaveM */
sk_for_each_bound(sk2, node, &tb->owners) {
@@ -126,8 +104,8 @@ static inline int tcp_v6_bind_conflict(struct sock *sk,
*/
static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
{
- struct tcp_bind_hashbucket *head;
- struct tcp_bind_bucket *tb;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
struct hlist_node *node;
int ret;
@@ -138,37 +116,42 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
int remaining = (high - low) + 1;
int rover;
- spin_lock(&tcp_portalloc_lock);
- if (tcp_port_rover < low)
+ spin_lock(&tcp_hashinfo.portalloc_lock);
+ if (tcp_hashinfo.port_rover < low)
rover = low;
else
- rover = tcp_port_rover;
+ rover = tcp_hashinfo.port_rover;
do { rover++;
if (rover > high)
rover = low;
- head = &tcp_bhash[tcp_bhashfn(rover)];
+ head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
spin_lock(&head->lock);
- tb_for_each(tb, node, &head->chain)
+ inet_bind_bucket_for_each(tb, node, &head->chain)
if (tb->port == rover)
goto next;
break;
next:
spin_unlock(&head->lock);
} while (--remaining > 0);
- tcp_port_rover = rover;
- spin_unlock(&tcp_portalloc_lock);
-
- /* Exhausted local port range during search? */
+ tcp_hashinfo.port_rover = rover;
+ spin_unlock(&tcp_hashinfo.portalloc_lock);
+
+ /* Exhausted local port range during search? It is not
+ * possible for us to be holding one of the bind hash
+ * locks if this test triggers, because if 'remaining'
+ * drops to zero, we broke out of the do/while loop at
+ * the top level, not from the 'break;' statement.
+ */
ret = 1;
- if (remaining <= 0)
+ if (unlikely(remaining <= 0))
goto fail;
/* OK, here is the one we will use. */
snum = rover;
} else {
- head = &tcp_bhash[tcp_bhashfn(snum)];
+ head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
spin_lock(&head->lock);
- tb_for_each(tb, node, &head->chain)
+ inet_bind_bucket_for_each(tb, node, &head->chain)
if (tb->port == snum)
goto tb_found;
}
@@ -187,8 +170,11 @@ tb_found:
}
tb_not_found:
ret = 1;
- if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
- goto fail_unlock;
+ if (tb == NULL) {
+ tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum);
+ if (tb == NULL)
+ goto fail_unlock;
+ }
if (hlist_empty(&tb->owners)) {
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
tb->fastreuse = 1;
@@ -199,9 +185,9 @@ tb_not_found:
tb->fastreuse = 0;
success:
- if (!tcp_sk(sk)->bind_hash)
- tcp_bind_hash(sk, tb, snum);
- BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
+ if (!inet_csk(sk)->icsk_bind_hash)
+ inet_bind_hash(sk, tb, snum);
+ BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
ret = 0;
fail_unlock:
@@ -219,13 +205,15 @@ static __inline__ void __tcp_v6_hash(struct sock *sk)
BUG_TRAP(sk_unhashed(sk));
if (sk->sk_state == TCP_LISTEN) {
- list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
- lock = &tcp_lhash_lock;
- tcp_listen_wlock();
+ list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)];
+ lock = &tcp_hashinfo.lhash_lock;
+ inet_listen_wlock(&tcp_hashinfo);
} else {
- sk->sk_hashent = tcp_v6_sk_hashfn(sk);
- list = &tcp_ehash[sk->sk_hashent].chain;
- lock = &tcp_ehash[sk->sk_hashent].lock;
+ unsigned int hash;
+ sk->sk_hash = hash = inet6_sk_ehashfn(sk);
+ hash &= (tcp_hashinfo.ehash_size - 1);
+ list = &tcp_hashinfo.ehash[hash].chain;
+ lock = &tcp_hashinfo.ehash[hash].lock;
write_lock(lock);
}
@@ -250,131 +238,11 @@ static void tcp_v6_hash(struct sock *sk)
}
}
-static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned short hnum, int dif)
-{
- struct sock *sk;
- struct hlist_node *node;
- struct sock *result = NULL;
- int score, hiscore;
-
- hiscore=0;
- read_lock(&tcp_lhash_lock);
- sk_for_each(sk, node, &tcp_listening_hash[tcp_lhashfn(hnum)]) {
- if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
- score = 1;
- if (!ipv6_addr_any(&np->rcv_saddr)) {
- if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
- continue;
- score++;
- }
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- continue;
- score++;
- }
- if (score == 3) {
- result = sk;
- break;
- }
- if (score > hiscore) {
- hiscore = score;
- result = sk;
- }
- }
- }
- if (result)
- sock_hold(result);
- read_unlock(&tcp_lhash_lock);
- return result;
-}
-
-/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
- * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
- *
- * The sockhash lock must be held as a reader here.
- */
-
-static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u16 sport,
- struct in6_addr *daddr, u16 hnum,
- int dif)
-{
- struct tcp_ehash_bucket *head;
- struct sock *sk;
- struct hlist_node *node;
- __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
- int hash;
-
- /* Optimize here for direct hit, only listening connections can
- * have wildcards anyways.
- */
- hash = tcp_v6_hashfn(daddr, hnum, saddr, sport);
- head = &tcp_ehash[hash];
- read_lock(&head->lock);
- sk_for_each(sk, node, &head->chain) {
- /* For IPV6 do the cheaper port and family tests first. */
- if(TCP_IPV6_MATCH(sk, saddr, daddr, ports, dif))
- goto hit; /* You sunk my battleship! */
- }
- /* Must check for a TIME_WAIT'er before going to listener hash. */
- sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
- /* FIXME: acme: check this... */
- struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
-
- if(*((__u32 *)&(tw->tw_dport)) == ports &&
- sk->sk_family == PF_INET6) {
- if(ipv6_addr_equal(&tw->tw_v6_daddr, saddr) &&
- ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) &&
- (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif))
- goto hit;
- }
- }
- read_unlock(&head->lock);
- return NULL;
-
-hit:
- sock_hold(sk);
- read_unlock(&head->lock);
- return sk;
-}
-
-
-static inline struct sock *__tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
- struct in6_addr *daddr, u16 hnum,
- int dif)
-{
- struct sock *sk;
-
- sk = __tcp_v6_lookup_established(saddr, sport, daddr, hnum, dif);
-
- if (sk)
- return sk;
-
- return tcp_v6_lookup_listener(daddr, hnum, dif);
-}
-
-inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
- struct in6_addr *daddr, u16 dport,
- int dif)
-{
- struct sock *sk;
-
- local_bh_disable();
- sk = __tcp_v6_lookup(saddr, sport, daddr, ntohs(dport), dif);
- local_bh_enable();
-
- return sk;
-}
-
-EXPORT_SYMBOL_GPL(tcp_v6_lookup);
-
-
/*
* Open request hash tables.
*/
-static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd)
+static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd)
{
u32 a, b, c;
@@ -394,14 +262,15 @@ static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd)
return c & (TCP_SYNQ_HSIZE - 1);
}
-static struct request_sock *tcp_v6_search_req(struct tcp_sock *tp,
+static struct request_sock *tcp_v6_search_req(const struct sock *sk,
struct request_sock ***prevp,
__u16 rport,
struct in6_addr *raddr,
struct in6_addr *laddr,
int iif)
{
- struct listen_sock *lopt = tp->accept_queue.listen_opt;
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
struct request_sock *req, **prev;
for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)];
@@ -446,44 +315,48 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
}
}
-static int __tcp_v6_check_established(struct sock *sk, __u16 lport,
- struct tcp_tw_bucket **twp)
+static int __tcp_v6_check_established(struct sock *sk, const __u16 lport,
+ struct inet_timewait_sock **twp)
{
struct inet_sock *inet = inet_sk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct in6_addr *daddr = &np->rcv_saddr;
- struct in6_addr *saddr = &np->daddr;
- int dif = sk->sk_bound_dev_if;
- u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
- int hash = tcp_v6_hashfn(daddr, inet->num, saddr, inet->dport);
- struct tcp_ehash_bucket *head = &tcp_ehash[hash];
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ const struct in6_addr *daddr = &np->rcv_saddr;
+ const struct in6_addr *saddr = &np->daddr;
+ const int dif = sk->sk_bound_dev_if;
+ const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+ unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport);
+ struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
struct sock *sk2;
- struct hlist_node *node;
- struct tcp_tw_bucket *tw;
+ const struct hlist_node *node;
+ struct inet_timewait_sock *tw;
+ prefetch(head->chain.first);
write_lock(&head->lock);
/* Check TIME-WAIT sockets first. */
- sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
- tw = (struct tcp_tw_bucket*)sk2;
+ sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
+ const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2);
+
+ tw = inet_twsk(sk2);
if(*((__u32 *)&(tw->tw_dport)) == ports &&
sk2->sk_family == PF_INET6 &&
- ipv6_addr_equal(&tw->tw_v6_daddr, saddr) &&
- ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) &&
+ ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) &&
+ ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) &&
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
+ const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
struct tcp_sock *tp = tcp_sk(sk);
- if (tw->tw_ts_recent_stamp &&
- (!twp || (sysctl_tcp_tw_reuse &&
- xtime.tv_sec -
- tw->tw_ts_recent_stamp > 1))) {
+ if (tcptw->tw_ts_recent_stamp &&
+ (!twp ||
+ (sysctl_tcp_tw_reuse &&
+ xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
/* See comment in tcp_ipv4.c */
- tp->write_seq = tw->tw_snd_nxt + 65535 + 2;
+ tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
if (!tp->write_seq)
tp->write_seq = 1;
- tp->rx_opt.ts_recent = tw->tw_ts_recent;
- tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+ tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
+ tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
sock_hold(sk2);
goto unique;
} else
@@ -494,14 +367,14 @@ static int __tcp_v6_check_established(struct sock *sk, __u16 lport,
/* And established part... */
sk_for_each(sk2, node, &head->chain) {
- if(TCP_IPV6_MATCH(sk2, saddr, daddr, ports, dif))
+ if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
goto not_unique;
}
unique:
BUG_TRAP(sk_unhashed(sk));
__sk_add_node(sk, &head->chain);
- sk->sk_hashent = hash;
+ sk->sk_hash = hash;
sock_prot_inc_use(sk->sk_prot);
write_unlock(&head->lock);
@@ -510,10 +383,10 @@ unique:
NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
} else if (tw) {
/* Silly. Should hash-dance instead... */
- tcp_tw_deschedule(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);
NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
}
return 0;
@@ -535,8 +408,8 @@ static inline u32 tcpv6_port_offset(const struct sock *sk)
static int tcp_v6_hash_connect(struct sock *sk)
{
unsigned short snum = inet_sk(sk)->num;
- struct tcp_bind_hashbucket *head;
- struct tcp_bind_bucket *tb;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
int ret;
if (!snum) {
@@ -548,19 +421,19 @@ static int tcp_v6_hash_connect(struct sock *sk)
static u32 hint;
u32 offset = hint + tcpv6_port_offset(sk);
struct hlist_node *node;
- struct tcp_tw_bucket *tw = NULL;
+ struct inet_timewait_sock *tw = NULL;
local_bh_disable();
for (i = 1; i <= range; i++) {
port = low + (i + offset) % range;
- head = &tcp_bhash[tcp_bhashfn(port)];
+ head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
spin_lock(&head->lock);
/* Does not bother with rcv_saddr checks,
* because the established check is already
* unique enough.
*/
- tb_for_each(tb, node, &head->chain) {
+ inet_bind_bucket_for_each(tb, node, &head->chain) {
if (tb->port == port) {
BUG_TRAP(!hlist_empty(&tb->owners));
if (tb->fastreuse >= 0)
@@ -573,7 +446,7 @@ static int tcp_v6_hash_connect(struct sock *sk)
}
}
- tb = tcp_bucket_create(head, port);
+ tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
if (!tb) {
spin_unlock(&head->lock);
break;
@@ -592,7 +465,7 @@ ok:
hint += i;
/* Head lock still held and bh's disabled */
- tcp_bind_hash(sk, tb, port);
+ inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->sport = htons(port);
__tcp_v6_hash(sk);
@@ -600,16 +473,16 @@ ok:
spin_unlock(&head->lock);
if (tw) {
- tcp_tw_deschedule(tw);
- tcp_tw_put(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);
+ inet_twsk_put(tw);
}
ret = 0;
goto out;
}
- head = &tcp_bhash[tcp_bhashfn(snum)];
- tb = tcp_sk(sk)->bind_hash;
+ head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
@@ -626,11 +499,6 @@ out:
}
}
-static __inline__ int tcp_v6_iif(struct sk_buff *skb)
-{
- return IP6CB(skb)->iif;
-}
-
static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
@@ -766,10 +634,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (final_p)
ipv6_addr_copy(&fl.fl6_dst, final_p);
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
- dst_release(dst);
+ if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
goto failure;
- }
if (saddr == NULL) {
saddr = &fl.fl6_src;
@@ -822,14 +688,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
int type, int code, int offset, __u32 info)
{
struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data;
- struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
+ const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
struct ipv6_pinfo *np;
struct sock *sk;
int err;
struct tcp_sock *tp;
__u32 seq;
- sk = tcp_v6_lookup(&hdr->daddr, th->dest, &hdr->saddr, th->source, skb->dev->ifindex);
+ sk = inet6_lookup(&tcp_hashinfo, &hdr->daddr, th->dest, &hdr->saddr,
+ th->source, skb->dev->ifindex);
if (sk == NULL) {
ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
@@ -837,7 +704,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
}
if (sk->sk_state == TCP_TIME_WAIT) {
- tcp_tw_put((struct tcp_tw_bucket*)sk);
+ inet_twsk_put((struct inet_timewait_sock *)sk);
return;
}
@@ -915,8 +782,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (sock_owned_by_user(sk))
goto out;
- req = tcp_v6_search_req(tp, &prev, th->dest, &hdr->daddr,
- &hdr->saddr, tcp_v6_iif(skb));
+ req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr,
+ &hdr->saddr, inet6_iif(skb));
if (!req)
goto out;
@@ -930,7 +797,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
goto out;
}
- tcp_synq_drop(sk, req, prev);
+ inet_csk_reqsk_queue_drop(sk, req, prev);
goto out;
case TCP_SYN_SENT:
@@ -982,7 +849,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
if (dst == NULL) {
opt = np->opt;
if (opt == NULL &&
- np->rxopt.bits.srcrt == 2 &&
+ np->rxopt.bits.osrcrt == 2 &&
treq->pktopts) {
struct sk_buff *pktopts = treq->pktopts;
struct inet6_skb_parm *rxopt = IP6CB(pktopts);
@@ -1021,7 +888,6 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
}
done:
- dst_release(dst);
if (opt && opt != np->opt)
sock_kfree_s(sk, opt, opt->tot_len);
return err;
@@ -1048,11 +914,10 @@ static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb)
struct inet6_skb_parm *opt = IP6CB(skb);
if (np->rxopt.all) {
- if ((opt->hop && np->rxopt.bits.hopopts) ||
- ((IPV6_FLOWINFO_MASK&*(u32*)skb->nh.raw) &&
- np->rxopt.bits.rxflow) ||
- (opt->srcrt && np->rxopt.bits.srcrt) ||
- ((opt->dst1 || opt->dst0) && np->rxopt.bits.dstopts))
+ if ((opt->hop && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) ||
+ ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && np->rxopt.bits.rxflow) ||
+ (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) ||
+ ((opt->dst1 || opt->dst0) && (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts)))
return 1;
}
return 0;
@@ -1127,17 +992,15 @@ static void tcp_v6_send_reset(struct sk_buff *skb)
buff->csum);
fl.proto = IPPROTO_TCP;
- fl.oif = tcp_v6_iif(skb);
+ fl.oif = inet6_iif(skb);
fl.fl_ip_dport = t1->dest;
fl.fl_ip_sport = t1->source;
/* sk = NULL, but it is safe for now. RST socket required. */
if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) {
- if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0) {
- dst_release(buff->dst);
+ if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0)
return;
- }
ip6_xmit(NULL, buff, &fl, NULL, 0);
TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
@@ -1196,15 +1059,13 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32
buff->csum);
fl.proto = IPPROTO_TCP;
- fl.oif = tcp_v6_iif(skb);
+ fl.oif = inet6_iif(skb);
fl.fl_ip_dport = t1->dest;
fl.fl_ip_sport = t1->source;
if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) {
- if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0) {
- dst_release(buff->dst);
+ if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0)
return;
- }
ip6_xmit(NULL, buff, &fl, NULL, 0);
TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
return;
@@ -1215,12 +1076,14 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32
static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
- struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
- tcp_v6_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
- tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
+ tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+ tcptw->tw_ts_recent);
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
}
static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
@@ -1232,28 +1095,25 @@ static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
{
struct request_sock *req, **prev;
- struct tcphdr *th = skb->h.th;
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcphdr *th = skb->h.th;
struct sock *nsk;
/* Find possible connection requests. */
- req = tcp_v6_search_req(tp, &prev, th->source, &skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr, tcp_v6_iif(skb));
+ req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr,
+ &skb->nh.ipv6h->daddr, inet6_iif(skb));
if (req)
return tcp_check_req(sk, skb, req, prev);
- nsk = __tcp_v6_lookup_established(&skb->nh.ipv6h->saddr,
- th->source,
- &skb->nh.ipv6h->daddr,
- ntohs(th->dest),
- tcp_v6_iif(skb));
+ nsk = __inet6_lookup_established(&tcp_hashinfo, &skb->nh.ipv6h->saddr,
+ th->source, &skb->nh.ipv6h->daddr,
+ ntohs(th->dest), inet6_iif(skb));
if (nsk) {
if (nsk->sk_state != TCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
- tcp_tw_put((struct tcp_tw_bucket*)nsk);
+ inet_twsk_put((struct inet_timewait_sock *)nsk);
return NULL;
}
@@ -1266,12 +1126,12 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct listen_sock *lopt = tp->accept_queue.listen_opt;
- u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
- reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
- tcp_synq_added(sk);
+ reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT);
+ inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT);
}
@@ -1296,13 +1156,13 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
/*
* There are no SYN attacks on IPv6, yet...
*/
- if (tcp_synq_is_full(sk) && !isn) {
+ if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
if (net_ratelimit())
printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n");
goto drop;
}
- if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
req = reqsk_alloc(&tcp6_request_sock_ops);
@@ -1324,8 +1184,8 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
TCP_ECN_create_request(req, skb->h.th);
treq->pktopts = NULL;
if (ipv6_opt_accepted(sk, skb) ||
- np->rxopt.bits.rxinfo ||
- np->rxopt.bits.rxhlim) {
+ np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
+ np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
atomic_inc(&skb->users);
treq->pktopts = skb;
}
@@ -1334,7 +1194,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
/* So that link locals have meaning */
if (!sk->sk_bound_dev_if &&
ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL)
- treq->iif = tcp_v6_iif(skb);
+ treq->iif = inet6_iif(skb);
if (isn == 0)
isn = tcp_v6_init_sequence(sk,skb);
@@ -1399,15 +1259,14 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newsk->sk_backlog_rcv = tcp_v4_do_rcv;
newnp->pktoptions = NULL;
newnp->opt = NULL;
- newnp->mcast_oif = tcp_v6_iif(skb);
+ newnp->mcast_oif = inet6_iif(skb);
newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
- /* Charge newly allocated IPv6 socket. Though it is mapped,
- * it is IPv6 yet.
+ /*
+ * No need to charge this sock to the relevant IPv6 refcnt debug socks count
+ * here, tcp_create_openreq_child now does this for us, see the comment in
+ * that function for the gory details. -acme
*/
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet6_sock_nr);
-#endif
/* It is tricky place. Until this moment IPv4 tcp
worked with IPv6 af_tcp.af_specific.
@@ -1423,7 +1282,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (sk_acceptq_is_full(sk))
goto out_overflow;
- if (np->rxopt.bits.srcrt == 2 &&
+ if (np->rxopt.bits.osrcrt == 2 &&
opt == NULL && treq->pktopts) {
struct inet6_skb_parm *rxopt = IP6CB(treq->pktopts);
if (rxopt->srcrt)
@@ -1462,10 +1321,11 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (newsk == NULL)
goto out;
- /* Charge newly allocated IPv6 socket */
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet6_sock_nr);
-#endif
+ /*
+ * No need to charge this sock to the relevant IPv6 refcnt debug socks
+ * count here, tcp_create_openreq_child now does this for us, see the
+ * comment in that function for the gory details. -acme
+ */
ip6_dst_store(newsk, dst, NULL);
newsk->sk_route_caps = dst->dev->features &
@@ -1504,7 +1364,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
skb_set_owner_r(newnp->pktoptions, newsk);
}
newnp->opt = NULL;
- newnp->mcast_oif = tcp_v6_iif(skb);
+ newnp->mcast_oif = inet6_iif(skb);
newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
/* Clone native IPv6 options from listening socket (if any)
@@ -1531,7 +1391,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
__tcp_v6_hash(newsk);
- tcp_inherit_port(sk, newsk);
+ inet_inherit_port(&tcp_hashinfo, sk, newsk);
return newsk;
@@ -1552,7 +1412,7 @@ static int tcp_v6_checksum_init(struct sk_buff *skb)
if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
&skb->nh.ipv6h->daddr,skb->csum))
return 0;
- LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v6 csum failed\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n");
}
if (skb->len <= 76) {
if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
@@ -1678,9 +1538,9 @@ ipv6_pktoptions:
tp = tcp_sk(sk);
if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt &&
!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
- if (np->rxopt.bits.rxinfo)
- np->mcast_oif = tcp_v6_iif(opt_skb);
- if (np->rxopt.bits.rxhlim)
+ if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
+ np->mcast_oif = inet6_iif(opt_skb);
+ if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
np->mcast_hops = opt_skb->nh.ipv6h->hop_limit;
if (ipv6_opt_accepted(sk, opt_skb)) {
skb_set_owner_r(opt_skb, sk);
@@ -1734,8 +1594,9 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(skb->nh.ipv6h);
TCP_SKB_CB(skb)->sacked = 0;
- sk = __tcp_v6_lookup(&skb->nh.ipv6h->saddr, th->source,
- &skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb));
+ sk = __inet6_lookup(&tcp_hashinfo, &skb->nh.ipv6h->saddr, th->source,
+ &skb->nh.ipv6h->daddr, ntohs(th->dest),
+ inet6_iif(skb));
if (!sk)
goto no_tcp_socket;
@@ -1790,26 +1651,29 @@ discard_and_relse:
do_time_wait:
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- tcp_tw_put((struct tcp_tw_bucket *) sk);
+ inet_twsk_put((struct inet_timewait_sock *)sk);
goto discard_it;
}
if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
TCP_INC_STATS_BH(TCP_MIB_INERRS);
- tcp_tw_put((struct tcp_tw_bucket *) sk);
+ inet_twsk_put((struct inet_timewait_sock *)sk);
goto discard_it;
}
- switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
- skb, th, skb->len)) {
+ switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
+ skb, th)) {
case TCP_TW_SYN:
{
struct sock *sk2;
- sk2 = tcp_v6_lookup_listener(&skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb));
+ sk2 = inet6_lookup_listener(&tcp_hashinfo,
+ &skb->nh.ipv6h->daddr,
+ ntohs(th->dest), inet6_iif(skb));
if (sk2 != NULL) {
- tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
- tcp_tw_put((struct tcp_tw_bucket *)sk);
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ inet_twsk_deschedule(tw, &tcp_death_row);
+ inet_twsk_put(tw);
sk = sk2;
goto process;
}
@@ -1864,7 +1728,6 @@ static int tcp_v6_rebuild_header(struct sock *sk)
if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
sk->sk_err_soft = -err;
- dst_release(dst);
return err;
}
@@ -1917,7 +1780,6 @@ static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok)
if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
sk->sk_route_caps = 0;
- dst_release(dst);
return err;
}
@@ -1978,7 +1840,7 @@ static struct tcp_func ipv6_specific = {
static struct tcp_func ipv6_mapped = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
- .rebuild_header = tcp_v4_rebuild_header,
+ .rebuild_header = inet_sk_rebuild_header,
.conn_request = tcp_v6_conn_request,
.syn_recv_sock = tcp_v6_syn_recv_sock,
.remember_stamp = tcp_v4_remember_stamp,
@@ -1997,13 +1859,14 @@ static struct tcp_func ipv6_mapped = {
*/
static int tcp_v6_init_sock(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
- tp->rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev = TCP_TIMEOUT_INIT;
/* So many TCP implementations out there (incorrectly) count the
@@ -2025,7 +1888,7 @@ static int tcp_v6_init_sock(struct sock *sk)
sk->sk_state = TCP_CLOSE;
tp->af_specific = &ipv6_specific;
- tp->ca_ops = &tcp_init_congestion_ops;
+ icsk->icsk_ca_ops = &tcp_init_congestion_ops;
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
@@ -2039,8 +1902,6 @@ static int tcp_v6_init_sock(struct sock *sk)
static int tcp_v6_destroy_sock(struct sock *sk)
{
- extern int tcp_v4_destroy_sock(struct sock *sk);
-
tcp_v4_destroy_sock(sk);
return inet6_destroy_sock(sk);
}
@@ -2086,18 +1947,20 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
unsigned long timer_expires;
struct inet_sock *inet = inet_sk(sp);
struct tcp_sock *tp = tcp_sk(sp);
+ const struct inet_connection_sock *icsk = inet_csk(sp);
struct ipv6_pinfo *np = inet6_sk(sp);
dest = &np->daddr;
src = &np->rcv_saddr;
destp = ntohs(inet->dport);
srcp = ntohs(inet->sport);
- if (tp->pending == TCP_TIME_RETRANS) {
+
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
timer_active = 1;
- timer_expires = tp->timeout;
- } else if (tp->pending == TCP_TIME_PROBE0) {
+ timer_expires = icsk->icsk_timeout;
+ } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
- timer_expires = tp->timeout;
+ timer_expires = icsk->icsk_timeout;
} else if (timer_pending(&sp->sk_timer)) {
timer_active = 2;
timer_expires = sp->sk_timer.expires;
@@ -2118,28 +1981,31 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
timer_active,
jiffies_to_clock_t(timer_expires - jiffies),
- tp->retransmits,
+ icsk->icsk_retransmits,
sock_i_uid(sp),
- tp->probes_out,
+ icsk->icsk_probes_out,
sock_i_ino(sp),
atomic_read(&sp->sk_refcnt), sp,
- tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong,
+ icsk->icsk_rto,
+ icsk->icsk_ack.ato,
+ (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong,
tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh
);
}
static void get_timewait6_sock(struct seq_file *seq,
- struct tcp_tw_bucket *tw, int i)
+ struct inet_timewait_sock *tw, int i)
{
struct in6_addr *dest, *src;
__u16 destp, srcp;
+ struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
int ttd = tw->tw_ttd - jiffies;
if (ttd < 0)
ttd = 0;
- dest = &tw->tw_v6_daddr;
- src = &tw->tw_v6_rcv_saddr;
+ dest = &tcp6tw->tw_v6_daddr;
+ src = &tcp6tw->tw_v6_rcv_saddr;
destp = ntohs(tw->tw_dport);
srcp = ntohs(tw->tw_sport);
@@ -2214,7 +2080,7 @@ struct proto tcpv6_prot = {
.close = tcp_close,
.connect = tcp_v6_connect,
.disconnect = tcp_disconnect,
- .accept = tcp_accept,
+ .accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v6_init_sock,
.destroy = tcp_v6_destroy_sock,
@@ -2231,11 +2097,13 @@ struct proto tcpv6_prot = {
.sockets_allocated = &tcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
+ .orphan_count = &tcp_orphan_count,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp6_sock),
+ .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
.rsk_prot = &tcp6_request_sock_ops,
};
@@ -2245,8 +2113,6 @@ static struct inet6_protocol tcpv6_protocol = {
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};
-extern struct proto_ops inet6_stream_ops;
-
static struct inet_protosw tcpv6_protosw = {
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index eff050ac704..bf9519341fd 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -51,6 +51,7 @@
#include <net/udp.h>
#include <net/raw.h>
#include <net/inet_common.h>
+#include <net/tcp_states.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
@@ -58,7 +59,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6);
+DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly;
/* Grrr, addr_type already calculated by caller, but I don't want
* to add some silly "cookie" argument to this method just for that.
@@ -98,7 +99,7 @@ static int udp_v6_get_port(struct sock *sk, unsigned short snum)
next:;
}
result = best;
- for(;; result += UDP_HTABLE_SIZE) {
+ for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) {
if (result > sysctl_local_port_range[1])
result = sysctl_local_port_range[0]
+ ((result - sysctl_local_port_range[0]) &
@@ -106,6 +107,8 @@ static int udp_v6_get_port(struct sock *sk, unsigned short snum)
if (!udp_lport_inuse(result))
break;
}
+ if (i >= (1 << 16) / UDP_HTABLE_SIZE)
+ goto fail;
gotit:
udp_port_rover = snum = result;
} else {
@@ -404,9 +407,8 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
continue;
if (!ipv6_addr_any(&np->rcv_saddr)) {
- if (ipv6_addr_equal(&np->rcv_saddr, loc_addr))
- return s;
- continue;
+ if (!ipv6_addr_equal(&np->rcv_saddr, loc_addr))
+ continue;
}
if(!inet6_mc_check(s, loc_addr, rmt_addr))
continue;
@@ -477,13 +479,12 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
/* RFC 2460 section 8.1 says that we SHOULD log
this error. Well, it is reasonable.
*/
- LIMIT_NETDEBUG(
- printk(KERN_INFO "IPv6: udp checksum is 0\n"));
+ LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n");
goto discard;
}
if (ulen < skb->len) {
- if (__pskb_trim(skb, ulen))
+ if (pskb_trim_rcsum(skb, ulen))
goto discard;
saddr = &skb->nh.ipv6h->saddr;
daddr = &skb->nh.ipv6h->daddr;
@@ -493,7 +494,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
if (skb->ip_summed==CHECKSUM_HW) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) {
- LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v6 hw csum failure.\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n");
skb->ip_summed = CHECKSUM_NONE;
}
}
@@ -637,8 +638,10 @@ static int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk,
int addr_len = msg->msg_namelen;
int ulen = len;
int hlimit = -1;
+ int tclass = -1;
int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
int err;
+ int connected = 0;
/* destination address check */
if (sin6) {
@@ -748,6 +751,7 @@ do_udp_sendmsg:
fl->fl_ip_dport = inet->dport;
daddr = &np->daddr;
fl->fl6_flowlabel = np->flow_label;
+ connected = 1;
}
if (!fl->oif)
@@ -758,7 +762,7 @@ do_udp_sendmsg:
memset(opt, 0, sizeof(struct ipv6_txoptions));
opt->tot_len = sizeof(*opt);
- err = datagram_send_ctl(msg, fl, opt, &hlimit);
+ err = datagram_send_ctl(msg, fl, opt, &hlimit, &tclass);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
@@ -770,11 +774,11 @@ do_udp_sendmsg:
}
if (!(opt->opt_nflen|opt->opt_flen))
opt = NULL;
+ connected = 0;
}
if (opt == NULL)
opt = np->opt;
- if (flowlabel)
- opt = fl6_merge_options(&opt_space, flowlabel, opt);
+ opt = fl6_merge_options(&opt_space, flowlabel, opt);
fl->proto = IPPROTO_UDP;
ipv6_addr_copy(&fl->fl6_dst, daddr);
@@ -788,10 +792,13 @@ do_udp_sendmsg:
ipv6_addr_copy(&final, &fl->fl6_dst);
ipv6_addr_copy(&fl->fl6_dst, rt0->addr);
final_p = &final;
+ connected = 0;
}
- if (!fl->oif && ipv6_addr_is_multicast(&fl->fl6_dst))
+ if (!fl->oif && ipv6_addr_is_multicast(&fl->fl6_dst)) {
fl->oif = np->mcast_oif;
+ connected = 0;
+ }
err = ip6_dst_lookup(sk, &dst, fl);
if (err)
@@ -799,10 +806,8 @@ do_udp_sendmsg:
if (final_p)
ipv6_addr_copy(&fl->fl6_dst, final_p);
- if ((err = xfrm_lookup(&dst, fl, sk, 0)) < 0) {
- dst_release(dst);
+ if ((err = xfrm_lookup(&dst, fl, sk, 0)) < 0)
goto out;
- }
if (hlimit < 0) {
if (ipv6_addr_is_multicast(&fl->fl6_dst))
@@ -815,6 +820,12 @@ do_udp_sendmsg:
hlimit = ipv6_get_hoplimit(dst->dev);
}
+ if (tclass < 0) {
+ tclass = np->tclass;
+ if (tclass < 0)
+ tclass = 0;
+ }
+
if (msg->msg_flags&MSG_CONFIRM)
goto do_confirm;
back_from_confirm:
@@ -825,7 +836,7 @@ back_from_confirm:
/* ... which is an evident application bug. --ANK */
release_sock(sk);
- LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
err = -EINVAL;
goto out;
}
@@ -834,18 +845,25 @@ back_from_confirm:
do_append_data:
up->len += ulen;
- err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen, sizeof(struct udphdr),
- hlimit, opt, fl, (struct rt6_info*)dst,
- corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
+ err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen,
+ sizeof(struct udphdr), hlimit, tclass, opt, fl,
+ (struct rt6_info*)dst,
+ corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
if (err)
udp_v6_flush_pending_frames(sk);
else if (!corkreq)
err = udp_v6_push_pending_frames(sk, up);
- if (dst)
- ip6_dst_store(sk, dst,
- ipv6_addr_equal(&fl->fl6_dst, &np->daddr) ?
- &np->daddr : NULL);
+ if (dst) {
+ if (connected) {
+ ip6_dst_store(sk, dst,
+ ipv6_addr_equal(&fl->fl6_dst, &np->daddr) ?
+ &np->daddr : NULL);
+ } else {
+ dst_release(dst);
+ }
+ }
+
if (err > 0)
err = np->recverr ? net_xmit_errno(err) : 0;
release_sock(sk);
@@ -1054,8 +1072,6 @@ struct proto udpv6_prot = {
.obj_size = sizeof(struct udp6_sock),
};
-extern struct proto_ops inet6_dgram_ops;
-
static struct inet_protosw udpv6_protosw = {
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 60c26c87277..fbef7826a74 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -79,7 +79,7 @@ static u32 xfrm6_tunnel_spi;
#define XFRM6_TUNNEL_SPI_MIN 1
#define XFRM6_TUNNEL_SPI_MAX 0xffffffff
-static kmem_cache_t *xfrm6_tunnel_spi_kmem;
+static kmem_cache_t *xfrm6_tunnel_spi_kmem __read_mostly;
#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256
#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 5a27e5df588..34b3bb86840 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -44,7 +44,6 @@
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/string.h>
-#include <linux/tcp.h>
#include <linux/types.h>
#include <linux/termios.h>
@@ -52,6 +51,7 @@
#include <net/p8022.h>
#include <net/psnap.h>
#include <net/sock.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
@@ -1627,7 +1627,7 @@ out:
return rc;
}
-static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
/* NULL here for pt means the packet was looped back */
struct ipx_interface *intrfc;
@@ -1796,8 +1796,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock,
copied);
if (rc)
goto out_free;
- if (skb->stamp.tv_sec)
- sk->sk_stamp = skb->stamp;
+ if (skb->tstamp.off_sec)
+ skb_get_timestamp(skb, &sk->sk_stamp);
msg->msg_namelen = sizeof(*sipx);
@@ -1940,9 +1940,7 @@ static struct notifier_block ipx_dev_notifier = {
};
extern struct datalink_proto *make_EII_client(void);
-extern struct datalink_proto *make_8023_client(void);
extern void destroy_EII_client(struct datalink_proto *);
-extern void destroy_8023_client(struct datalink_proto *);
static unsigned char ipx_8022_type = 0xE0;
static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 };
diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c
index b6761913445..1f73d9ea434 100644
--- a/net/ipx/ipx_proc.c
+++ b/net/ipx/ipx_proc.c
@@ -10,7 +10,7 @@
#include <linux/proc_fs.h>
#include <linux/spinlock.h>
#include <linux/seq_file.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
#include <net/ipx.h>
static __inline__ struct ipx_interface *ipx_get_interface_idx(loff_t pos)
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 92c6e8d4e73..6f92f9c6299 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -56,7 +56,7 @@
#include <asm/uaccess.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/irda/af_irda.h>
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index 5d1e61168eb..6f20b4206e0 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -567,10 +567,8 @@ static void ircomm_tty_close(struct tty_struct *tty, struct file *filp)
self->tty = NULL;
if (self->blocked_open) {
- if (self->close_delay) {
- current->state = TASK_INTERRUPTIBLE;
- schedule_timeout(self->close_delay);
- }
+ if (self->close_delay)
+ schedule_timeout_interruptible(self->close_delay);
wake_up_interruptible(&self->open_wait);
}
@@ -863,8 +861,7 @@ static void ircomm_tty_wait_until_sent(struct tty_struct *tty, int timeout)
spin_lock_irqsave(&self->spinlock, flags);
while (self->tx_skb && self->tx_skb->len) {
spin_unlock_irqrestore(&self->spinlock, flags);
- current->state = TASK_INTERRUPTIBLE;
- schedule_timeout(poll_time);
+ schedule_timeout_interruptible(poll_time);
spin_lock_irqsave(&self->spinlock, flags);
if (signal_pending(current))
break;
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index 071cd2cefd8..953e255d2bc 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -310,7 +310,7 @@ void irlan_eth_send_gratuitous_arp(struct net_device *dev)
#ifdef CONFIG_INET
IRDA_DEBUG(4, "IrLAN: Sending gratuitous ARP\n");
rcu_read_lock();
- in_dev = __in_dev_get(dev);
+ in_dev = __in_dev_get_rcu(dev);
if (in_dev == NULL)
goto out;
if (in_dev->ifa_list)
diff --git a/net/irda/irlan/irlan_filter.c b/net/irda/irlan/irlan_filter.c
index 343c5d4a1a1..ca7d358dab5 100644
--- a/net/irda/irlan/irlan_filter.c
+++ b/net/irda/irlan/irlan_filter.c
@@ -27,6 +27,7 @@
#include <linux/seq_file.h>
#include <net/irda/irlan_common.h>
+#include <net/irda/irlan_filter.h>
/*
* Function irlan_filter_request (self, skb)
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
index 6dafbb43b52..3e9a06abbdd 100644
--- a/net/irda/irlap_frame.c
+++ b/net/irda/irlap_frame.c
@@ -988,9 +988,6 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__);
return;
}
- /* Unlink tx_skb from list */
- tx_skb->next = tx_skb->prev = NULL;
- tx_skb->list = NULL;
/* Clear old Nr field + poll bit */
tx_skb->data[1] &= 0x0f;
@@ -1063,9 +1060,6 @@ void irlap_resend_rejected_frame(struct irlap_cb *self, int command)
IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__);
return;
}
- /* Unlink tx_skb from list */
- tx_skb->next = tx_skb->prev = NULL;
- tx_skb->list = NULL;
/* Clear old Nr field + poll bit */
tx_skb->data[1] &= 0x0f;
@@ -1309,7 +1303,7 @@ static void irlap_recv_test_frame(struct irlap_cb *self, struct sk_buff *skb,
* Jean II
*/
int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *ptype)
+ struct packet_type *ptype, struct net_device *orig_dev)
{
struct irlap_info info;
struct irlap_cb *self;
diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c
index 7a4a4d7fbe6..c19e9ce05a3 100644
--- a/net/irda/irlmp.c
+++ b/net/irda/irlmp.c
@@ -53,7 +53,6 @@ struct irlmp_cb *irlmp = NULL;
/* These can be altered by the sysctl interface */
int sysctl_discovery = 0;
int sysctl_discovery_timeout = 3; /* 3 seconds by default */
-EXPORT_SYMBOL(sysctl_discovery_timeout);
int sysctl_discovery_slots = 6; /* 6 slots by default */
int sysctl_lap_keepalive_time = LM_IDLE_TIMEOUT * 1000 / HZ;
char sysctl_devname[65];
@@ -67,7 +66,6 @@ const char *irlmp_reasons[] = {
"LM_INIT_DISCONNECT",
"ERROR, NOT USED",
};
-EXPORT_SYMBOL(irlmp_reasons);
/*
* Function irlmp_init (void)
@@ -675,7 +673,6 @@ struct lsap_cb *irlmp_dup(struct lsap_cb *orig, void *instance)
return new;
}
-EXPORT_SYMBOL(irlmp_dup);
/*
* Function irlmp_disconnect_request (handle, userdata)
diff --git a/net/irda/irmod.c b/net/irda/irmod.c
index 6ffaed4544e..634901dd156 100644
--- a/net/irda/irmod.c
+++ b/net/irda/irmod.c
@@ -54,7 +54,7 @@ extern int irsock_init(void);
extern void irsock_cleanup(void);
/* irlap_frame.c */
extern int irlap_driver_rcv(struct sk_buff *, struct net_device *,
- struct packet_type *);
+ struct packet_type *, struct net_device *);
/*
* Module parameters
diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h
index 9004f7349a7..b391cb3893d 100644
--- a/net/irda/irnet/irnet.h
+++ b/net/irda/irnet/irnet.h
@@ -517,9 +517,6 @@ extern int
irda_irnet_init(void); /* Initialise IrDA part of IrNET */
extern void
irda_irnet_cleanup(void); /* Teardown IrDA part of IrNET */
-/* ---------------------------- MODULE ---------------------------- */
-extern int
- irnet_init(void); /* Initialise IrNET module */
/**************************** VARIABLES ****************************/
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
index f8f984bb992..e53bf9e0053 100644
--- a/net/irda/irnet/irnet_ppp.c
+++ b/net/irda/irnet/irnet_ppp.c
@@ -1107,7 +1107,7 @@ ppp_irnet_cleanup(void)
/*
* Module main entry point
*/
-int __init
+static int __init
irnet_init(void)
{
int err;
diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c
index b0dd3ea3599..1ba8c710663 100644
--- a/net/irda/irqueue.c
+++ b/net/irda/irqueue.c
@@ -822,7 +822,6 @@ void* hashbin_find_next( hashbin_t* hashbin, long hashv, const char* name,
return entry;
}
-EXPORT_SYMBOL(hashbin_find_next);
/*
* Function hashbin_get_first (hashbin)
diff --git a/net/irda/irttp.c b/net/irda/irttp.c
index 6602d901f8b..8aff254cb41 100644
--- a/net/irda/irttp.c
+++ b/net/irda/irttp.c
@@ -38,7 +38,7 @@
#include <net/irda/parameters.h>
#include <net/irda/irttp.h>
-static struct irttp_cb *irttp = NULL;
+static struct irttp_cb *irttp;
static void __irttp_close_tsap(struct tsap_cb *self);
@@ -86,12 +86,9 @@ static pi_param_info_t param_info = { pi_major_call_table, 1, 0x0f, 4 };
*/
int __init irttp_init(void)
{
- /* Initialize the irttp structure. */
- if (irttp == NULL) {
- irttp = kmalloc(sizeof(struct irttp_cb), GFP_KERNEL);
- if (irttp == NULL)
- return -ENOMEM;
- }
+ irttp = kmalloc(sizeof(struct irttp_cb), GFP_KERNEL);
+ if (irttp == NULL)
+ return -ENOMEM;
memset(irttp, 0, sizeof(struct irttp_cb));
irttp->magic = TTP_MAGIC;
@@ -100,6 +97,7 @@ int __init irttp_init(void)
if (!irttp->tsaps) {
IRDA_ERROR("%s: can't allocate IrTTP hashbin!\n",
__FUNCTION__);
+ kfree(irttp);
return -ENOMEM;
}
@@ -115,7 +113,6 @@ int __init irttp_init(void)
void __exit irttp_cleanup(void)
{
/* Check for main structure */
- IRDA_ASSERT(irttp != NULL, return;);
IRDA_ASSERT(irttp->magic == TTP_MAGIC, return;);
/*
@@ -382,7 +379,6 @@ struct tsap_cb *irttp_open_tsap(__u8 stsap_sel, int credit, notify_t *notify)
struct lsap_cb *lsap;
notify_t ttp_notify;
- IRDA_ASSERT(irttp != NULL, return NULL;);
IRDA_ASSERT(irttp->magic == TTP_MAGIC, return NULL;);
/* The IrLMP spec (IrLMP 1.1 p10) says that we have the right to
@@ -1880,8 +1876,6 @@ static int irttp_seq_open(struct inode *inode, struct file *file)
struct seq_file *seq;
int rc = -ENOMEM;
struct irttp_iter_state *s;
-
- IRDA_ASSERT(irttp != NULL, return -EINVAL;);
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
diff --git a/net/irda/qos.c b/net/irda/qos.c
index df732d56cc5..ddfb5c502a9 100644
--- a/net/irda/qos.c
+++ b/net/irda/qos.c
@@ -37,6 +37,7 @@
#include <net/irda/parameters.h>
#include <net/irda/qos.h>
#include <net/irda/irlap.h>
+#include <net/irda/irlap_frame.h>
/*
* Maximum values of the baud rate we negociate with the other end.
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 4879743b945..39031684b65 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -185,7 +185,7 @@ static int pfkey_release(struct socket *sock)
}
static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2,
- int allocation, struct sock *sk)
+ gfp_t allocation, struct sock *sk)
{
int err = -ENOBUFS;
@@ -217,7 +217,7 @@ static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2,
#define BROADCAST_ONE 1
#define BROADCAST_REGISTERED 2
#define BROADCAST_PROMISC_ONLY 4
-static int pfkey_broadcast(struct sk_buff *skb, int allocation,
+static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
int broadcast_flags, struct sock *one_sk)
{
struct sock *sk;
@@ -1416,7 +1416,8 @@ static int pfkey_get(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
return 0;
}
-static struct sk_buff *compose_sadb_supported(struct sadb_msg *orig, int allocation)
+static struct sk_buff *compose_sadb_supported(struct sadb_msg *orig,
+ gfp_t allocation)
{
struct sk_buff *skb;
struct sadb_msg *hdr;
@@ -2153,6 +2154,7 @@ out:
static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
{
+ unsigned int dir;
int err;
struct sadb_x_policy *pol;
struct xfrm_policy *xp;
@@ -2161,7 +2163,11 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL)
return -EINVAL;
- xp = xfrm_policy_byid(0, pol->sadb_x_policy_id,
+ dir = xfrm_policy_id2dir(pol->sadb_x_policy_id);
+ if (dir >= XFRM_POLICY_MAX)
+ return -EINVAL;
+
+ xp = xfrm_policy_byid(dir, pol->sadb_x_policy_id,
hdr->sadb_msg_type == SADB_X_SPDDELETE2);
if (xp == NULL)
return -ENOENT;
@@ -2173,9 +2179,9 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
if (hdr->sadb_msg_type == SADB_X_SPDDELETE2) {
c.data.byid = 1;
c.event = XFRM_MSG_DELPOLICY;
- km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
+ km_policy_notify(xp, dir, &c);
} else {
- err = key_pol_get_resp(sk, xp, hdr, pol->sadb_x_policy_dir-1);
+ err = key_pol_get_resp(sk, xp, hdr, dir);
}
xfrm_pol_put(xp);
diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c
index 5de05a0bc0f..8b5eefd70f0 100644
--- a/net/lapb/lapb_subr.c
+++ b/net/lapb/lapb_subr.c
@@ -78,7 +78,7 @@ void lapb_requeue_frames(struct lapb_cb *lapb)
if (!skb_prev)
skb_queue_head(&lapb->write_queue, skb);
else
- skb_append(skb_prev, skb);
+ skb_append(skb_prev, skb, &lapb->write_queue);
skb_prev = skb;
}
}
diff --git a/net/llc/Makefile b/net/llc/Makefile
index 5ebd4ed2bd4..4e260cff3c5 100644
--- a/net/llc/Makefile
+++ b/net/llc/Makefile
@@ -22,3 +22,4 @@ llc2-y := llc_if.o llc_c_ev.o llc_c_ac.o llc_conn.o llc_c_st.o llc_pdu.o \
llc_sap.o llc_s_ac.o llc_s_ev.o llc_s_st.o af_llc.o llc_station.o
llc2-$(CONFIG_PROC_FS) += llc_proc.o
+llc2-$(CONFIG_SYSCTL) += sysctl_net_llc.o
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 20b4cfebd74..59d02cbbeb9 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -21,15 +21,16 @@
* See the GNU General Public License for more details.
*/
#include <linux/config.h>
+#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/tcp.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <net/llc.h>
#include <net/llc_sap.h>
#include <net/llc_pdu.h>
#include <net/llc_conn.h>
+#include <net/tcp_states.h>
/* remember: uninitialized global data is zeroed because its in .bss */
static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
@@ -37,10 +38,9 @@ static u16 llc_ui_sap_link_no_max[256];
static struct sockaddr_llc llc_ui_addrnull;
static struct proto_ops llc_ui_ops;
-static int llc_ui_wait_for_conn(struct sock *sk, int timeout);
-static int llc_ui_wait_for_disc(struct sock *sk, int timeout);
-static int llc_ui_wait_for_data(struct sock *sk, int timeout);
-static int llc_ui_wait_for_busy_core(struct sock *sk, int timeout);
+static int llc_ui_wait_for_conn(struct sock *sk, long timeout);
+static int llc_ui_wait_for_disc(struct sock *sk, long timeout);
+static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout);
#if 0
#define dprintk(args...) printk(KERN_DEBUG args)
@@ -116,12 +116,12 @@ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock)
struct llc_sock* llc = llc_sk(sk);
int rc = 0;
- if (llc_data_accept_state(llc->state) || llc->p_flag) {
- int timeout = sock_sndtimeo(sk, noblock);
+ if (unlikely(llc_data_accept_state(llc->state) || llc->p_flag)) {
+ long timeout = sock_sndtimeo(sk, noblock);
rc = llc_ui_wait_for_busy_core(sk, timeout);
}
- if (!rc)
+ if (unlikely(!rc))
rc = llc_build_and_send_pkt(sk, skb);
return rc;
}
@@ -155,7 +155,7 @@ static int llc_ui_create(struct socket *sock, int protocol)
struct sock *sk;
int rc = -ESOCKTNOSUPPORT;
- if (sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM) {
+ if (likely(sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM)) {
rc = -ENOMEM;
sk = llc_sk_alloc(PF_LLC, GFP_KERNEL, &llc_proto);
if (sk) {
@@ -177,7 +177,7 @@ static int llc_ui_release(struct socket *sock)
struct sock *sk = sock->sk;
struct llc_sock *llc;
- if (!sk)
+ if (unlikely(sk == NULL))
goto out;
sock_hold(sk);
lock_sock(sk);
@@ -189,10 +189,6 @@ static int llc_ui_release(struct socket *sock)
if (!sock_flag(sk, SOCK_ZAPPED))
llc_sap_remove_socket(llc->sap, sk);
release_sock(sk);
- if (llc->sap && hlist_empty(&llc->sap->sk_list.list)) {
- llc_release_sockets(llc->sap);
- llc_sap_close(llc->sap);
- }
if (llc->dev)
dev_put(llc->dev);
sock_put(sk);
@@ -221,6 +217,7 @@ static int llc_ui_autoport(void)
llc_ui_sap_last_autoport = i + 2;
goto out;
}
+ llc_sap_put(sap);
}
llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
tries++;
@@ -231,20 +228,13 @@ out:
}
/**
- * llc_ui_autobind - Bind a socket to a specific address.
- * @sk: Socket to bind an address to.
- * @addr: Address the user wants the socket bound to.
+ * llc_ui_autobind - automatically bind a socket to a sap
+ * @sock: socket to bind
+ * @addr: address to connect to
+ *
+ * Used by llc_ui_connect and llc_ui_sendmsg when the user hasn't
+ * specifically used llc_ui_bind to bind to a specific address/sap
*
- * Bind a socket to a specific address. For llc a user is able to bind to
- * a specific sap only or mac + sap. If the user only specifies a sap and
- * a null dmac (all zeros) the user is attempting to bind to an entire
- * sap. This will stop anyone else on the local system from using that
- * sap. If someone else has a mac + sap open the bind to null + sap will
- * fail.
- * If the user desires to bind to a specific mac + sap, it is possible to
- * have multiple sap connections via multiple macs.
- * Bind and autobind for that matter must enforce the correct sap usage
- * otherwise all hell will break loose.
* Returns: 0 upon success, negative otherwise.
*/
static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr)
@@ -285,11 +275,7 @@ out:
* @addrlen: Length of the uaddr structure.
*
* Bind a socket to a specific address. For llc a user is able to bind to
- * a specific sap only or mac + sap. If the user only specifies a sap and
- * a null dmac (all zeros) the user is attempting to bind to an entire
- * sap. This will stop anyone else on the local system from using that
- * sap. If someone else has a mac + sap open the bind to null + sap will
- * fail.
+ * a specific sap only or mac + sap.
* If the user desires to bind to a specific mac + sap, it is possible to
* have multiple sap connections via multiple macs.
* Bind and autobind for that matter must enforce the correct sap usage
@@ -305,10 +291,16 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
int rc = -EINVAL;
dprintk("%s: binding %02X\n", __FUNCTION__, addr->sllc_sap);
- if (!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr))
+ if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr)))
goto out;
rc = -EAFNOSUPPORT;
- if (addr->sllc_family != AF_LLC)
+ if (unlikely(addr->sllc_family != AF_LLC))
+ goto out;
+ rc = -ENODEV;
+ rtnl_lock();
+ llc->dev = dev_getbyhwaddr(addr->sllc_arphrd, addr->sllc_mac);
+ rtnl_unlock();
+ if (!llc->dev)
goto out;
if (!addr->sllc_sap) {
rc = -EUSERS;
@@ -322,6 +314,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
rc = -EBUSY; /* some other network layer is using the sap */
if (!sap)
goto out;
+ llc_sap_hold(sap);
} else {
struct llc_addr laddr, daddr;
struct sock *ask;
@@ -338,7 +331,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
ask = llc_lookup_established(sap, &daddr, &laddr);
if (ask) {
sock_put(ask);
- goto out;
+ goto out_put;
}
}
llc->laddr.lsap = addr->sllc_sap;
@@ -348,6 +341,8 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
llc_sap_add_socket(sap, sk);
sock_reset_flag(sk, SOCK_ZAPPED);
rc = 0;
+out_put:
+ llc_sap_put(sap);
out:
return rc;
}
@@ -369,7 +364,7 @@ static int llc_ui_shutdown(struct socket *sock, int how)
int rc = -ENOTCONN;
lock_sock(sk);
- if (sk->sk_state != TCP_ESTABLISHED)
+ if (unlikely(sk->sk_state != TCP_ESTABLISHED))
goto out;
rc = -EINVAL;
if (how != 2)
@@ -404,14 +399,18 @@ static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr,
struct sock *sk = sock->sk;
struct llc_sock *llc = llc_sk(sk);
struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr;
- struct net_device *dev;
int rc = -EINVAL;
lock_sock(sk);
- if (addrlen != sizeof(*addr))
+ if (unlikely(addrlen != sizeof(*addr)))
goto out;
rc = -EAFNOSUPPORT;
- if (addr->sllc_family != AF_LLC)
+ if (unlikely(addr->sllc_family != AF_LLC))
+ goto out;
+ if (unlikely(sk->sk_type != SOCK_STREAM))
+ goto out;
+ rc = -EALREADY;
+ if (unlikely(sock->state == SS_CONNECTING))
goto out;
/* bind connection to sap if user hasn't done it. */
if (sock_flag(sk, SOCK_ZAPPED)) {
@@ -419,19 +418,13 @@ static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr,
rc = llc_ui_autobind(sock, addr);
if (rc)
goto out;
- llc->daddr.lsap = addr->sllc_sap;
- memcpy(llc->daddr.mac, addr->sllc_mac, IFHWADDRLEN);
}
- dev = llc->dev;
- if (sk->sk_type != SOCK_STREAM)
- goto out;
- rc = -EALREADY;
- if (sock->state == SS_CONNECTING)
- goto out;
+ llc->daddr.lsap = addr->sllc_sap;
+ memcpy(llc->daddr.mac, addr->sllc_mac, IFHWADDRLEN);
sock->state = SS_CONNECTING;
sk->sk_state = TCP_SYN_SENT;
llc->link = llc_ui_next_link_no(llc->sap->laddr.lsap);
- rc = llc_establish_connection(sk, dev->dev_addr,
+ rc = llc_establish_connection(sk, llc->dev->dev_addr,
addr->sllc_mac, addr->sllc_sap);
if (rc) {
dprintk("%s: llc_ui_send_conn failed :-(\n", __FUNCTION__);
@@ -439,12 +432,30 @@ static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr,
sk->sk_state = TCP_CLOSE;
goto out;
}
- rc = llc_ui_wait_for_conn(sk, sk->sk_rcvtimeo);
- if (rc)
- dprintk("%s: llc_ui_wait_for_conn failed=%d\n", __FUNCTION__, rc);
+
+ if (sk->sk_state == TCP_SYN_SENT) {
+ const long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+ if (!timeo || !llc_ui_wait_for_conn(sk, timeo))
+ goto out;
+
+ rc = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ goto out;
+ }
+
+ if (sk->sk_state == TCP_CLOSE)
+ goto sock_error;
+
+ sock->state = SS_CONNECTED;
+ rc = 0;
out:
release_sock(sk);
return rc;
+sock_error:
+ rc = sock_error(sk) ? : -ECONNABORTED;
+ sock->state = SS_UNCONNECTED;
+ goto out;
}
/**
@@ -461,10 +472,10 @@ static int llc_ui_listen(struct socket *sock, int backlog)
int rc = -EINVAL;
lock_sock(sk);
- if (sock->state != SS_UNCONNECTED)
+ if (unlikely(sock->state != SS_UNCONNECTED))
goto out;
rc = -EOPNOTSUPP;
- if (sk->sk_type != SOCK_STREAM)
+ if (unlikely(sk->sk_type != SOCK_STREAM))
goto out;
rc = -EAGAIN;
if (sock_flag(sk, SOCK_ZAPPED))
@@ -483,20 +494,14 @@ out:
return rc;
}
-static int llc_ui_wait_for_disc(struct sock *sk, int timeout)
+static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
{
- DECLARE_WAITQUEUE(wait, current);
- int rc;
+ DEFINE_WAIT(wait);
+ int rc = 0;
- add_wait_queue_exclusive(sk->sk_sleep, &wait);
- for (;;) {
- __set_current_state(TASK_INTERRUPTIBLE);
- rc = 0;
- if (sk->sk_state != TCP_CLOSE) {
- release_sock(sk);
- timeout = schedule_timeout(timeout);
- lock_sock(sk);
- } else
+ while (1) {
+ prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE))
break;
rc = -ERESTARTSYS;
if (signal_pending(current))
@@ -504,65 +509,40 @@ static int llc_ui_wait_for_disc(struct sock *sk, int timeout)
rc = -EAGAIN;
if (!timeout)
break;
+ rc = 0;
}
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(sk->sk_sleep, &wait);
+ finish_wait(sk->sk_sleep, &wait);
return rc;
}
-static int llc_ui_wait_for_conn(struct sock *sk, int timeout)
+static int llc_ui_wait_for_conn(struct sock *sk, long timeout)
{
- DECLARE_WAITQUEUE(wait, current);
- int rc;
+ DEFINE_WAIT(wait);
- add_wait_queue_exclusive(sk->sk_sleep, &wait);
- for (;;) {
- __set_current_state(TASK_INTERRUPTIBLE);
- rc = -EAGAIN;
- if (sk->sk_state == TCP_CLOSE)
- break;
- rc = 0;
- if (sk->sk_state != TCP_ESTABLISHED) {
- release_sock(sk);
- timeout = schedule_timeout(timeout);
- lock_sock(sk);
- } else
+ while (1) {
+ prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ if (sk_wait_event(sk, &timeout, sk->sk_state != TCP_SYN_SENT))
break;
- rc = -ERESTARTSYS;
- if (signal_pending(current))
- break;
- rc = -EAGAIN;
- if (!timeout)
+ if (signal_pending(current) || !timeout)
break;
}
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(sk->sk_sleep, &wait);
- return rc;
+ finish_wait(sk->sk_sleep, &wait);
+ return timeout;
}
-static int llc_ui_wait_for_data(struct sock *sk, int timeout)
+static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout)
{
- DECLARE_WAITQUEUE(wait, current);
- int rc = 0;
+ DEFINE_WAIT(wait);
+ struct llc_sock *llc = llc_sk(sk);
+ int rc;
- add_wait_queue_exclusive(sk->sk_sleep, &wait);
- for (;;) {
- __set_current_state(TASK_INTERRUPTIBLE);
- if (sk->sk_shutdown & RCV_SHUTDOWN)
- break;
- /*
- * Well, if we have backlog, try to process it now.
- */
- if (sk->sk_backlog.tail) {
- release_sock(sk);
- lock_sock(sk);
- }
+ while (1) {
+ prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
rc = 0;
- if (skb_queue_empty(&sk->sk_receive_queue)) {
- release_sock(sk);
- timeout = schedule_timeout(timeout);
- lock_sock(sk);
- } else
+ if (sk_wait_event(sk, &timeout,
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ (!llc_data_accept_state(llc->state) &&
+ !llc->p_flag)))
break;
rc = -ERESTARTSYS;
if (signal_pending(current))
@@ -571,40 +551,35 @@ static int llc_ui_wait_for_data(struct sock *sk, int timeout)
if (!timeout)
break;
}
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(sk->sk_sleep, &wait);
+ finish_wait(sk->sk_sleep, &wait);
return rc;
}
-static int llc_ui_wait_for_busy_core(struct sock *sk, int timeout)
+static int llc_wait_data(struct sock *sk, long timeo)
{
- DECLARE_WAITQUEUE(wait, current);
- struct llc_sock *llc = llc_sk(sk);
int rc;
- add_wait_queue_exclusive(sk->sk_sleep, &wait);
- for (;;) {
- dprintk("%s: looping...\n", __FUNCTION__);
- __set_current_state(TASK_INTERRUPTIBLE);
- rc = -ENOTCONN;
- if (sk->sk_shutdown & RCV_SHUTDOWN)
+ while (1) {
+ /*
+ * POSIX 1003.1g mandates this order.
+ */
+ if (sk->sk_err) {
+ rc = sock_error(sk);
break;
+ }
rc = 0;
- if (llc_data_accept_state(llc->state) || llc->p_flag) {
- release_sock(sk);
- timeout = schedule_timeout(timeout);
- lock_sock(sk);
- } else
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
- rc = -ERESTARTSYS;
+ rc = -EAGAIN;
+ if (!timeo)
+ break;
+ rc = sock_intr_errno(timeo);
if (signal_pending(current))
break;
- rc = -EAGAIN;
- if (!timeout)
+ rc = 0;
+ if (sk_wait_data(sk, &timeo))
break;
}
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(sk->sk_sleep, &wait);
return rc;
}
@@ -627,15 +602,18 @@ static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags)
dprintk("%s: accepting on %02X\n", __FUNCTION__,
llc_sk(sk)->laddr.lsap);
lock_sock(sk);
- if (sk->sk_type != SOCK_STREAM)
+ if (unlikely(sk->sk_type != SOCK_STREAM))
goto out;
rc = -EINVAL;
- if (sock->state != SS_UNCONNECTED || sk->sk_state != TCP_LISTEN)
+ if (unlikely(sock->state != SS_UNCONNECTED ||
+ sk->sk_state != TCP_LISTEN))
goto out;
/* wait for a connection to arrive. */
- rc = llc_ui_wait_for_data(sk, sk->sk_rcvtimeo);
- if (rc)
- goto out;
+ if (skb_queue_empty(&sk->sk_receive_queue)) {
+ rc = llc_wait_data(sk, sk->sk_rcvtimeo);
+ if (rc)
+ goto out;
+ }
dprintk("%s: got a new connection on %02X\n", __FUNCTION__,
llc_sk(sk)->laddr.lsap);
skb = skb_dequeue(&sk->sk_receive_queue);
@@ -657,7 +635,6 @@ static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags)
/* put original socket back into a clean listen state. */
sk->sk_state = TCP_LISTEN;
sk->sk_ack_backlog--;
- skb->sk = NULL;
dprintk("%s: ok success on %02X, client on %02X\n", __FUNCTION__,
llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap);
frees:
@@ -671,56 +648,167 @@ out:
* llc_ui_recvmsg - copy received data to the socket user.
* @sock: Socket to copy data from.
* @msg: Various user space related information.
- * @size: Size of user buffer.
+ * @len: Size of user buffer.
* @flags: User specified flags.
*
* Copy received data to the socket user.
* Returns non-negative upon success, negative otherwise.
*/
static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t size, int flags)
+ struct msghdr *msg, size_t len, int flags)
{
- struct sock *sk = sock->sk;
struct sockaddr_llc *uaddr = (struct sockaddr_llc *)msg->msg_name;
- struct sk_buff *skb;
+ const int nonblock = flags & MSG_DONTWAIT;
+ struct sk_buff *skb = NULL;
+ struct sock *sk = sock->sk;
+ struct llc_sock *llc = llc_sk(sk);
size_t copied = 0;
- int rc = -ENOMEM, timeout;
- int noblock = flags & MSG_DONTWAIT;
+ u32 peek_seq = 0;
+ u32 *seq;
+ unsigned long used;
+ int target; /* Read at least this many bytes */
+ long timeo;
- dprintk("%s: receiving in %02X from %02X\n", __FUNCTION__,
- llc_sk(sk)->laddr.lsap, llc_sk(sk)->daddr.lsap);
lock_sock(sk);
- timeout = sock_rcvtimeo(sk, noblock);
- rc = llc_ui_wait_for_data(sk, timeout);
- if (rc) {
- dprintk("%s: llc_ui_wait_for_data failed recv "
- "in %02X from %02X\n", __FUNCTION__,
- llc_sk(sk)->laddr.lsap, llc_sk(sk)->daddr.lsap);
+ copied = -ENOTCONN;
+ if (sk->sk_state == TCP_LISTEN)
goto out;
- }
- skb = skb_dequeue(&sk->sk_receive_queue);
- if (!skb) /* shutdown */
- goto out;
- copied = skb->len;
- if (copied > size)
- copied = size;
- rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
- if (rc)
- goto dgram_free;
- if (skb->len > copied) {
- skb_pull(skb, copied);
- skb_queue_head(&sk->sk_receive_queue, skb);
- }
- if (uaddr)
- memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr));
- msg->msg_namelen = sizeof(*uaddr);
- if (!skb->list) {
-dgram_free:
- kfree_skb(skb);
- }
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+
+ seq = &llc->copied_seq;
+ if (flags & MSG_PEEK) {
+ peek_seq = llc->copied_seq;
+ seq = &peek_seq;
+ }
+
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+ copied = 0;
+
+ do {
+ u32 offset;
+
+ /*
+ * We need to check signals first, to get correct SIGURG
+ * handling. FIXME: Need to check this doesn't impact 1003.1g
+ * and move it down to the bottom of the loop
+ */
+ if (signal_pending(current)) {
+ if (copied)
+ break;
+ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+ break;
+ }
+
+ /* Next get a buffer. */
+
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb) {
+ offset = *seq;
+ goto found_ok_skb;
+ }
+ /* Well, if we have backlog, try to process it now. */
+
+ if (copied >= target && !sk->sk_backlog.tail)
+ break;
+
+ if (copied) {
+ if (sk->sk_err ||
+ sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ !timeo ||
+ (flags & MSG_PEEK))
+ break;
+ } else {
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+
+ if (sk->sk_err) {
+ copied = sock_error(sk);
+ break;
+ }
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+
+ if (sk->sk_state == TCP_CLOSE) {
+ if (!sock_flag(sk, SOCK_DONE)) {
+ /*
+ * This occurs when the user tries to read
+ * from a never-connected socket.
+ */
+ copied = -ENOTCONN;
+ break;
+ }
+ break;
+ }
+ if (!timeo) {
+ copied = -EAGAIN;
+ break;
+ }
+ }
+
+ if (copied >= target) { /* Do not sleep, just process backlog. */
+ release_sock(sk);
+ lock_sock(sk);
+ } else
+ sk_wait_data(sk, &timeo);
+
+ if ((flags & MSG_PEEK) && peek_seq != llc->copied_seq) {
+ if (net_ratelimit())
+ printk(KERN_DEBUG "LLC(%s:%d): Application "
+ "bug, race in MSG_PEEK.\n",
+ current->comm, current->pid);
+ peek_seq = llc->copied_seq;
+ }
+ continue;
+ found_ok_skb:
+ /* Ok so how much can we use? */
+ used = skb->len - offset;
+ if (len < used)
+ used = len;
+
+ if (!(flags & MSG_TRUNC)) {
+ int rc = skb_copy_datagram_iovec(skb, offset,
+ msg->msg_iov, used);
+ if (rc) {
+ /* Exception. Bailout! */
+ if (!copied)
+ copied = -EFAULT;
+ break;
+ }
+ }
+
+ *seq += used;
+ copied += used;
+ len -= used;
+
+ if (used + offset < skb->len)
+ continue;
+
+ if (!(flags & MSG_PEEK)) {
+ sk_eat_skb(sk, skb);
+ *seq = 0;
+ }
+ } while (len > 0);
+
+ /*
+ * According to UNIX98, msg_name/msg_namelen are ignored
+ * on connected socket. -ANK
+ * But... af_llc still doesn't have separate sets of methods for
+ * SOCK_DGRAM and SOCK_STREAM :-( So we have to do this test, will
+ * eventually fix this tho :-) -acme
+ */
+ if (sk->sk_type == SOCK_DGRAM)
+ goto copy_uaddr;
out:
release_sock(sk);
- return rc ? : copied;
+ return copied;
+copy_uaddr:
+ if (uaddr != NULL && skb != NULL) {
+ memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr));
+ msg->msg_namelen = sizeof(*uaddr);
+ }
+ goto out;
}
/**
@@ -740,7 +828,6 @@ static int llc_ui_sendmsg(struct kiocb *iocb, struct socket *sock,
struct sockaddr_llc *addr = (struct sockaddr_llc *)msg->msg_name;
int flags = msg->msg_flags;
int noblock = flags & MSG_DONTWAIT;
- struct net_device *dev;
struct sk_buff *skb;
size_t size = 0;
int rc = -EINVAL, copied = 0, hdrlen;
@@ -763,19 +850,17 @@ static int llc_ui_sendmsg(struct kiocb *iocb, struct socket *sock,
if (rc)
goto release;
}
- dev = llc->dev;
- hdrlen = dev->hard_header_len + llc_ui_header_len(sk, addr);
+ hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr);
size = hdrlen + len;
- if (size > dev->mtu)
- size = dev->mtu;
+ if (size > llc->dev->mtu)
+ size = llc->dev->mtu;
copied = size - hdrlen;
release_sock(sk);
skb = sock_alloc_send_skb(sk, size, noblock, &rc);
lock_sock(sk);
if (!skb)
goto release;
- skb->sk = sk;
- skb->dev = dev;
+ skb->dev = llc->dev;
skb->protocol = llc_proto_type(addr->sllc_arphrd);
skb_reserve(skb, hdrlen);
rc = memcpy_fromiovec(skb_put(skb, copied), msg->msg_iov, copied);
@@ -800,15 +885,13 @@ static int llc_ui_sendmsg(struct kiocb *iocb, struct socket *sock,
if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua))
goto out;
rc = llc_ui_send_data(sk, skb, noblock);
- if (rc)
- dprintk("%s: llc_ui_send_data failed: %d\n", __FUNCTION__, rc);
out:
- if (rc)
+ if (rc) {
kfree_skb(skb);
release:
- if (rc)
dprintk("%s: failed sending from %02X to %02X: %d\n",
__FUNCTION__, llc->laddr.lsap, llc->daddr.lsap, rc);
+ }
release_sock(sk);
return rc ? : copied;
}
@@ -895,7 +978,7 @@ static int llc_ui_setsockopt(struct socket *sock, int level, int optname,
int rc = -EINVAL, opt;
lock_sock(sk);
- if (level != SOL_LLC || optlen != sizeof(int))
+ if (unlikely(level != SOL_LLC || optlen != sizeof(int)))
goto out;
rc = get_user(opt, (int __user *)optval);
if (rc)
@@ -915,22 +998,22 @@ static int llc_ui_setsockopt(struct socket *sock, int level, int optname,
case LLC_OPT_ACK_TMR_EXP:
if (opt > LLC_OPT_MAX_ACK_TMR_EXP)
goto out;
- llc->ack_timer.expire = opt;
+ llc->ack_timer.expire = opt * HZ;
break;
case LLC_OPT_P_TMR_EXP:
if (opt > LLC_OPT_MAX_P_TMR_EXP)
goto out;
- llc->pf_cycle_timer.expire = opt;
+ llc->pf_cycle_timer.expire = opt * HZ;
break;
case LLC_OPT_REJ_TMR_EXP:
if (opt > LLC_OPT_MAX_REJ_TMR_EXP)
goto out;
- llc->rej_sent_timer.expire = opt;
+ llc->rej_sent_timer.expire = opt * HZ;
break;
case LLC_OPT_BUSY_TMR_EXP:
if (opt > LLC_OPT_MAX_BUSY_TMR_EXP)
goto out;
- llc->busy_state_timer.expire = opt;
+ llc->busy_state_timer.expire = opt * HZ;
break;
case LLC_OPT_TX_WIN:
if (opt > LLC_OPT_MAX_WIN)
@@ -970,7 +1053,7 @@ static int llc_ui_getsockopt(struct socket *sock, int level, int optname,
int val = 0, len = 0, rc = -EINVAL;
lock_sock(sk);
- if (level != SOL_LLC)
+ if (unlikely(level != SOL_LLC))
goto out;
rc = get_user(len, optlen);
if (rc)
@@ -980,17 +1063,17 @@ static int llc_ui_getsockopt(struct socket *sock, int level, int optname,
goto out;
switch (optname) {
case LLC_OPT_RETRY:
- val = llc->n2; break;
+ val = llc->n2; break;
case LLC_OPT_SIZE:
- val = llc->n1; break;
+ val = llc->n1; break;
case LLC_OPT_ACK_TMR_EXP:
- val = llc->ack_timer.expire; break;
+ val = llc->ack_timer.expire / HZ; break;
case LLC_OPT_P_TMR_EXP:
- val = llc->pf_cycle_timer.expire; break;
+ val = llc->pf_cycle_timer.expire / HZ; break;
case LLC_OPT_REJ_TMR_EXP:
- val = llc->rej_sent_timer.expire; break;
+ val = llc->rej_sent_timer.expire / HZ; break;
case LLC_OPT_BUSY_TMR_EXP:
- val = llc->busy_state_timer.expire; break;
+ val = llc->busy_state_timer.expire / HZ; break;
case LLC_OPT_TX_WIN:
val = llc->k; break;
case LLC_OPT_RX_WIN:
@@ -1034,8 +1117,12 @@ static struct proto_ops llc_ui_ops = {
.sendpage = sock_no_sendpage,
};
-extern void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb);
-extern void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb);
+static char llc_proc_err_msg[] __initdata =
+ KERN_CRIT "LLC: Unable to register the proc_fs entries\n";
+static char llc_sysctl_err_msg[] __initdata =
+ KERN_CRIT "LLC: Unable to register the sysctl entries\n";
+static char llc_sock_err_msg[] __initdata =
+ KERN_CRIT "LLC: Unable to register the network family\n";
static int __init llc2_init(void)
{
@@ -1048,13 +1135,28 @@ static int __init llc2_init(void)
llc_station_init();
llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
rc = llc_proc_init();
- if (rc != 0)
+ if (rc != 0) {
+ printk(llc_proc_err_msg);
goto out_unregister_llc_proto;
- sock_register(&llc_ui_family_ops);
+ }
+ rc = llc_sysctl_init();
+ if (rc) {
+ printk(llc_sysctl_err_msg);
+ goto out_proc;
+ }
+ rc = sock_register(&llc_ui_family_ops);
+ if (rc) {
+ printk(llc_sock_err_msg);
+ goto out_sysctl;
+ }
llc_add_pack(LLC_DEST_SAP, llc_sap_handler);
llc_add_pack(LLC_DEST_CONN, llc_conn_handler);
out:
return rc;
+out_sysctl:
+ llc_sysctl_exit();
+out_proc:
+ llc_proc_exit();
out_unregister_llc_proto:
proto_unregister(&llc_proto);
goto out;
@@ -1067,6 +1169,7 @@ static void __exit llc2_exit(void)
llc_remove_pack(LLC_DEST_CONN);
sock_unregister(PF_LLC);
llc_proc_exit();
+ llc_sysctl_exit();
proto_unregister(&llc_proto);
}
diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c
index b218be4c10e..b0bcfb1f12d 100644
--- a/net/llc/llc_c_ac.c
+++ b/net/llc/llc_c_ac.c
@@ -60,23 +60,10 @@ int llc_conn_ac_clear_remote_busy(struct sock *sk, struct sk_buff *skb)
int llc_conn_ac_conn_ind(struct sock *sk, struct sk_buff *skb)
{
- int rc = -ENOTCONN;
- u8 dsap;
- struct llc_sap *sap;
-
- llc_pdu_decode_dsap(skb, &dsap);
- sap = llc_sap_find(dsap);
- if (sap) {
- struct llc_conn_state_ev *ev = llc_conn_ev(skb);
- struct llc_sock *llc = llc_sk(sk);
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
- llc_pdu_decode_sa(skb, llc->daddr.mac);
- llc_pdu_decode_da(skb, llc->laddr.mac);
- llc->dev = skb->dev;
- ev->ind_prim = LLC_CONN_PRIM;
- rc = 0;
- }
- return rc;
+ ev->ind_prim = LLC_CONN_PRIM;
+ return 0;
}
int llc_conn_ac_conn_confirm(struct sock *sk, struct sk_buff *skb)
@@ -120,10 +107,8 @@ int llc_conn_ac_disc_ind(struct sock *sk, struct sk_buff *skb)
reason = LLC_DISC_REASON_RX_DISC_CMD_PDU;
} else if (ev->type == LLC_CONN_EV_TYPE_ACK_TMR)
reason = LLC_DISC_REASON_ACK_TMR_EXP;
- else {
- reason = 0;
+ else
rc = -EINVAL;
- }
if (!rc) {
ev->reason = reason;
ev->ind_prim = LLC_DISC_PRIM;
@@ -160,9 +145,6 @@ int llc_conn_ac_rst_ind(struct sock *sk, struct sk_buff *skb)
LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME) {
reason = LLC_RESET_REASON_REMOTE;
rc = 0;
- } else {
- reason = 0;
- rc = 1;
}
break;
case LLC_CONN_EV_TYPE_ACK_TMR:
@@ -172,8 +154,7 @@ int llc_conn_ac_rst_ind(struct sock *sk, struct sk_buff *skb)
if (llc->retry_count > llc->n2) {
reason = LLC_RESET_REASON_LOCAL;
rc = 0;
- } else
- rc = 1;
+ }
break;
}
if (!rc) {
@@ -217,18 +198,17 @@ int llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2(struct sock *sk,
int llc_conn_ac_send_disc_cmd_p_set_x(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_disc_cmd(nskb, 1);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
llc_conn_ac_set_p_flag_1(sk, skb);
@@ -243,20 +223,19 @@ free:
int llc_conn_ac_send_dm_rsp_f_set_p(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
u8 f_bit;
- nskb->dev = llc->dev;
llc_pdu_decode_pf_bit(skb, &f_bit);
llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
llc_pdu_init_as_dm_rsp(nskb, f_bit);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -270,19 +249,17 @@ free:
int llc_conn_ac_send_dm_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
- u8 f_bit = 1;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
- llc_pdu_init_as_dm_rsp(nskb, f_bit);
+ llc_pdu_init_as_dm_rsp(nskb, 1);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -306,17 +283,16 @@ int llc_conn_ac_send_frmr_rsp_f_set_x(struct sock *sk, struct sk_buff *skb)
llc_pdu_decode_pf_bit(skb, &f_bit);
else
f_bit = 0;
- nskb = llc_alloc_frame();
+ nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
struct llc_sap *sap = llc->sap;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
llc_pdu_init_as_frmr_rsp(nskb, pdu, f_bit, llc->vS,
llc->vR, INCORRECT);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -330,21 +306,19 @@ free:
int llc_conn_ac_resend_frmr_rsp_f_set_0(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- u8 f_bit = 0;
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
struct llc_pdu_sn *pdu = (struct llc_pdu_sn *)&llc->rx_pdu_hdr;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
- llc_pdu_init_as_frmr_rsp(nskb, pdu, f_bit, llc->vS,
+ llc_pdu_init_as_frmr_rsp(nskb, pdu, 0, llc->vS,
llc->vR, INCORRECT);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -360,21 +334,20 @@ int llc_conn_ac_resend_frmr_rsp_f_set_p(struct sock *sk, struct sk_buff *skb)
u8 f_bit;
int rc = -ENOBUFS;
struct sk_buff *nskb;
+ struct llc_sock *llc = llc_sk(sk);
llc_pdu_decode_pf_bit(skb, &f_bit);
- nskb = llc_alloc_frame();
+ nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
llc_pdu_init_as_frmr_rsp(nskb, pdu, f_bit, llc->vS,
llc->vR, INCORRECT);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -395,7 +368,7 @@ int llc_conn_ac_send_i_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
llc->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_i_cmd(skb, 1, llc->vS, llc->vR);
rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
- if (!rc) {
+ if (likely(!rc)) {
llc_conn_send_pdu(sk, skb);
llc_conn_ac_inc_vs_by_1(sk, skb);
}
@@ -412,7 +385,7 @@ static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb)
llc->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR);
rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
- if (!rc) {
+ if (likely(!rc)) {
llc_conn_send_pdu(sk, skb);
llc_conn_ac_inc_vs_by_1(sk, skb);
}
@@ -429,7 +402,7 @@ int llc_conn_ac_send_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
llc->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR);
rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
- if (!rc) {
+ if (likely(!rc)) {
llc_conn_send_pdu(sk, skb);
llc_conn_ac_inc_vs_by_1(sk, skb);
}
@@ -451,18 +424,17 @@ int llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr(struct sock *sk,
u8 nr;
struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (!rc)
+ if (likely(!rc))
llc_conn_send_pdu(sk, nskb);
else
kfree_skb(skb);
@@ -487,18 +459,17 @@ int llc_conn_ac_resend_i_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
int llc_conn_ac_send_rej_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_rej_cmd(nskb, 1, llc->vR);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -512,19 +483,17 @@ free:
int llc_conn_ac_send_rej_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- u8 f_bit = 1;
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
- llc_pdu_init_as_rej_rsp(nskb, f_bit, llc->vR);
+ llc_pdu_init_as_rej_rsp(nskb, 1, llc->vR);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -538,19 +507,17 @@ free:
int llc_conn_ac_send_rej_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
- u8 f_bit = 0;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
- llc_pdu_init_as_rej_rsp(nskb, f_bit, llc->vR);
+ llc_pdu_init_as_rej_rsp(nskb, 0, llc->vR);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -564,18 +531,17 @@ free:
int llc_conn_ac_send_rnr_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_rnr_cmd(nskb, 1, llc->vR);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -589,19 +555,17 @@ free:
int llc_conn_ac_send_rnr_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
- u8 f_bit = 1;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
- llc_pdu_init_as_rnr_rsp(nskb, f_bit, llc->vR);
+ llc_pdu_init_as_rnr_rsp(nskb, 1, llc->vR);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -615,19 +579,17 @@ free:
int llc_conn_ac_send_rnr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- u8 f_bit = 0;
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
- llc_pdu_init_as_rnr_rsp(nskb, f_bit, llc->vR);
+ llc_pdu_init_as_rnr_rsp(nskb, 0, llc->vR);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -645,7 +607,7 @@ int llc_conn_ac_set_remote_busy(struct sock *sk, struct sk_buff *skb)
if (!llc->remote_busy_flag) {
llc->remote_busy_flag = 1;
mod_timer(&llc->busy_state_timer.timer,
- jiffies + llc->busy_state_timer.expire * HZ);
+ jiffies + llc->busy_state_timer.expire);
}
return 0;
}
@@ -653,18 +615,17 @@ int llc_conn_ac_set_remote_busy(struct sock *sk, struct sk_buff *skb)
int llc_conn_ac_opt_send_rnr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
llc_pdu_init_as_rnr_rsp(nskb, 0, llc->vR);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -678,18 +639,17 @@ free:
int llc_conn_ac_send_rr_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_rr_cmd(nskb, 1, llc->vR);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -703,19 +663,18 @@ free:
int llc_conn_ac_send_rr_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
u8 f_bit = 1;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
llc_pdu_init_as_rr_rsp(nskb, f_bit, llc->vR);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -729,19 +688,17 @@ free:
int llc_conn_ac_send_ack_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
- u8 f_bit = 1;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
- llc_pdu_init_as_rr_rsp(nskb, f_bit, llc->vR);
+ llc_pdu_init_as_rr_rsp(nskb, 1, llc->vR);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -755,18 +712,17 @@ free:
int llc_conn_ac_send_rr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -780,18 +736,17 @@ free:
int llc_conn_ac_send_ack_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -815,8 +770,8 @@ void llc_conn_set_p_flag(struct sock *sk, u8 value)
int llc_conn_ac_send_sabme_cmd_p_set_x(struct sock *sk, struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
struct llc_sap *sap = llc->sap;
@@ -824,12 +779,11 @@ int llc_conn_ac_send_sabme_cmd_p_set_x(struct sock *sk, struct sk_buff *skb)
if (llc->dev->flags & IFF_LOOPBACK)
dmac = llc->dev->dev_addr;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_sabme_cmd(nskb, 1);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, dmac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
llc_conn_set_p_flag(sk, 1);
@@ -845,11 +799,11 @@ int llc_conn_ac_send_ua_rsp_f_set_p(struct sock *sk, struct sk_buff *skb)
{
u8 f_bit;
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
llc_pdu_decode_pf_bit(skb, &f_bit);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
nskb->dev = llc->dev;
@@ -857,7 +811,7 @@ int llc_conn_ac_send_ua_rsp_f_set_p(struct sock *sk, struct sk_buff *skb)
llc->daddr.lsap, LLC_PDU_RSP);
llc_pdu_init_as_ua_rsp(nskb, f_bit);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -886,7 +840,7 @@ int llc_conn_ac_start_p_timer(struct sock *sk, struct sk_buff *skb)
llc_conn_set_p_flag(sk, 1);
mod_timer(&llc->pf_cycle_timer.timer,
- jiffies + llc->pf_cycle_timer.expire * HZ);
+ jiffies + llc->pf_cycle_timer.expire);
return 0;
}
@@ -957,7 +911,7 @@ static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk,
llc->daddr.lsap, LLC_PDU_RSP);
llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR);
rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
- if (!rc) {
+ if (likely(!rc)) {
llc_conn_send_pdu(sk, skb);
llc_conn_ac_inc_vs_by_1(sk, skb);
}
@@ -1001,18 +955,17 @@ static int llc_conn_ac_send_rr_rsp_f_set_ackpf(struct sock *sk,
struct sk_buff *skb)
{
int rc = -ENOBUFS;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev);
if (nskb) {
- struct llc_sock *llc = llc_sk(sk);
struct llc_sap *sap = llc->sap;
- nskb->dev = llc->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
llc->daddr.lsap, LLC_PDU_RSP);
llc_pdu_init_as_rr_rsp(nskb, llc->ack_pf, llc->vR);
rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_conn_send_pdu(sk, nskb);
}
@@ -1165,7 +1118,7 @@ int llc_conn_ac_start_ack_timer(struct sock *sk, struct sk_buff *skb)
{
struct llc_sock *llc = llc_sk(sk);
- mod_timer(&llc->ack_timer.timer, jiffies + llc->ack_timer.expire * HZ);
+ mod_timer(&llc->ack_timer.timer, jiffies + llc->ack_timer.expire);
return 0;
}
@@ -1174,7 +1127,7 @@ int llc_conn_ac_start_rej_timer(struct sock *sk, struct sk_buff *skb)
struct llc_sock *llc = llc_sk(sk);
mod_timer(&llc->rej_sent_timer.timer,
- jiffies + llc->rej_sent_timer.expire * HZ);
+ jiffies + llc->rej_sent_timer.expire);
return 0;
}
@@ -1185,7 +1138,7 @@ int llc_conn_ac_start_ack_tmr_if_not_running(struct sock *sk,
if (!timer_pending(&llc->ack_timer.timer))
mod_timer(&llc->ack_timer.timer,
- jiffies + llc->ack_timer.expire * HZ);
+ jiffies + llc->ack_timer.expire);
return 0;
}
@@ -1233,7 +1186,7 @@ int llc_conn_ac_upd_nr_received(struct sock *sk, struct sk_buff *skb)
}
if (unacked)
mod_timer(&llc->ack_timer.timer,
- jiffies + llc->ack_timer.expire * HZ);
+ jiffies + llc->ack_timer.expire);
} else if (llc->failed_data_req) {
u8 f_bit;
@@ -1354,13 +1307,13 @@ int llc_conn_ac_set_vs_nr(struct sock *sk, struct sk_buff *skb)
return 0;
}
-int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb)
+static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb)
{
llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % 128;
return 0;
}
-void llc_conn_pf_cycle_tmr_cb(unsigned long timeout_data)
+static void llc_conn_tmr_common_cb(unsigned long timeout_data, u8 type)
{
struct sock *sk = (struct sock *)timeout_data;
struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC);
@@ -1369,59 +1322,31 @@ void llc_conn_pf_cycle_tmr_cb(unsigned long timeout_data)
if (skb) {
struct llc_conn_state_ev *ev = llc_conn_ev(skb);
- skb->sk = sk;
- ev->type = LLC_CONN_EV_TYPE_P_TMR;
+ skb_set_owner_r(skb, sk);
+ ev->type = type;
llc_process_tmr_ev(sk, skb);
}
bh_unlock_sock(sk);
}
-void llc_conn_busy_tmr_cb(unsigned long timeout_data)
+void llc_conn_pf_cycle_tmr_cb(unsigned long timeout_data)
{
- struct sock *sk = (struct sock *)timeout_data;
- struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC);
-
- bh_lock_sock(sk);
- if (skb) {
- struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+ llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_P_TMR);
+}
- skb->sk = sk;
- ev->type = LLC_CONN_EV_TYPE_BUSY_TMR;
- llc_process_tmr_ev(sk, skb);
- }
- bh_unlock_sock(sk);
+void llc_conn_busy_tmr_cb(unsigned long timeout_data)
+{
+ llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_BUSY_TMR);
}
void llc_conn_ack_tmr_cb(unsigned long timeout_data)
{
- struct sock* sk = (struct sock *)timeout_data;
- struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC);
-
- bh_lock_sock(sk);
- if (skb) {
- struct llc_conn_state_ev *ev = llc_conn_ev(skb);
-
- skb->sk = sk;
- ev->type = LLC_CONN_EV_TYPE_ACK_TMR;
- llc_process_tmr_ev(sk, skb);
- }
- bh_unlock_sock(sk);
+ llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_ACK_TMR);
}
void llc_conn_rej_tmr_cb(unsigned long timeout_data)
{
- struct sock *sk = (struct sock *)timeout_data;
- struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC);
-
- bh_lock_sock(sk);
- if (skb) {
- struct llc_conn_state_ev *ev = llc_conn_ev(skb);
-
- skb->sk = sk;
- ev->type = LLC_CONN_EV_TYPE_REJ_TMR;
- llc_process_tmr_ev(sk, skb);
- }
- bh_unlock_sock(sk);
+ llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_REJ_TMR);
}
int llc_conn_ac_rst_vs(struct sock *sk, struct sk_buff *skb)
diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c
index d5bdb53a348..c5deda24661 100644
--- a/net/llc/llc_c_ev.c
+++ b/net/llc/llc_c_ev.c
@@ -37,6 +37,7 @@
#include <net/llc_conn.h>
#include <net/llc_sap.h>
#include <net/sock.h>
+#include <net/llc_c_ac.h>
#include <net/llc_c_ev.h>
#include <net/llc_pdu.h>
@@ -46,8 +47,6 @@
#define dprintk(args...)
#endif
-extern u16 llc_circular_between(u8 a, u8 b, u8 c);
-
/**
* llc_util_ns_inside_rx_window - check if sequence number is in rx window
* @ns: sequence number of received pdu.
@@ -99,7 +98,7 @@ out:
int llc_conn_ev_conn_req(struct sock *sk, struct sk_buff *skb)
{
- struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
return ev->prim == LLC_CONN_PRIM &&
ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1;
@@ -107,7 +106,7 @@ int llc_conn_ev_conn_req(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_data_req(struct sock *sk, struct sk_buff *skb)
{
- struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
return ev->prim == LLC_DATA_PRIM &&
ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1;
@@ -115,7 +114,7 @@ int llc_conn_ev_data_req(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_disc_req(struct sock *sk, struct sk_buff *skb)
{
- struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
return ev->prim == LLC_DISC_PRIM &&
ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1;
@@ -123,7 +122,7 @@ int llc_conn_ev_disc_req(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rst_req(struct sock *sk, struct sk_buff *skb)
{
- struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
return ev->prim == LLC_RESET_PRIM &&
ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1;
@@ -131,7 +130,7 @@ int llc_conn_ev_rst_req(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_local_busy_detected(struct sock *sk, struct sk_buff *skb)
{
- struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
return ev->type == LLC_CONN_EV_TYPE_SIMPLE &&
ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_DETECTED ? 0 : 1;
@@ -139,7 +138,7 @@ int llc_conn_ev_local_busy_detected(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_local_busy_cleared(struct sock *sk, struct sk_buff *skb)
{
- struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
return ev->type == LLC_CONN_EV_TYPE_SIMPLE &&
ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_CLEARED ? 0 : 1;
@@ -152,7 +151,7 @@ int llc_conn_ev_rx_bad_pdu(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_disc_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) &&
LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC ? 0 : 1;
@@ -160,7 +159,7 @@ int llc_conn_ev_rx_disc_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_dm_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) &&
LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM ? 0 : 1;
@@ -168,7 +167,7 @@ int llc_conn_ev_rx_dm_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_frmr_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) &&
LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR ? 0 : 1;
@@ -176,7 +175,7 @@ int llc_conn_ev_rx_frmr_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_i_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return llc_conn_space(sk, skb) &&
LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
@@ -186,7 +185,7 @@ int llc_conn_ev_rx_i_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_i_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return llc_conn_space(sk, skb) &&
LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
@@ -197,9 +196,9 @@ int llc_conn_ev_rx_i_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns(struct sock *sk,
struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
- u8 vr = llc_sk(sk)->vR;
- u8 ns = LLC_I_GET_NS(pdu);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vr = llc_sk(sk)->vR;
+ const u8 ns = LLC_I_GET_NS(pdu);
return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
LLC_I_PF_IS_0(pdu) && ns != vr &&
@@ -209,9 +208,9 @@ int llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns(struct sock *sk,
int llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns(struct sock *sk,
struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
- u8 vr = llc_sk(sk)->vR;
- u8 ns = LLC_I_GET_NS(pdu);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vr = llc_sk(sk)->vR;
+ const u8 ns = LLC_I_GET_NS(pdu);
return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
LLC_I_PF_IS_1(pdu) && ns != vr &&
@@ -221,10 +220,11 @@ int llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns(struct sock *sk,
int llc_conn_ev_rx_i_cmd_pbit_set_x_inval_ns(struct sock *sk,
struct sk_buff *skb)
{
- struct llc_pdu_sn * pdu = llc_pdu_sn_hdr(skb);
- u8 vr = llc_sk(sk)->vR;
- u8 ns = LLC_I_GET_NS(pdu);
- u16 rc = LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr &&
+ const struct llc_pdu_sn * pdu = llc_pdu_sn_hdr(skb);
+ const u8 vr = llc_sk(sk)->vR;
+ const u8 ns = LLC_I_GET_NS(pdu);
+ const u16 rc = LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+ ns != vr &&
llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1;
if (!rc)
dprintk("%s: matched, state=%d, ns=%d, vr=%d\n",
@@ -234,7 +234,7 @@ int llc_conn_ev_rx_i_cmd_pbit_set_x_inval_ns(struct sock *sk,
int llc_conn_ev_rx_i_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return llc_conn_space(sk, skb) &&
LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
@@ -244,7 +244,7 @@ int llc_conn_ev_rx_i_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_i_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
LLC_I_PF_IS_1(pdu) &&
@@ -253,7 +253,7 @@ int llc_conn_ev_rx_i_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_i_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return llc_conn_space(sk, skb) &&
LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
@@ -263,9 +263,9 @@ int llc_conn_ev_rx_i_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns(struct sock *sk,
struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
- u8 vr = llc_sk(sk)->vR;
- u8 ns = LLC_I_GET_NS(pdu);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vr = llc_sk(sk)->vR;
+ const u8 ns = LLC_I_GET_NS(pdu);
return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
LLC_I_PF_IS_0(pdu) && ns != vr &&
@@ -275,9 +275,9 @@ int llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns(struct sock *sk,
int llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns(struct sock *sk,
struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
- u8 vr = llc_sk(sk)->vR;
- u8 ns = LLC_I_GET_NS(pdu);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vr = llc_sk(sk)->vR;
+ const u8 ns = LLC_I_GET_NS(pdu);
return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
LLC_I_PF_IS_1(pdu) && ns != vr &&
@@ -287,9 +287,9 @@ int llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns(struct sock *sk,
int llc_conn_ev_rx_i_rsp_fbit_set_x_unexpd_ns(struct sock *sk,
struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
- u8 vr = llc_sk(sk)->vR;
- u8 ns = LLC_I_GET_NS(pdu);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vr = llc_sk(sk)->vR;
+ const u8 ns = LLC_I_GET_NS(pdu);
return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr &&
!llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1;
@@ -298,10 +298,11 @@ int llc_conn_ev_rx_i_rsp_fbit_set_x_unexpd_ns(struct sock *sk,
int llc_conn_ev_rx_i_rsp_fbit_set_x_inval_ns(struct sock *sk,
struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
- u8 vr = llc_sk(sk)->vR;
- u8 ns = LLC_I_GET_NS(pdu);
- u16 rc = LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr &&
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vr = llc_sk(sk)->vR;
+ const u8 ns = LLC_I_GET_NS(pdu);
+ const u16 rc = LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+ ns != vr &&
llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1;
if (!rc)
dprintk("%s: matched, state=%d, ns=%d, vr=%d\n",
@@ -311,7 +312,7 @@ int llc_conn_ev_rx_i_rsp_fbit_set_x_inval_ns(struct sock *sk,
int llc_conn_ev_rx_rej_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
LLC_S_PF_IS_0(pdu) &&
@@ -320,7 +321,7 @@ int llc_conn_ev_rx_rej_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_rej_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
LLC_S_PF_IS_1(pdu) &&
@@ -329,7 +330,7 @@ int llc_conn_ev_rx_rej_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_rej_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
LLC_S_PF_IS_0(pdu) &&
@@ -338,7 +339,7 @@ int llc_conn_ev_rx_rej_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_rej_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
LLC_S_PF_IS_1(pdu) &&
@@ -347,7 +348,7 @@ int llc_conn_ev_rx_rej_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_rej_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1;
@@ -355,7 +356,7 @@ int llc_conn_ev_rx_rej_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_rnr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
LLC_S_PF_IS_0(pdu) &&
@@ -364,7 +365,7 @@ int llc_conn_ev_rx_rnr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_rnr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
LLC_S_PF_IS_1(pdu) &&
@@ -373,7 +374,7 @@ int llc_conn_ev_rx_rnr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_rnr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
LLC_S_PF_IS_0(pdu) &&
@@ -382,7 +383,7 @@ int llc_conn_ev_rx_rnr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_rnr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
LLC_S_PF_IS_1(pdu) &&
@@ -391,7 +392,7 @@ int llc_conn_ev_rx_rnr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_rr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
LLC_S_PF_IS_0(pdu) &&
@@ -400,7 +401,7 @@ int llc_conn_ev_rx_rr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_rr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
LLC_S_PF_IS_1(pdu) &&
@@ -409,7 +410,7 @@ int llc_conn_ev_rx_rr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_rr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return llc_conn_space(sk, skb) &&
LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
@@ -419,7 +420,7 @@ int llc_conn_ev_rx_rr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_rr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
return llc_conn_space(sk, skb) &&
LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
@@ -429,7 +430,7 @@ int llc_conn_ev_rx_rr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_sabme_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb)
{
- struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) &&
LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME ? 0 : 1;
@@ -446,7 +447,7 @@ int llc_conn_ev_rx_ua_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_xxx_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
{
u16 rc = 1;
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
if (LLC_PDU_IS_CMD(pdu)) {
if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) {
@@ -461,7 +462,7 @@ int llc_conn_ev_rx_xxx_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_rx_xxx_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb)
{
u16 rc = 1;
- struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
if (LLC_PDU_IS_CMD(pdu)) {
if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu))
@@ -477,32 +478,10 @@ int llc_conn_ev_rx_xxx_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb)
return rc;
}
-int llc_conn_ev_rx_xxx_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
-{
- u16 rc = 1;
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
-
- if (LLC_PDU_IS_RSP(pdu)) {
- if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) {
- if (LLC_I_PF_IS_1(pdu))
- rc = 0;
- } else if (LLC_PDU_TYPE_IS_U(pdu))
- switch (LLC_U_PDU_RSP(pdu)) {
- case LLC_2_PDU_RSP_UA:
- case LLC_2_PDU_RSP_DM:
- case LLC_2_PDU_RSP_FRMR:
- if (LLC_U_PF_IS_1(pdu))
- rc = 0;
- break;
- }
- }
- return rc;
-}
-
int llc_conn_ev_rx_xxx_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
{
u16 rc = 1;
- struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
if (LLC_PDU_IS_RSP(pdu)) {
if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu))
@@ -524,9 +503,9 @@ int llc_conn_ev_rx_zzz_cmd_pbit_set_x_inval_nr(struct sock *sk,
struct sk_buff *skb)
{
u16 rc = 1;
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
- u8 vs = llc_sk(sk)->vS;
- u8 nr = LLC_I_GET_NR(pdu);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vs = llc_sk(sk)->vS;
+ const u8 nr = LLC_I_GET_NR(pdu);
if (LLC_PDU_IS_CMD(pdu) &&
(LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) &&
@@ -542,9 +521,9 @@ int llc_conn_ev_rx_zzz_rsp_fbit_set_x_inval_nr(struct sock *sk,
struct sk_buff *skb)
{
u16 rc = 1;
- struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
- u8 vs = llc_sk(sk)->vS;
- u8 nr = LLC_I_GET_NR(pdu);
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vs = llc_sk(sk)->vS;
+ const u8 nr = LLC_I_GET_NR(pdu);
if (LLC_PDU_IS_RSP(pdu) &&
(LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) &&
@@ -563,28 +542,28 @@ int llc_conn_ev_rx_any_frame(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_p_tmr_exp(struct sock *sk, struct sk_buff *skb)
{
- struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
return ev->type != LLC_CONN_EV_TYPE_P_TMR;
}
int llc_conn_ev_ack_tmr_exp(struct sock *sk, struct sk_buff *skb)
{
- struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
return ev->type != LLC_CONN_EV_TYPE_ACK_TMR;
}
int llc_conn_ev_rej_tmr_exp(struct sock *sk, struct sk_buff *skb)
{
- struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
return ev->type != LLC_CONN_EV_TYPE_REJ_TMR;
}
int llc_conn_ev_busy_tmr_exp(struct sock *sk, struct sk_buff *skb)
{
- struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
return ev->type != LLC_CONN_EV_TYPE_BUSY_TMR;
}
@@ -596,7 +575,7 @@ int llc_conn_ev_init_p_f_cycle(struct sock *sk, struct sk_buff *skb)
int llc_conn_ev_tx_buffer_full(struct sock *sk, struct sk_buff *skb)
{
- struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
return ev->type == LLC_CONN_EV_TYPE_SIMPLE &&
ev->prim_type == LLC_CONN_EV_TX_BUFF_FULL ? 0 : 1;
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index eba812a9c69..c761c15da42 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -16,7 +16,7 @@
#include <net/llc_sap.h>
#include <net/llc_conn.h>
#include <net/sock.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
#include <net/llc_c_ev.h>
#include <net/llc_c_ac.h>
#include <net/llc_c_st.h>
@@ -40,6 +40,11 @@ static struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk,
/* Offset table on connection states transition diagram */
static int llc_offset_table[NBR_CONN_STATES][NBR_CONN_EV];
+int sysctl_llc2_ack_timeout = LLC2_ACK_TIME * HZ;
+int sysctl_llc2_p_timeout = LLC2_P_TIME * HZ;
+int sysctl_llc2_rej_timeout = LLC2_REJ_TIME * HZ;
+int sysctl_llc2_busy_timeout = LLC2_BUSY_TIME * HZ;
+
/**
* llc_conn_state_process - sends event to connection state machine
* @sk: connection
@@ -53,7 +58,7 @@ static int llc_offset_table[NBR_CONN_STATES][NBR_CONN_EV];
int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
{
int rc;
- struct llc_sock *llc = llc_sk(sk);
+ struct llc_sock *llc = llc_sk(skb->sk);
struct llc_conn_state_ev *ev = llc_conn_ev(skb);
/*
@@ -63,26 +68,33 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
*/
skb_get(skb);
ev->ind_prim = ev->cfm_prim = 0;
- rc = llc_conn_service(sk, skb); /* sending event to state machine */
- if (rc) {
+ /*
+ * Send event to state machine
+ */
+ rc = llc_conn_service(skb->sk, skb);
+ if (unlikely(rc != 0)) {
printk(KERN_ERR "%s: llc_conn_service failed\n", __FUNCTION__);
goto out_kfree_skb;
}
- if (!ev->ind_prim && !ev->cfm_prim) {
+ if (unlikely(!ev->ind_prim && !ev->cfm_prim)) {
/* indicate or confirm not required */
- if (!skb->list)
+ /* XXX this is not very pretty, perhaps we should store
+ * XXX indicate/confirm-needed state in the llc_conn_state_ev
+ * XXX control block of the SKB instead? -DaveM
+ */
+ if (!skb->next)
goto out_kfree_skb;
goto out_skb_put;
}
- if (ev->ind_prim && ev->cfm_prim) /* Paranoia */
+ if (unlikely(ev->ind_prim && ev->cfm_prim)) /* Paranoia */
skb_get(skb);
switch (ev->ind_prim) {
case LLC_DATA_PRIM:
- llc_save_primitive(skb, LLC_DATA_PRIM);
- if (sock_queue_rcv_skb(sk, skb)) {
+ llc_save_primitive(sk, skb, LLC_DATA_PRIM);
+ if (unlikely(sock_queue_rcv_skb(sk, skb))) {
/*
* shouldn't happen
*/
@@ -91,13 +103,14 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
kfree_skb(skb);
}
break;
- case LLC_CONN_PRIM: {
- struct sock *parent = skb->sk;
-
- skb->sk = sk;
- skb_queue_tail(&parent->sk_receive_queue, skb);
- sk->sk_state_change(parent);
- }
+ case LLC_CONN_PRIM:
+ /*
+ * Can't be sock_queue_rcv_skb, because we have to leave the
+ * skb->sk pointing to the newly created struct sock in
+ * llc_conn_handler. -acme
+ */
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_state_change(sk);
break;
case LLC_DISC_PRIM:
sock_hold(sk);
@@ -107,8 +120,8 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
sk->sk_socket->state = SS_UNCONNECTED;
sk->sk_state = TCP_CLOSE;
if (!sock_flag(sk, SOCK_DEAD)) {
- sk->sk_state_change(sk);
sock_set_flag(sk, SOCK_DEAD);
+ sk->sk_state_change(sk);
}
}
kfree_skb(skb);
@@ -461,7 +474,7 @@ static int llc_exec_conn_trans_actions(struct sock *sk,
}
/**
- * llc_lookup_established - Finds connection for the remote/local sap/mac
+ * __llc_lookup_established - Finds connection for the remote/local sap/mac
* @sap: SAP
* @daddr: address of remote LLC (MAC + SAP)
* @laddr: address of local LLC (MAC + SAP)
@@ -469,14 +482,16 @@ static int llc_exec_conn_trans_actions(struct sock *sk,
* Search connection list of the SAP and finds connection using the remote
* mac, remote sap, local mac, and local sap. Returns pointer for
* connection found, %NULL otherwise.
+ * Caller has to make sure local_bh is disabled.
*/
-struct sock *llc_lookup_established(struct llc_sap *sap, struct llc_addr *daddr,
- struct llc_addr *laddr)
+static struct sock *__llc_lookup_established(struct llc_sap *sap,
+ struct llc_addr *daddr,
+ struct llc_addr *laddr)
{
struct sock *rc;
struct hlist_node *node;
- read_lock_bh(&sap->sk_list.lock);
+ read_lock(&sap->sk_list.lock);
sk_for_each(rc, node, &sap->sk_list.list) {
struct llc_sock *llc = llc_sk(rc);
@@ -490,10 +505,22 @@ struct sock *llc_lookup_established(struct llc_sap *sap, struct llc_addr *daddr,
}
rc = NULL;
found:
- read_unlock_bh(&sap->sk_list.lock);
+ read_unlock(&sap->sk_list.lock);
return rc;
}
+struct sock *llc_lookup_established(struct llc_sap *sap,
+ struct llc_addr *daddr,
+ struct llc_addr *laddr)
+{
+ struct sock *sk;
+
+ local_bh_disable();
+ sk = __llc_lookup_established(sap, daddr, laddr);
+ local_bh_enable();
+ return sk;
+}
+
/**
* llc_lookup_listener - Finds listener for local MAC + SAP
* @sap: SAP
@@ -502,6 +529,7 @@ found:
* Search connection list of the SAP and finds connection listening on
* local mac, and local sap. Returns pointer for parent socket found,
* %NULL otherwise.
+ * Caller has to make sure local_bh is disabled.
*/
static struct sock *llc_lookup_listener(struct llc_sap *sap,
struct llc_addr *laddr)
@@ -509,7 +537,7 @@ static struct sock *llc_lookup_listener(struct llc_sap *sap,
struct sock *rc;
struct hlist_node *node;
- read_lock_bh(&sap->sk_list.lock);
+ read_lock(&sap->sk_list.lock);
sk_for_each(rc, node, &sap->sk_list.list) {
struct llc_sock *llc = llc_sk(rc);
@@ -523,10 +551,19 @@ static struct sock *llc_lookup_listener(struct llc_sap *sap,
}
rc = NULL;
found:
- read_unlock_bh(&sap->sk_list.lock);
+ read_unlock(&sap->sk_list.lock);
return rc;
}
+static struct sock *__llc_lookup(struct llc_sap *sap,
+ struct llc_addr *daddr,
+ struct llc_addr *laddr)
+{
+ struct sock *sk = __llc_lookup_established(sap, daddr, laddr);
+
+ return sk ? : llc_lookup_listener(sap, laddr);
+}
+
/**
* llc_data_accept_state - designates if in this state data can be sent.
* @state: state of connection.
@@ -540,14 +577,14 @@ u8 llc_data_accept_state(u8 state)
}
/**
- * find_next_offset - finds offset for next category of transitions
+ * llc_find_next_offset - finds offset for next category of transitions
* @state: state table.
* @offset: start offset.
*
* Finds offset of next category of transitions in transition table.
* Returns the start index of next category.
*/
-static u16 find_next_offset(struct llc_conn_state *state, u16 offset)
+static u16 __init llc_find_next_offset(struct llc_conn_state *state, u16 offset)
{
u16 cnt = 0;
struct llc_conn_state_trans **next_trans;
@@ -574,8 +611,8 @@ void __init llc_build_offset_table(void)
next_offset = 0;
for (ev_type = 0; ev_type < NBR_CONN_EV; ev_type++) {
llc_offset_table[state][ev_type] = next_offset;
- next_offset += find_next_offset(curr_state,
- next_offset) + 1;
+ next_offset += llc_find_next_offset(curr_state,
+ next_offset) + 1;
}
}
}
@@ -619,6 +656,7 @@ static int llc_find_offset(int state, int ev_type)
*/
void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk)
{
+ llc_sap_hold(sap);
write_lock_bh(&sap->sk_list.lock);
llc_sk(sk)->sap = sap;
sk_add_node(sk, &sap->sk_list.list);
@@ -638,6 +676,7 @@ void llc_sap_remove_socket(struct llc_sap *sap, struct sock *sk)
write_lock_bh(&sap->sk_list.lock);
sk_del_node_init(sk);
write_unlock_bh(&sap->sk_list.lock);
+ llc_sap_put(sap);
}
/**
@@ -650,15 +689,34 @@ void llc_sap_remove_socket(struct llc_sap *sap, struct sock *sk)
static int llc_conn_rcv(struct sock* sk, struct sk_buff *skb)
{
struct llc_conn_state_ev *ev = llc_conn_ev(skb);
- struct llc_sock *llc = llc_sk(sk);
- if (!llc->dev)
- llc->dev = skb->dev;
ev->type = LLC_CONN_EV_TYPE_PDU;
ev->reason = 0;
return llc_conn_state_process(sk, skb);
}
+static struct sock *llc_create_incoming_sock(struct sock *sk,
+ struct net_device *dev,
+ struct llc_addr *saddr,
+ struct llc_addr *daddr)
+{
+ struct sock *newsk = llc_sk_alloc(sk->sk_family, GFP_ATOMIC,
+ sk->sk_prot);
+ struct llc_sock *newllc, *llc = llc_sk(sk);
+
+ if (!newsk)
+ goto out;
+ newllc = llc_sk(newsk);
+ memcpy(&newllc->laddr, daddr, sizeof(newllc->laddr));
+ memcpy(&newllc->daddr, saddr, sizeof(newllc->daddr));
+ newllc->dev = dev;
+ dev_hold(dev);
+ llc_sap_add_socket(llc->sap, newsk);
+ llc_sap_hold(llc->sap);
+out:
+ return newsk;
+}
+
void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb)
{
struct llc_addr saddr, daddr;
@@ -669,35 +727,35 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb)
llc_pdu_decode_da(skb, daddr.mac);
llc_pdu_decode_dsap(skb, &daddr.lsap);
- sk = llc_lookup_established(sap, &saddr, &daddr);
- if (!sk) {
+ sk = __llc_lookup(sap, &saddr, &daddr);
+ if (!sk)
+ goto drop;
+
+ bh_lock_sock(sk);
+ /*
+ * This has to be done here and not at the upper layer ->accept
+ * method because of the way the PROCOM state machine works:
+ * it needs to set several state variables (see, for instance,
+ * llc_adm_actions_2 in net/llc/llc_c_st.c) and send a packet to
+ * the originator of the new connection, and this state has to be
+ * in the newly created struct sock private area. -acme
+ */
+ if (unlikely(sk->sk_state == TCP_LISTEN)) {
+ struct sock *newsk = llc_create_incoming_sock(sk, skb->dev,
+ &saddr, &daddr);
+ if (!newsk)
+ goto drop_unlock;
+ skb_set_owner_r(skb, newsk);
+ } else {
/*
- * Didn't find an active connection; verify if there
- * is a listening socket for this llc addr
+ * Can't be skb_set_owner_r, this will be done at the
+ * llc_conn_state_process function, later on, when we will use
+ * sock_queue_rcv_skb to send it to upper layers, this is
+ * another trick required to cope with how the PROCOM state
+ * machine works. -acme
*/
- struct llc_sock *llc;
- struct sock *parent = llc_lookup_listener(sap, &daddr);
-
- if (!parent) {
- dprintk("llc_lookup_listener failed!\n");
- goto drop;
- }
-
- sk = llc_sk_alloc(parent->sk_family, GFP_ATOMIC, parent->sk_prot);
- if (!sk) {
- sock_put(parent);
- goto drop;
- }
- llc = llc_sk(sk);
- memcpy(&llc->laddr, &daddr, sizeof(llc->laddr));
- memcpy(&llc->daddr, &saddr, sizeof(llc->daddr));
- llc_sap_add_socket(sap, sk);
- sock_hold(sk);
- sock_put(parent);
- skb->sk = parent;
- } else
skb->sk = sk;
- bh_lock_sock(sk);
+ }
if (!sock_owned_by_user(sk))
llc_conn_rcv(sk, skb);
else {
@@ -705,11 +763,16 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb)
llc_set_backlog_type(skb, LLC_PACKET);
sk_add_backlog(sk, skb);
}
+out:
bh_unlock_sock(sk);
sock_put(sk);
return;
drop:
kfree_skb(skb);
+ return;
+drop_unlock:
+ kfree_skb(skb);
+ goto out;
}
#undef LLC_REFCNT_DEBUG
@@ -718,32 +781,6 @@ static atomic_t llc_sock_nr;
#endif
/**
- * llc_release_sockets - releases all sockets in a sap
- * @sap: sap to release its sockets
- *
- * Releases all connections of a sap. Returns 0 if all actions complete
- * successfully, nonzero otherwise
- */
-int llc_release_sockets(struct llc_sap *sap)
-{
- int rc = 0;
- struct sock *sk;
- struct hlist_node *node;
-
- write_lock_bh(&sap->sk_list.lock);
-
- sk_for_each(sk, node, &sap->sk_list.list) {
- llc_sk(sk)->state = LLC_CONN_STATE_TEMP;
-
- if (llc_send_disc(sk))
- rc = 1;
- }
-
- write_unlock_bh(&sap->sk_list.lock);
- return rc;
-}
-
-/**
* llc_backlog_rcv - Processes rx frames and expired timers.
* @sk: LLC sock (p8022 connection)
* @skb: queued rx frame or event
@@ -758,14 +795,14 @@ static int llc_backlog_rcv(struct sock *sk, struct sk_buff *skb)
int rc = 0;
struct llc_sock *llc = llc_sk(sk);
- if (llc_backlog_type(skb) == LLC_PACKET) {
- if (llc->state > 1) /* not closed */
+ if (likely(llc_backlog_type(skb) == LLC_PACKET)) {
+ if (likely(llc->state > 1)) /* not closed */
rc = llc_conn_rcv(sk, skb);
else
goto out_kfree_skb;
} else if (llc_backlog_type(skb) == LLC_EVENT) {
/* timer expiration event */
- if (llc->state > 1) /* not closed */
+ if (likely(llc->state > 1)) /* not closed */
rc = llc_conn_state_process(sk, skb);
else
goto out_kfree_skb;
@@ -795,22 +832,22 @@ static void llc_sk_init(struct sock* sk)
llc->dec_step = llc->connect_step = 1;
init_timer(&llc->ack_timer.timer);
- llc->ack_timer.expire = LLC_ACK_TIME;
+ llc->ack_timer.expire = sysctl_llc2_ack_timeout;
llc->ack_timer.timer.data = (unsigned long)sk;
llc->ack_timer.timer.function = llc_conn_ack_tmr_cb;
init_timer(&llc->pf_cycle_timer.timer);
- llc->pf_cycle_timer.expire = LLC_P_TIME;
+ llc->pf_cycle_timer.expire = sysctl_llc2_p_timeout;
llc->pf_cycle_timer.timer.data = (unsigned long)sk;
llc->pf_cycle_timer.timer.function = llc_conn_pf_cycle_tmr_cb;
init_timer(&llc->rej_sent_timer.timer);
- llc->rej_sent_timer.expire = LLC_REJ_TIME;
+ llc->rej_sent_timer.expire = sysctl_llc2_rej_timeout;
llc->rej_sent_timer.timer.data = (unsigned long)sk;
llc->rej_sent_timer.timer.function = llc_conn_rej_tmr_cb;
init_timer(&llc->busy_state_timer.timer);
- llc->busy_state_timer.expire = LLC_BUSY_TIME;
+ llc->busy_state_timer.expire = sysctl_llc2_busy_timeout;
llc->busy_state_timer.timer.data = (unsigned long)sk;
llc->busy_state_timer.timer.function = llc_conn_busy_tmr_cb;
@@ -830,7 +867,7 @@ static void llc_sk_init(struct sock* sk)
* Allocates a LLC sock and initializes it. Returns the new LLC sock
* or %NULL if there's no memory available for one
*/
-struct sock *llc_sk_alloc(int family, int priority, struct proto *prot)
+struct sock *llc_sk_alloc(int family, gfp_t priority, struct proto *prot)
{
struct sock *sk = sk_alloc(family, priority, prot, 1);
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
index 5ff02c080a0..ab0fcd32fd8 100644
--- a/net/llc/llc_core.c
+++ b/net/llc/llc_core.c
@@ -40,6 +40,7 @@ static struct llc_sap *llc_sap_alloc(void)
sap->state = LLC_SAP_STATE_ACTIVE;
memcpy(sap->laddr.mac, llc_station_mac_sa, ETH_ALEN);
rwlock_init(&sap->sk_list.lock);
+ atomic_set(&sap->refcnt, 1);
}
return sap;
}
@@ -52,9 +53,7 @@ static struct llc_sap *llc_sap_alloc(void)
*/
static void llc_add_sap(struct llc_sap *sap)
{
- write_lock_bh(&llc_sap_list_lock);
list_add_tail(&sap->node, &llc_sap_list);
- write_unlock_bh(&llc_sap_list_lock);
}
/**
@@ -70,11 +69,25 @@ static void llc_del_sap(struct llc_sap *sap)
write_unlock_bh(&llc_sap_list_lock);
}
+static struct llc_sap *__llc_sap_find(unsigned char sap_value)
+{
+ struct llc_sap* sap;
+
+ list_for_each_entry(sap, &llc_sap_list, node)
+ if (sap->laddr.lsap == sap_value)
+ goto out;
+ sap = NULL;
+out:
+ return sap;
+}
+
/**
* llc_sap_find - searchs a SAP in station
* @sap_value: sap to be found
*
* Searchs for a sap in the sap list of the LLC's station upon the sap ID.
+ * If the sap is found it will be refcounted and the user will have to do
+ * a llc_sap_put after use.
* Returns the sap or %NULL if not found.
*/
struct llc_sap *llc_sap_find(unsigned char sap_value)
@@ -82,11 +95,9 @@ struct llc_sap *llc_sap_find(unsigned char sap_value)
struct llc_sap* sap;
read_lock_bh(&llc_sap_list_lock);
- list_for_each_entry(sap, &llc_sap_list, node)
- if (sap->laddr.lsap == sap_value)
- goto out;
- sap = NULL;
-out:
+ sap = __llc_sap_find(sap_value);
+ if (sap)
+ llc_sap_hold(sap);
read_unlock_bh(&llc_sap_list_lock);
return sap;
}
@@ -103,21 +114,23 @@ out:
struct llc_sap *llc_sap_open(unsigned char lsap,
int (*func)(struct sk_buff *skb,
struct net_device *dev,
- struct packet_type *pt))
+ struct packet_type *pt,
+ struct net_device *orig_dev))
{
- struct llc_sap *sap = llc_sap_find(lsap);
+ struct llc_sap *sap = NULL;
- if (sap) { /* SAP already exists */
- sap = NULL;
+ write_lock_bh(&llc_sap_list_lock);
+ if (__llc_sap_find(lsap)) /* SAP already exists */
goto out;
- }
sap = llc_sap_alloc();
if (!sap)
goto out;
sap->laddr.lsap = lsap;
sap->rcv_func = func;
+ llc_sap_hold(sap);
llc_add_sap(sap);
out:
+ write_unlock_bh(&llc_sap_list_lock);
return sap;
}
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c
index 0f9fc48aeaf..ba90f7f0801 100644
--- a/net/llc/llc_if.c
+++ b/net/llc/llc_if.c
@@ -15,7 +15,6 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
-#include <linux/tcp.h>
#include <asm/errno.h>
#include <net/llc_if.h>
#include <net/llc_sap.h>
@@ -25,6 +24,7 @@
#include <net/llc_c_ev.h>
#include <net/llc_c_ac.h>
#include <net/llc_c_st.h>
+#include <net/tcp_states.h>
u8 llc_mac_null_var[IFHWADDRLEN];
@@ -47,14 +47,11 @@ int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb)
int rc = -ECONNABORTED;
struct llc_sock *llc = llc_sk(sk);
- if (llc->state == LLC_CONN_STATE_ADM)
+ if (unlikely(llc->state == LLC_CONN_STATE_ADM))
goto out;
rc = -EBUSY;
- if (llc_data_accept_state(llc->state)) { /* data_conn_refuse */
- llc->failed_data_req = 1;
- goto out;
- }
- if (llc->p_flag) {
+ if (unlikely(llc_data_accept_state(llc->state) || /* data_conn_refuse */
+ llc->p_flag)) {
llc->failed_data_req = 1;
goto out;
}
@@ -110,6 +107,7 @@ int llc_establish_connection(struct sock *sk, u8 *lmac, u8 *dmac, u8 dsap)
ev->type = LLC_CONN_EV_TYPE_PRIM;
ev->prim = LLC_CONN_PRIM;
ev->prim_type = LLC_PRIM_TYPE_REQ;
+ skb_set_owner_w(skb, sk);
rc = llc_conn_state_process(sk, skb);
}
out_put:
@@ -144,6 +142,7 @@ int llc_send_disc(struct sock *sk)
skb = alloc_skb(0, GFP_ATOMIC);
if (!skb)
goto out;
+ skb_set_owner_w(skb, sk);
sk->sk_state = TCP_CLOSING;
ev = llc_conn_ev(skb);
ev->type = LLC_CONN_EV_TYPE_PRIM;
diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c
index 4da6976efc9..8f3addf0724 100644
--- a/net/llc/llc_input.c
+++ b/net/llc/llc_input.c
@@ -99,15 +99,19 @@ out:
static inline int llc_fixup_skb(struct sk_buff *skb)
{
u8 llc_len = 2;
- struct llc_pdu_sn *pdu;
+ struct llc_pdu_un *pdu;
- if (!pskb_may_pull(skb, sizeof(*pdu)))
+ if (unlikely(!pskb_may_pull(skb, sizeof(*pdu))))
return 0;
- pdu = (struct llc_pdu_sn *)skb->data;
+ pdu = (struct llc_pdu_un *)skb->data;
if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) == LLC_PDU_TYPE_U)
llc_len = 1;
llc_len += 2;
+
+ if (unlikely(!pskb_may_pull(skb, llc_len)))
+ return 0;
+
skb->h.raw += llc_len;
skb_pull(skb, llc_len);
if (skb->protocol == htons(ETH_P_802_2)) {
@@ -132,7 +136,7 @@ static inline int llc_fixup_skb(struct sk_buff *skb)
* data now), it queues this frame in the connection's backlog.
*/
int llc_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+ struct packet_type *pt, struct net_device *orig_dev)
{
struct llc_sap *sap;
struct llc_pdu_sn *pdu;
@@ -165,18 +169,23 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev,
* LLC functionality
*/
if (sap->rcv_func) {
- sap->rcv_func(skb, dev, pt);
- goto out;
+ sap->rcv_func(skb, dev, pt, orig_dev);
+ goto out_put;
}
dest = llc_pdu_type(skb);
if (unlikely(!dest || !llc_type_handlers[dest - 1]))
- goto drop;
+ goto drop_put;
llc_type_handlers[dest - 1](sap, skb);
+out_put:
+ llc_sap_put(sap);
out:
return 0;
drop:
kfree_skb(skb);
goto out;
+drop_put:
+ kfree_skb(skb);
+ goto out_put;
handle_station:
if (!llc_station_handler)
goto drop;
diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c
index ab5784cf163..b4d55b6abb6 100644
--- a/net/llc/llc_output.c
+++ b/net/llc/llc_output.c
@@ -98,7 +98,7 @@ int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb,
dsap, LLC_PDU_CMD);
llc_pdu_init_as_ui_cmd(skb);
rc = llc_mac_hdr_init(skb, skb->dev->dev_addr, dmac);
- if (!rc)
+ if (likely(!rc))
rc = dev_queue_xmit(skb);
return rc;
}
diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c
index 36e8db3fa1a..bd531cb235a 100644
--- a/net/llc/llc_proc.c
+++ b/net/llc/llc_proc.c
@@ -134,7 +134,7 @@ static int llc_seq_socket_show(struct seq_file *seq, void *v)
llc_ui_format_mac(seq, llc->daddr.mac);
seq_printf(seq, "@%02X %8d %8d %2d %3d %4d\n", llc->daddr.lsap,
atomic_read(&sk->sk_wmem_alloc),
- atomic_read(&sk->sk_rmem_alloc),
+ atomic_read(&sk->sk_rmem_alloc) - llc->copied_seq,
sk->sk_state,
sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : -1,
llc->link);
diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c
index ed8ba7de612..bb3580fb8cf 100644
--- a/net/llc/llc_s_ac.c
+++ b/net/llc/llc_s_ac.c
@@ -58,7 +58,7 @@ int llc_sap_action_send_ui(struct llc_sap *sap, struct sk_buff *skb)
ev->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_ui_cmd(skb);
rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
- if (!rc)
+ if (likely(!rc))
rc = dev_queue_xmit(skb);
return rc;
}
@@ -81,7 +81,7 @@ int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb)
ev->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0);
rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
- if (!rc)
+ if (likely(!rc))
rc = dev_queue_xmit(skb);
return rc;
}
@@ -103,15 +103,14 @@ int llc_sap_action_send_xid_r(struct llc_sap *sap, struct sk_buff *skb)
llc_pdu_decode_sa(skb, mac_da);
llc_pdu_decode_da(skb, mac_sa);
llc_pdu_decode_ssap(skb, &dsap);
- nskb = llc_alloc_frame();
+ nskb = llc_alloc_frame(NULL, skb->dev);
if (!nskb)
goto out;
- nskb->dev = skb->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, dsap,
LLC_PDU_RSP);
llc_pdu_init_as_xid_rsp(nskb, LLC_XID_NULL_CLASS_2, 0);
rc = llc_mac_hdr_init(nskb, mac_sa, mac_da);
- if (!rc)
+ if (likely(!rc))
rc = dev_queue_xmit(nskb);
out:
return rc;
@@ -135,7 +134,7 @@ int llc_sap_action_send_test_c(struct llc_sap *sap, struct sk_buff *skb)
ev->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_test_cmd(skb);
rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
- if (!rc)
+ if (likely(!rc))
rc = dev_queue_xmit(skb);
return rc;
}
@@ -149,15 +148,14 @@ int llc_sap_action_send_test_r(struct llc_sap *sap, struct sk_buff *skb)
llc_pdu_decode_sa(skb, mac_da);
llc_pdu_decode_da(skb, mac_sa);
llc_pdu_decode_ssap(skb, &dsap);
- nskb = llc_alloc_frame();
+ nskb = llc_alloc_frame(NULL, skb->dev);
if (!nskb)
goto out;
- nskb->dev = skb->dev;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, dsap,
LLC_PDU_RSP);
llc_pdu_init_as_test_rsp(nskb, skb);
rc = llc_mac_hdr_init(nskb, mac_sa, mac_da);
- if (!rc)
+ if (likely(!rc))
rc = dev_queue_xmit(nskb);
out:
return rc;
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 965c94eb4bb..4029ceee9b9 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -21,16 +21,17 @@
#include <net/llc_s_ev.h>
#include <net/llc_s_st.h>
#include <net/sock.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
#include <linux/llc.h>
/**
* llc_alloc_frame - allocates sk_buff for frame
+ * @dev: network device this skb will be sent over
*
* Allocates an sk_buff for frame and initializes sk_buff fields.
* Returns allocated skb or %NULL when out of memory.
*/
-struct sk_buff *llc_alloc_frame(void)
+struct sk_buff *llc_alloc_frame(struct sock *sk, struct net_device *dev)
{
struct sk_buff *skb = alloc_skb(128, GFP_ATOMIC);
@@ -38,18 +39,23 @@ struct sk_buff *llc_alloc_frame(void)
skb_reserve(skb, 50);
skb->nh.raw = skb->h.raw = skb->data;
skb->protocol = htons(ETH_P_802_2);
- skb->dev = dev_base->next;
+ skb->dev = dev;
skb->mac.raw = skb->head;
+ if (sk != NULL)
+ skb_set_owner_w(skb, sk);
}
return skb;
}
-void llc_save_primitive(struct sk_buff* skb, u8 prim)
+void llc_save_primitive(struct sock *sk, struct sk_buff* skb, u8 prim)
{
- struct sockaddr_llc *addr = llc_ui_skb_cb(skb);
+ struct sockaddr_llc *addr;
+ if (skb->sk->sk_type == SOCK_STREAM) /* See UNIX98 */
+ return;
/* save primitive for use by the user. */
- addr->sllc_family = skb->sk->sk_family;
+ addr = llc_ui_skb_cb(skb);
+ addr->sllc_family = sk->sk_family;
addr->sllc_arphrd = skb->dev->type;
addr->sllc_test = prim == LLC_TEST_PRIM;
addr->sllc_xid = prim == LLC_XID_PRIM;
@@ -189,7 +195,7 @@ static void llc_sap_state_process(struct llc_sap *sap, struct sk_buff *skb)
if (skb->sk->sk_state == TCP_LISTEN)
kfree_skb(skb);
else {
- llc_save_primitive(skb, ev->prim);
+ llc_save_primitive(skb->sk, skb, ev->prim);
/* queue skb to the user. */
if (sock_queue_rcv_skb(skb->sk, skb))
@@ -308,7 +314,7 @@ void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb)
sk = llc_lookup_dgram(sap, &laddr);
if (sk) {
- skb->sk = sk;
+ skb_set_owner_r(skb, sk);
llc_sap_rcv(sap, skb);
sock_put(sk);
} else
diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c
index 8fe48a24bad..f37dbf8ef12 100644
--- a/net/llc/llc_station.c
+++ b/net/llc/llc_station.c
@@ -50,6 +50,10 @@ struct llc_station {
struct sk_buff_head mac_pdu_q;
};
+#define LLC_STATION_ACK_TIME (3 * HZ)
+
+int sysctl_llc_station_ack_timeout = LLC_STATION_ACK_TIME;
+
/* Types of events (possible values in 'ev->type') */
#define LLC_STATION_EV_TYPE_SIMPLE 1
#define LLC_STATION_EV_TYPE_CONDITION 2
@@ -218,7 +222,8 @@ static void llc_station_send_pdu(struct sk_buff *skb)
static int llc_station_ac_start_ack_timer(struct sk_buff *skb)
{
- mod_timer(&llc_main_station.ack_timer, jiffies + LLC_ACK_TIME * HZ);
+ mod_timer(&llc_main_station.ack_timer,
+ jiffies + sysctl_llc_station_ack_timeout);
return 0;
}
@@ -249,14 +254,14 @@ static int llc_station_ac_inc_xid_r_cnt_by_1(struct sk_buff *skb)
static int llc_station_ac_send_null_dsap_xid_c(struct sk_buff *skb)
{
int rc = 1;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct sk_buff *nskb = llc_alloc_frame(NULL, skb->dev);
if (!nskb)
goto out;
llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, 0, LLC_PDU_CMD);
llc_pdu_init_as_xid_cmd(nskb, LLC_XID_NULL_CLASS_2, 127);
rc = llc_mac_hdr_init(nskb, llc_station_mac_sa, llc_station_mac_sa);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_station_send_pdu(nskb);
out:
@@ -270,18 +275,17 @@ static int llc_station_ac_send_xid_r(struct sk_buff *skb)
{
u8 mac_da[ETH_ALEN], dsap;
int rc = 1;
- struct sk_buff* nskb = llc_alloc_frame();
+ struct sk_buff* nskb = llc_alloc_frame(NULL, skb->dev);
if (!nskb)
goto out;
rc = 0;
- nskb->dev = skb->dev;
llc_pdu_decode_sa(skb, mac_da);
llc_pdu_decode_ssap(skb, &dsap);
llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP);
llc_pdu_init_as_xid_rsp(nskb, LLC_XID_NULL_CLASS_2, 127);
rc = llc_mac_hdr_init(nskb, llc_station_mac_sa, mac_da);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_station_send_pdu(nskb);
out:
@@ -295,18 +299,17 @@ static int llc_station_ac_send_test_r(struct sk_buff *skb)
{
u8 mac_da[ETH_ALEN], dsap;
int rc = 1;
- struct sk_buff *nskb = llc_alloc_frame();
+ struct sk_buff *nskb = llc_alloc_frame(NULL, skb->dev);
if (!nskb)
goto out;
rc = 0;
- nskb->dev = skb->dev;
llc_pdu_decode_sa(skb, mac_da);
llc_pdu_decode_ssap(skb, &dsap);
llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP);
llc_pdu_init_as_test_rsp(nskb, skb);
rc = llc_mac_hdr_init(nskb, llc_station_mac_sa, mac_da);
- if (rc)
+ if (unlikely(rc))
goto free;
llc_station_send_pdu(nskb);
out:
@@ -689,7 +692,8 @@ int __init llc_station_init(void)
init_timer(&llc_main_station.ack_timer);
llc_main_station.ack_timer.data = (unsigned long)&llc_main_station;
llc_main_station.ack_timer.function = llc_station_ack_tmr_cb;
-
+ llc_main_station.ack_timer.expires = jiffies +
+ sysctl_llc_station_ack_timeout;
skb = alloc_skb(0, GFP_ATOMIC);
if (!skb)
goto out;
@@ -697,7 +701,6 @@ int __init llc_station_init(void)
llc_set_station_handler(llc_station_rcv);
ev = llc_station_ev(skb);
memset(ev, 0, sizeof(*ev));
- llc_main_station.ack_timer.expires = jiffies + 3 * HZ;
llc_main_station.maximum_retry = 1;
llc_main_station.state = LLC_STATION_STATE_DOWN;
ev->type = LLC_STATION_EV_TYPE_SIMPLE;
diff --git a/net/llc/sysctl_net_llc.c b/net/llc/sysctl_net_llc.c
new file mode 100644
index 00000000000..d1eaddb1363
--- /dev/null
+++ b/net/llc/sysctl_net_llc.c
@@ -0,0 +1,131 @@
+/*
+ * sysctl_net_llc.c: sysctl interface to LLC net subsystem.
+ *
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <net/llc.h>
+
+#ifndef CONFIG_SYSCTL
+#error This file should not be compiled without CONFIG_SYSCTL defined
+#endif
+
+static struct ctl_table llc2_timeout_table[] = {
+ {
+ .ctl_name = NET_LLC2_ACK_TIMEOUT,
+ .procname = "ack",
+ .data = &sysctl_llc2_ack_timeout,
+ .maxlen = sizeof(long),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ {
+ .ctl_name = NET_LLC2_BUSY_TIMEOUT,
+ .procname = "busy",
+ .data = &sysctl_llc2_busy_timeout,
+ .maxlen = sizeof(long),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ {
+ .ctl_name = NET_LLC2_P_TIMEOUT,
+ .procname = "p",
+ .data = &sysctl_llc2_p_timeout,
+ .maxlen = sizeof(long),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ {
+ .ctl_name = NET_LLC2_REJ_TIMEOUT,
+ .procname = "rej",
+ .data = &sysctl_llc2_rej_timeout,
+ .maxlen = sizeof(long),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ { 0 },
+};
+
+static struct ctl_table llc_station_table[] = {
+ {
+ .ctl_name = NET_LLC_STATION_ACK_TIMEOUT,
+ .procname = "ack_timeout",
+ .data = &sysctl_llc_station_ack_timeout,
+ .maxlen = sizeof(long),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+ { 0 },
+};
+
+static struct ctl_table llc2_dir_timeout_table[] = {
+ {
+ .ctl_name = NET_LLC2,
+ .procname = "timeout",
+ .mode = 0555,
+ .child = llc2_timeout_table,
+ },
+ { 0 },
+};
+
+static struct ctl_table llc_table[] = {
+ {
+ .ctl_name = NET_LLC2,
+ .procname = "llc2",
+ .mode = 0555,
+ .child = llc2_dir_timeout_table,
+ },
+ {
+ .ctl_name = NET_LLC_STATION,
+ .procname = "station",
+ .mode = 0555,
+ .child = llc_station_table,
+ },
+ { 0 },
+};
+
+static struct ctl_table llc_dir_table[] = {
+ {
+ .ctl_name = NET_LLC,
+ .procname = "llc",
+ .mode = 0555,
+ .child = llc_table,
+ },
+ { 0 },
+};
+
+static struct ctl_table llc_root_table[] = {
+ {
+ .ctl_name = CTL_NET,
+ .procname = "net",
+ .mode = 0555,
+ .child = llc_dir_table,
+ },
+ { 0 },
+};
+
+static struct ctl_table_header *llc_table_header;
+
+int __init llc_sysctl_init(void)
+{
+ llc_table_header = register_sysctl_table(llc_root_table, 1);
+
+ return llc_table_header ? 0 : -ENOMEM;
+}
+
+void llc_sysctl_exit(void)
+{
+ if (llc_table_header) {
+ unregister_sysctl_table(llc_table_header);
+ llc_table_header = NULL;
+ }
+}
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
new file mode 100644
index 00000000000..8296b38bf27
--- /dev/null
+++ b/net/netfilter/Kconfig
@@ -0,0 +1,24 @@
+config NETFILTER_NETLINK
+ tristate "Netfilter netlink interface"
+ help
+ If this option is enabled, the kernel will include support
+ for the new netfilter netlink interface.
+
+config NETFILTER_NETLINK_QUEUE
+ tristate "Netfilter NFQUEUE over NFNETLINK interface"
+ depends on NETFILTER_NETLINK
+ help
+ If this option is enabled, the kernel will include support
+ for queueing packets via NFNETLINK.
+
+config NETFILTER_NETLINK_LOG
+ tristate "Netfilter LOG over NFNETLINK interface"
+ depends on NETFILTER_NETLINK
+ help
+ If this option is enabled, the kernel will include support
+ for logging packets via NFNETLINK.
+
+ This obsoletes the existing ipt_ULOG and ebt_ulog mechanisms,
+ and is also scheduled to replace the old syslog-based ipt_LOG
+ and ip6t_LOG modules.
+
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
new file mode 100644
index 00000000000..b3b44f8b415
--- /dev/null
+++ b/net/netfilter/Makefile
@@ -0,0 +1,7 @@
+netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
+
+obj-$(CONFIG_NETFILTER) = netfilter.o
+
+obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
+obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
+obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
new file mode 100644
index 00000000000..1ceb1a6c254
--- /dev/null
+++ b/net/netfilter/core.c
@@ -0,0 +1,216 @@
+/* netfilter.c: look after the filters for various protocols.
+ * Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
+ *
+ * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
+ * way.
+ *
+ * Rusty Russell (C)2000 -- This code is GPL.
+ *
+ * February 2000: Modified by James Morris to have 1 queue per protocol.
+ * 15-Mar-2000: Added NF_REPEAT --RR.
+ * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik.
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/wait.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/proc_fs.h>
+#include <net/sock.h>
+
+#include "nf_internals.h"
+
+/* In this code, we can be waiting indefinitely for userspace to
+ * service a packet if a hook returns NF_QUEUE. We could keep a count
+ * of skbuffs queued for userspace, and not deregister a hook unless
+ * this is zero, but that sucks. Now, we simply check when the
+ * packets come back: if the hook is gone, the packet is discarded. */
+struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
+EXPORT_SYMBOL(nf_hooks);
+static DEFINE_SPINLOCK(nf_hook_lock);
+
+/* Insert @reg into the hook list selected by reg->pf and reg->hooknum,
+ * in front of the first entry whose priority value is strictly greater
+ * than reg->priority.  Always returns 0. */
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+	struct list_head *head = &nf_hooks[reg->pf][reg->hooknum];
+	struct list_head *pos;
+
+	spin_lock_bh(&nf_hook_lock);
+	for (pos = head->next; pos != head; pos = pos->next) {
+		const struct nf_hook_ops *cur = (const struct nf_hook_ops *)pos;
+
+		if (cur->priority > reg->priority)
+			break;
+	}
+	/* pos is either that entry or the list head: add just before it. */
+	list_add_rcu(&reg->list, pos->prev);
+	spin_unlock_bh(&nf_hook_lock);
+
+	synchronize_net();
+	return 0;
+}
+EXPORT_SYMBOL(nf_register_hook);
+
+/* Unlink @reg from its hook list under nf_hook_lock, then call
+ * synchronize_net() before returning to the caller. */
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+ spin_lock_bh(&nf_hook_lock);
+ list_del_rcu(&reg->list);
+ spin_unlock_bh(&nf_hook_lock);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(nf_unregister_hook);
+
+/*
+ * Run the hooks on @head that come after the entry *@i, skipping every
+ * hook whose priority is below @hook_thresh.
+ *
+ * Returns the first verdict that is neither NF_ACCEPT nor NF_REPEAT, with
+ * *@i left on the entry that produced it, or NF_ACCEPT once the end of the
+ * list has been reached.
+ */
+unsigned int nf_iterate(struct list_head *head,
+ struct sk_buff **skb,
+ int hook,
+ const struct net_device *indev,
+ const struct net_device *outdev,
+ struct list_head **i,
+ int (*okfn)(struct sk_buff *),
+ int hook_thresh)
+{
+ unsigned int verdict;
+
+ /*
+ * The caller must not block between calls to this
+ * function because of risk of continuing from deleted element.
+ */
+ list_for_each_continue_rcu(*i, head) {
+ struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
+
+ if (hook_thresh > elem->priority)
+ continue;
+
+ /* Optimization: we don't need to hold module
+ reference here, since function can't sleep. --RR */
+ verdict = elem->hook(hook, skb, indev, outdev, okfn);
+ if (verdict != NF_ACCEPT) {
+#ifdef CONFIG_NETFILTER_DEBUG
+ /* Out-of-range verdict: log it and move on to the next hook. */
+ if (unlikely((verdict & NF_VERDICT_MASK)
+ > NF_MAX_VERDICT)) {
+ NFDEBUG("Evil return from %p(%u).\n",
+ elem->hook, hook);
+ continue;
+ }
+#endif
+ if (verdict != NF_REPEAT)
+ return verdict;
+ /* NF_REPEAT: step the cursor back so the loop's advance
+ * lands on this same hook again. */
+ *i = (*i)->prev;
+ }
+ }
+ return NF_ACCEPT;
+}
+
+
+/* Run the hooks registered for (@pf, @hook) on *@pskb.
+ *
+ * Returns 1 if okfn() needs to be executed by the caller,
+ * -EPERM for NF_DROP, 0 otherwise. */
+int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
+ struct net_device *indev,
+ struct net_device *outdev,
+ int (*okfn)(struct sk_buff *),
+ int hook_thresh)
+{
+ struct list_head *elem;
+ unsigned int verdict;
+ int ret = 0;
+
+ /* We may already have this, but read-locks nest anyway */
+ rcu_read_lock();
+
+ /* Start the cursor on the list head so iteration begins at the
+ * first hook. */
+ elem = &nf_hooks[pf][hook];
+next_hook:
+ verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
+ outdev, &elem, okfn, hook_thresh);
+ if (verdict == NF_ACCEPT || verdict == NF_STOP) {
+ ret = 1;
+ goto unlock;
+ } else if (verdict == NF_DROP) {
+ kfree_skb(*pskb);
+ ret = -EPERM;
+ } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
+ NFDEBUG("nf_hook: Verdict = QUEUE.\n");
+ /* The bits above NF_VERDICT_BITS carry the queue number.
+ * A zero return from nf_queue() means the packet was not
+ * queued, so carry on with the hooks after elem. */
+ if (!nf_queue(pskb, elem, pf, hook, indev, outdev, okfn,
+ verdict >> NF_VERDICT_BITS))
+ goto next_hook;
+ }
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(nf_hook_slow);
+
+
+/* Make the first @writable_len bytes of *@pskb safe to modify.
+ *
+ * Returns 0 if the packet is shorter than @writable_len or the private
+ * copy cannot be allocated.  For an skb that is neither shared nor
+ * cloned the result of pskb_may_pull() is returned.  Otherwise *@pskb is
+ * replaced by a private copy and 1 is returned. */
+int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len)
+{
+	struct sk_buff *orig = *pskb;
+	struct sk_buff *copy;
+
+	if (orig->len < writable_len)
+		return 0;
+
+	/* Exclusive use of the packet: no copy needed. */
+	if (!skb_shared(orig) && !skb_cloned(orig))
+		return pskb_may_pull(orig, writable_len);
+
+	copy = skb_copy(orig, GFP_ATOMIC);
+	if (!copy)
+		return 0;
+	BUG_ON(skb_is_nonlinear(copy));
+
+	/* Rest of kernel will get very unhappy if we pass it a
+	   suddenly-orphaned skbuff */
+	if (orig->sk)
+		skb_set_owner_w(copy, orig->sk);
+	kfree_skb(orig);
+	*pskb = copy;
+	return 1;
+}
+EXPORT_SYMBOL(skb_make_writable);
+
+
+/* This does not belong here, but locally generated errors need it if connection
+ tracking in use: without this, connection may not be in hash table, and hence
+ manufactured ICMP or RST packets will not be associated with it. */
+void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
+EXPORT_SYMBOL(ip_ct_attach);
+
+/* If @skb carries conntrack state (skb->nfct) and an ip_ct_attach
+ * callback is installed, call it with (@new, @skb). */
+void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
+{
+	void (*attach)(struct sk_buff *, struct sk_buff *);
+
+	if (!skb->nfct)
+		return;
+
+	/* Read the pointer once; it is only looked at when nfct is set. */
+	attach = ip_ct_attach;
+	if (attach == NULL)
+		return;
+
+	mb(); /* Just to be sure: must be read before executing this */
+	attach(new, skb);
+}
+EXPORT_SYMBOL(nf_ct_attach);
+
+#ifdef CONFIG_PROC_FS
+struct proc_dir_entry *proc_net_netfilter;
+EXPORT_SYMBOL(proc_net_netfilter);
+#endif
+
+/* Initialise every hook list, create the "netfilter" proc directory
+ * (when CONFIG_PROC_FS is set) and bring up the queue and log parts.
+ * Any failure panics. */
+void __init netfilter_init(void)
+{
+	int pf;
+
+	for (pf = 0; pf < NPROTO; pf++) {
+		int hooknum;
+
+		for (hooknum = 0; hooknum < NF_MAX_HOOKS; hooknum++)
+			INIT_LIST_HEAD(&nf_hooks[pf][hooknum]);
+	}
+
+#ifdef CONFIG_PROC_FS
+	proc_net_netfilter = proc_mkdir("netfilter", proc_net);
+	if (proc_net_netfilter == NULL)
+		panic("cannot create netfilter proc entry");
+#endif
+
+	if (netfilter_queue_init() < 0)
+		panic("cannot initialize nf_queue");
+
+	if (netfilter_log_init() < 0)
+		panic("cannot initialize nf_log");
+}
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
new file mode 100644
index 00000000000..6bdee291061
--- /dev/null
+++ b/net/netfilter/nf_internals.h
@@ -0,0 +1,39 @@
+#ifndef _NF_INTERNALS_H
+#define _NF_INTERNALS_H
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define NFDEBUG(format, args...) printk(format , ## args)
+#else
+#define NFDEBUG(format, args...)
+#endif
+
+
+/* core.c */
+extern unsigned int nf_iterate(struct list_head *head,
+ struct sk_buff **skb,
+ int hook,
+ const struct net_device *indev,
+ const struct net_device *outdev,
+ struct list_head **i,
+ int (*okfn)(struct sk_buff *),
+ int hook_thresh);
+
+/* nf_queue.c */
+extern int nf_queue(struct sk_buff **skb,
+ struct list_head *elem,
+ int pf, unsigned int hook,
+ struct net_device *indev,
+ struct net_device *outdev,
+ int (*okfn)(struct sk_buff *),
+ unsigned int queuenum);
+extern int __init netfilter_queue_init(void);
+
+/* nf_log.c */
+extern int __init netfilter_log_init(void);
+
+#endif
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
new file mode 100644
index 00000000000..3e76bd0824a
--- /dev/null
+++ b/net/netfilter/nf_log.c
@@ -0,0 +1,178 @@
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/seq_file.h>
+#include <net/protocol.h>
+
+#include "nf_internals.h"
+
+/* Internal logging interface, which relies on the real
+ LOG target modules */
+
+#define NF_LOG_PREFIXLEN 128
+
+static struct nf_logger *nf_logging[NPROTO]; /* = NULL */
+static DEFINE_SPINLOCK(nf_log_lock);
+
+/* Install @logger as the logging backend for protocol family @pf.
+ *
+ * Returns 0 on success, -EINVAL if @pf is not below NPROTO, -EEXIST if
+ * this same logger is already installed for @pf, and -EBUSY if a
+ * different logger holds the slot. */
+int nf_log_register(int pf, struct nf_logger *logger)
+{
+	int ret;
+
+	if (pf >= NPROTO)
+		return -EINVAL;
+
+	/* Any setup of logging members must be done before
+	 * substituting pointer. */
+	spin_lock(&nf_log_lock);
+	if (nf_logging[pf] == NULL) {
+		rcu_assign_pointer(nf_logging[pf], logger);
+		ret = 0;
+	} else if (nf_logging[pf] != logger) {
+		ret = -EBUSY;
+	} else {
+		ret = -EEXIST;
+	}
+	spin_unlock(&nf_log_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(nf_log_register);
+
+/* Clear the logger slot for protocol family @pf, whatever it holds.
+ * Returns -EINVAL if @pf is not below NPROTO, 0 otherwise. */
+int nf_log_unregister_pf(int pf)
+{
+ if (pf >= NPROTO)
+ return -EINVAL;
+
+ spin_lock(&nf_log_lock);
+ nf_logging[pf] = NULL;
+ spin_unlock(&nf_log_lock);
+
+ /* Give time to concurrent readers. */
+ synchronize_net();
+
+ return 0;
+}
+EXPORT_SYMBOL(nf_log_unregister_pf);
+
+/* Clear every protocol-family slot that currently points at @logger,
+ * then call synchronize_net(). */
+void nf_log_unregister_logger(struct nf_logger *logger)
+{
+	int pf;
+
+	spin_lock(&nf_log_lock);
+	for (pf = 0; pf < NPROTO; pf++) {
+		if (nf_logging[pf] != logger)
+			continue;
+		nf_logging[pf] = NULL;
+	}
+	spin_unlock(&nf_log_lock);
+
+	synchronize_net();
+}
+EXPORT_SYMBOL(nf_log_unregister_logger);
+
+/* Log @skb through the logger registered for @pf.
+ *
+ * @fmt and the variadic arguments are formatted into a prefix of at most
+ * NF_LOG_PREFIXLEN bytes and passed to the logger's logfn() together with
+ * the remaining arguments.  When no logger is registered, a rate-limited
+ * warning is printed instead. */
+void nf_log_packet(int pf,
+		   unsigned int hooknum,
+		   const struct sk_buff *skb,
+		   const struct net_device *in,
+		   const struct net_device *out,
+		   struct nf_loginfo *loginfo,
+		   const char *fmt, ...)
+{
+	struct nf_logger *logger;
+	char prefix[NF_LOG_PREFIXLEN];
+	va_list args;
+
+	rcu_read_lock();
+	logger = rcu_dereference(nf_logging[pf]);
+	if (!logger) {
+		if (net_ratelimit()) {
+			printk(KERN_WARNING "nf_log_packet: can\'t log since "
+			       "no backend logging module loaded in! Please either "
+			       "load one, or disable logging explicitly\n");
+		}
+		goto out;
+	}
+
+	va_start(args, fmt);
+	vsnprintf(prefix, sizeof(prefix), fmt, args);
+	va_end(args);
+	logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix);
+out:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(nf_log_packet);
+
+#ifdef CONFIG_PROC_FS
+/* seq_file iterator over the protocol families 0..NPROTO-1; the position
+ * itself is the family number.  Takes rcu_read_lock() on every call, also
+ * when it returns NULL; the matching unlock is in seq_stop().
+ * NOTE(review): this relies on seq_stop() running after every seq_start()
+ * -- confirm against the seq_file contract. */
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+{
+ rcu_read_lock();
+
+ if (*pos >= NPROTO)
+ return NULL;
+
+ return pos;
+}
+
+/* Advance to the next protocol family; NULL once NPROTO is reached. */
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ (*pos)++;
+
+ if (*pos >= NPROTO)
+ return NULL;
+
+ return pos;
+}
+
+/* Drop the RCU read lock taken in seq_start(). */
+static void seq_stop(struct seq_file *s, void *v)
+{
+ rcu_read_unlock();
+}
+
+/* Print one line per family: the family number followed by the name of
+ * its registered logger, or "NONE" when the slot is empty. */
+static int seq_show(struct seq_file *s, void *v)
+{
+ loff_t *pos = v;
+ const struct nf_logger *logger;
+
+ logger = rcu_dereference(nf_logging[*pos]);
+
+ if (!logger)
+ return seq_printf(s, "%2lld NONE\n", *pos);
+
+ return seq_printf(s, "%2lld %s\n", *pos, logger->name);
+}
+
+static struct seq_operations nflog_seq_ops = {
+ .start = seq_start,
+ .next = seq_next,
+ .stop = seq_stop,
+ .show = seq_show,
+};
+
+/* open() handler: attach the iterator above to the file. */
+static int nflog_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &nflog_seq_ops);
+}
+
+/* File operations installed on the "nf_log" proc entry. */
+static struct file_operations nflog_file_ops = {
+ .owner = THIS_MODULE,
+ .open = nflog_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#endif /* PROC_FS */
+
+
+/* With CONFIG_PROC_FS, create the "nf_log" entry under
+ * proc_net_netfilter and hook up its file operations.
+ * Returns 0 on success, -1 if the entry cannot be created. */
+int __init netfilter_log_init(void)
+{
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *entry;
+
+	entry = create_proc_entry("nf_log", S_IRUGO, proc_net_netfilter);
+	if (entry == NULL)
+		return -1;
+
+	entry->proc_fops = &nflog_file_ops;
+#endif
+	return 0;
+}
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
new file mode 100644
index 00000000000..d10d552d9c4
--- /dev/null
+++ b/net/netfilter/nf_queue.c
@@ -0,0 +1,343 @@
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/seq_file.h>
+#include <net/protocol.h>
+
+#include "nf_internals.h"
+
+/*
+ * A queue handler may be registered for each protocol. Each is protected by
+ * long term mutex. The handler must provide an outfn() to accept packets
+ * for queueing and must reinject all packets it receives, no matter what.
+ */
+static struct nf_queue_handler *queue_handler[NPROTO];
+static struct nf_queue_rerouter *queue_rerouter;
+
+static DEFINE_RWLOCK(queue_handler_lock);
+
+/* Install @qh as the queue handler for protocol family @pf.
+ *
+ * Returns 0 on success, -EINVAL if @pf is not below NPROTO, -EEXIST if
+ * @qh is already the handler for @pf, and -EBUSY if another handler is
+ * registered. */
+int nf_register_queue_handler(int pf, struct nf_queue_handler *qh)
+{
+	struct nf_queue_handler *cur;
+	int ret = 0;
+
+	if (pf >= NPROTO)
+		return -EINVAL;
+
+	write_lock_bh(&queue_handler_lock);
+	cur = queue_handler[pf];
+	if (cur == qh)
+		ret = -EEXIST;
+	else if (cur != NULL)
+		ret = -EBUSY;
+	else
+		queue_handler[pf] = qh;
+	write_unlock_bh(&queue_handler_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(nf_register_queue_handler);
+
+/* Clear the queue handler slot for protocol family @pf.
+ * The caller must flush their queue before this.
+ * Returns -EINVAL if @pf is not below NPROTO, 0 otherwise. */
+int nf_unregister_queue_handler(int pf)
+{
+ if (pf >= NPROTO)
+ return -EINVAL;
+
+ write_lock_bh(&queue_handler_lock);
+ queue_handler[pf] = NULL;
+ write_unlock_bh(&queue_handler_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(nf_unregister_queue_handler);
+
+/* Copy *@rer into the rerouter slot for protocol family @pf; the table
+ * keeps its own copy rather than a pointer to @rer.
+ * Returns -EINVAL if @pf is not below NPROTO, 0 otherwise. */
+int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer)
+{
+	struct nf_queue_rerouter *slot;
+
+	if (pf >= NPROTO)
+		return -EINVAL;
+
+	slot = &queue_rerouter[pf];
+
+	write_lock_bh(&queue_handler_lock);
+	*slot = *rer;
+	write_unlock_bh(&queue_handler_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_register_queue_rerouter);
+
+/* Zero the rerouter slot for protocol family @pf.
+ * Returns -EINVAL if @pf is not below NPROTO, 0 otherwise. */
+int nf_unregister_queue_rerouter(int pf)
+{
+ if (pf >= NPROTO)
+ return -EINVAL;
+
+ write_lock_bh(&queue_handler_lock);
+ memset(&queue_rerouter[pf], 0, sizeof(queue_rerouter[pf]));
+ write_unlock_bh(&queue_handler_lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_unregister_queue_rerouter);
+
+/* Clear every protocol-family slot whose queue handler is @qh. */
+void nf_unregister_queue_handlers(struct nf_queue_handler *qh)
+{
+	int pf = 0;
+
+	write_lock_bh(&queue_handler_lock);
+	while (pf < NPROTO) {
+		if (queue_handler[pf] == qh)
+			queue_handler[pf] = NULL;
+		pf++;
+	}
+	write_unlock_bh(&queue_handler_lock);
+}
+EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers);
+
+/*
+ * Hand *@skb to the queue handler registered for @pf.
+ *
+ * Any packet that leaves via this function must come back
+ * through nf_reinject().
+ *
+ * @elem is the hook list entry that asked for queueing; it is recorded in
+ * the nf_info passed to the handler.  @queuenum is passed on to the
+ * handler's outfn().
+ *
+ * Returns 0 only when the module owning that hook cannot be pinned with
+ * try_module_get(); the skb is left untouched in that case.  Returns 1
+ * otherwise: the skb has either been passed to outfn() or been freed here
+ * (no handler registered, allocation failure, or a negative status from
+ * the handler/rerouter).
+ */
+int nf_queue(struct sk_buff **skb,
+	     struct list_head *elem,
+	     int pf, unsigned int hook,
+	     struct net_device *indev,
+	     struct net_device *outdev,
+	     int (*okfn)(struct sk_buff *),
+	     unsigned int queuenum)
+{
+	int status;
+	struct nf_info *info;
+#ifdef CONFIG_BRIDGE_NETFILTER
+	struct net_device *physindev = NULL;
+	struct net_device *physoutdev = NULL;
+#endif
+
+	/* QUEUE == DROP if no one is waiting, to be safe.  The slot is NULL
+	 * until a handler registers, so it must be tested before outfn is
+	 * looked at. */
+	read_lock(&queue_handler_lock);
+	if (!queue_handler[pf] || !queue_handler[pf]->outfn) {
+		read_unlock(&queue_handler_lock);
+		kfree_skb(*skb);
+		return 1;
+	}
+
+	/* The rerouter for this pf may ask for extra room behind nf_info. */
+	info = kmalloc(sizeof(*info)+queue_rerouter[pf].rer_size, GFP_ATOMIC);
+	if (!info) {
+		if (net_ratelimit())
+			printk(KERN_ERR "OOM queueing packet %p\n",
+			       *skb);
+		read_unlock(&queue_handler_lock);
+		kfree_skb(*skb);
+		return 1;
+	}
+
+	*info = (struct nf_info) {
+		(struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn };
+
+	/* If it's going away, ignore hook. */
+	if (!try_module_get(info->elem->owner)) {
+		read_unlock(&queue_handler_lock);
+		kfree(info);
+		return 0;
+	}
+
+	/* Bump dev refs so they don't vanish while packet is out */
+	if (indev) dev_hold(indev);
+	if (outdev) dev_hold(outdev);
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if ((*skb)->nf_bridge) {
+		physindev = (*skb)->nf_bridge->physindev;
+		if (physindev) dev_hold(physindev);
+		physoutdev = (*skb)->nf_bridge->physoutdev;
+		if (physoutdev) dev_hold(physoutdev);
+	}
+#endif
+	if (queue_rerouter[pf].save)
+		queue_rerouter[pf].save(*skb, info);
+
+	status = queue_handler[pf]->outfn(*skb, info, queuenum,
+					  queue_handler[pf]->data);
+
+	if (status >= 0 && queue_rerouter[pf].reroute)
+		status = queue_rerouter[pf].reroute(skb, info);
+
+	read_unlock(&queue_handler_lock);
+
+	if (status < 0) {
+		/* Queueing failed: give back every reference taken above
+		 * and drop the packet. */
+		if (indev) dev_put(indev);
+		if (outdev) dev_put(outdev);
+#ifdef CONFIG_BRIDGE_NETFILTER
+		if (physindev) dev_put(physindev);
+		if (physoutdev) dev_put(physoutdev);
+#endif
+		module_put(info->elem->owner);
+		kfree(info);
+		kfree_skb(*skb);
+	}
+
+	return 1;
+}
+
+/*
+ * Resume processing of a packet that was handed out by nf_queue().
+ *
+ * Releases the device and module references taken in nf_queue(), then
+ * acts on @verdict:
+ *   NF_ACCEPT - continue with the hooks that follow the queueing hook;
+ *   NF_REPEAT - run the queueing hook itself again, then continue;
+ *   NF_DROP   - free the skb.
+ * If the hook that queued the packet is no longer on its list, the
+ * verdict is forced to NF_DROP.  @info is always freed before returning.
+ */
+void nf_reinject(struct sk_buff *skb, struct nf_info *info,
+		 unsigned int verdict)
+{
+	struct list_head *elem = &info->elem->list;
+	struct list_head *i;
+
+	rcu_read_lock();
+
+	/* Release those devices we held, or Alexey will kill me. */
+	if (info->indev) dev_put(info->indev);
+	if (info->outdev) dev_put(info->outdev);
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (skb->nf_bridge) {
+		if (skb->nf_bridge->physindev)
+			dev_put(skb->nf_bridge->physindev);
+		if (skb->nf_bridge->physoutdev)
+			dev_put(skb->nf_bridge->physoutdev);
+	}
+#endif
+
+	/* Drop reference to owner of hook which queued us. */
+	module_put(info->elem->owner);
+
+	/* Look for the queueing hook on its list. */
+	list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) {
+		if (i == elem)
+			break;
+	}
+
+	/* The cursor ends on the list head only when the walk ran off the
+	 * end without meeting elem, i.e. the hook has been unregistered. */
+	if (i == &nf_hooks[info->pf][info->hook]) {
+		/* The module which sent it to userspace is gone. */
+		NFDEBUG("%s: module disappeared, dropping packet.\n",
+			__FUNCTION__);
+		verdict = NF_DROP;
+	}
+
+	/* Continue traversal iff userspace said ok... */
+	if (verdict == NF_REPEAT) {
+		/* Step back so nf_iterate() starts with the same hook. */
+		elem = elem->prev;
+		verdict = NF_ACCEPT;
+	}
+
+	if (verdict == NF_ACCEPT) {
+	next_hook:
+		verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
+				     &skb, info->hook,
+				     info->indev, info->outdev, &elem,
+				     info->okfn, INT_MIN);
+	}
+
+	switch (verdict & NF_VERDICT_MASK) {
+	case NF_ACCEPT:
+		info->okfn(skb);
+		break;
+
+	case NF_QUEUE:
+		/* A zero return means the packet was not queued: carry on
+		 * with the hooks after elem. */
+		if (!nf_queue(&skb, elem, info->pf, info->hook,
+			      info->indev, info->outdev, info->okfn,
+			      verdict >> NF_VERDICT_BITS))
+			goto next_hook;
+		break;
+	}
+	rcu_read_unlock();
+
+	if (verdict == NF_DROP)
+		kfree_skb(skb);
+
+	kfree(info);
+}
+EXPORT_SYMBOL(nf_reinject);
+
+#ifdef CONFIG_PROC_FS
+/* seq_file iterator over the protocol families 0..NPROTO-1; the position
+ * itself is the family number. */
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos >= NPROTO)
+ return NULL;
+
+ return pos;
+}
+
+/* Advance to the next protocol family; NULL once NPROTO is reached. */
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ (*pos)++;
+
+ if (*pos >= NPROTO)
+ return NULL;
+
+ return pos;
+}
+
+/* Nothing to release: the lock is taken and dropped inside seq_show(). */
+static void seq_stop(struct seq_file *s, void *v)
+{
+
+}
+
+/* Print one line per family: the family number followed by the name of
+ * its registered queue handler, or "NONE" when the slot is empty.  The
+ * slot is read under queue_handler_lock. */
+static int seq_show(struct seq_file *s, void *v)
+{
+ int ret;
+ loff_t *pos = v;
+ struct nf_queue_handler *qh;
+
+ read_lock_bh(&queue_handler_lock);
+ qh = queue_handler[*pos];
+ if (!qh)
+ ret = seq_printf(s, "%2lld NONE\n", *pos);
+ else
+ ret = seq_printf(s, "%2lld %s\n", *pos, qh->name);
+ read_unlock_bh(&queue_handler_lock);
+
+ return ret;
+}
+
+static struct seq_operations nfqueue_seq_ops = {
+ .start = seq_start,
+ .next = seq_next,
+ .stop = seq_stop,
+ .show = seq_show,
+};
+
+/* open() handler: attach the iterator above to the file. */
+static int nfqueue_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &nfqueue_seq_ops);
+}
+
+/* File operations installed on the "nf_queue" proc entry. */
+static struct file_operations nfqueue_file_ops = {
+ .owner = THIS_MODULE,
+ .open = nfqueue_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif /* PROC_FS */
+
+
+/* Allocate and zero the per-family rerouter table and, with
+ * CONFIG_PROC_FS, create the "nf_queue" proc entry.
+ * Returns 0 on success, -ENOMEM if the table cannot be allocated and -1
+ * if the proc entry cannot be created (the table is freed again). */
+int __init netfilter_queue_init(void)
+{
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *entry;
+#endif
+
+	queue_rerouter = kmalloc(NPROTO * sizeof(struct nf_queue_rerouter),
+				 GFP_KERNEL);
+	if (queue_rerouter == NULL)
+		return -ENOMEM;
+	memset(queue_rerouter, 0, NPROTO * sizeof(struct nf_queue_rerouter));
+
+#ifdef CONFIG_PROC_FS
+	entry = create_proc_entry("nf_queue", S_IRUGO, proc_net_netfilter);
+	if (entry == NULL) {
+		kfree(queue_rerouter);
+		return -1;
+	}
+	entry->proc_fops = &nfqueue_file_ops;
+#endif
+
+	return 0;
+}
+
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
new file mode 100644
index 00000000000..61a833a9caa
--- /dev/null
+++ b/net/netfilter/nf_sockopt.c
@@ -0,0 +1,132 @@
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <net/sock.h>
+
+#include "nf_internals.h"
+
+/* Sockopts only registered and called from user context, so
+ net locking would be overkill. Also, [gs]etsockopt calls may
+ sleep. */
+static DECLARE_MUTEX(nf_sockopt_mutex);
+static LIST_HEAD(nf_sockopts);
+
+/* Non-zero when the ranges [min1, max1) and [min2, max2) share at least
+ * one value; the upper bounds are exclusive. */
+static inline int overlap(int min1, int max1, int min2, int max2)
+{
+	if (max1 <= min2)
+		return 0;
+
+	return min1 < max2;
+}
+
+/* Register a sockopt handler.  Neither the set range nor the get range
+ * of @reg may overlap that of a handler already registered for the same
+ * protocol family.
+ *
+ * Returns 0 on success, -EBUSY on overlap, and -EINTR if the wait for
+ * nf_sockopt_mutex is interrupted. */
+int nf_register_sockopt(struct nf_sockopt_ops *reg)
+{
+	struct list_head *pos;
+	int err = 0;
+
+	if (down_interruptible(&nf_sockopt_mutex) != 0)
+		return -EINTR;
+
+	for (pos = nf_sockopts.next; pos != &nf_sockopts; pos = pos->next) {
+		struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)pos;
+
+		if (ops->pf != reg->pf)
+			continue;
+		if (!overlap(ops->set_optmin, ops->set_optmax,
+			     reg->set_optmin, reg->set_optmax)
+		    && !overlap(ops->get_optmin, ops->get_optmax,
+				reg->get_optmin, reg->get_optmax))
+			continue;
+
+		NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n",
+			ops->set_optmin, ops->set_optmax,
+			ops->get_optmin, ops->get_optmax,
+			reg->set_optmin, reg->set_optmax,
+			reg->get_optmin, reg->get_optmax);
+		err = -EBUSY;
+		break;
+	}
+
+	if (err == 0)
+		list_add(&reg->list, &nf_sockopts);
+
+	up(&nf_sockopt_mutex);
+	return err;
+}
+EXPORT_SYMBOL(nf_register_sockopt);
+
+/* Remove @reg from the sockopt list, waiting until no call through it is
+ * in progress.  While reg->use is non-zero the caller records itself in
+ * reg->cleanup_task, sleeps, and re-checks after being woken (nf_sockopt()
+ * wakes cleanup_task when a call finishes). */
+void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
+{
+ /* No point being interruptible: we're probably in cleanup_module() */
+ restart:
+ down(&nf_sockopt_mutex);
+ if (reg->use != 0) {
+ /* To be woken by nf_sockopt call... */
+ /* FIXME: Stuart Young's name appears gratuitously. */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ reg->cleanup_task = current;
+ up(&nf_sockopt_mutex);
+ schedule();
+ goto restart;
+ }
+ list_del(&reg->list);
+ up(&nf_sockopt_mutex);
+}
+EXPORT_SYMBOL(nf_unregister_sockopt);
+
+/* Call get/setsockopt(): find the handler registered for @pf whose get
+ * (when @get is non-zero) or set range contains @val and invoke it.
+ *
+ * Returns the handler's result, -ENOPROTOOPT if no handler matches, or
+ * -EINTR if the wait for nf_sockopt_mutex is interrupted. */
+static int nf_sockopt(struct sock *sk, int pf, int val,
+ char __user *opt, int *len, int get)
+{
+ struct list_head *i;
+ struct nf_sockopt_ops *ops;
+ int ret;
+
+ if (down_interruptible(&nf_sockopt_mutex) != 0)
+ return -EINTR;
+
+ list_for_each(i, &nf_sockopts) {
+ ops = (struct nf_sockopt_ops *)i;
+ if (ops->pf == pf) {
+ if (get) {
+ if (val >= ops->get_optmin
+ && val < ops->get_optmax) {
+ /* Mark the handler busy, then call it with
+ * the mutex released. */
+ ops->use++;
+ up(&nf_sockopt_mutex);
+ ret = ops->get(sk, val, opt, len);
+ goto out;
+ }
+ } else {
+ if (val >= ops->set_optmin
+ && val < ops->set_optmax) {
+ ops->use++;
+ up(&nf_sockopt_mutex);
+ ret = ops->set(sk, val, opt, *len);
+ goto out;
+ }
+ }
+ }
+ }
+ up(&nf_sockopt_mutex);
+ return -ENOPROTOOPT;
+
+ out:
+ /* Drop the busy count and wake a waiting nf_unregister_sockopt(). */
+ down(&nf_sockopt_mutex);
+ ops->use--;
+ if (ops->cleanup_task)
+ wake_up_process(ops->cleanup_task);
+ up(&nf_sockopt_mutex);
+ return ret;
+}
+
+/* setsockopt() entry point: dispatch to the registered set handler for
+ * (@pf, @val) via nf_sockopt(). */
+int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt,
+ int len)
+{
+ return nf_sockopt(sk, pf, val, opt, &len, 0);
+}
+EXPORT_SYMBOL(nf_setsockopt);
+
+/* getsockopt() entry point: dispatch to the registered get handler for
+ * (@pf, @val) via nf_sockopt(). */
+int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len)
+{
+ return nf_sockopt(sk, pf, val, opt, len, 1);
+}
+EXPORT_SYMBOL(nf_getsockopt);
+
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
new file mode 100644
index 00000000000..4bc27a6334c
--- /dev/null
+++ b/net/netfilter/nfnetlink.c
@@ -0,0 +1,376 @@
+/* Netfilter messages via netlink socket. Allows for user space
+ * protocol helpers and general trouble making from userspace.
+ *
+ * (C) 2001 by Jay Schulist <jschlst@samba.org>,
+ * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * Initial netfilter messages via netlink development funded and
+ * generally made possible by Network Robots, Inc. (www.networkrobots.com)
+ *
+ * Further development of this code funded by Astaro AG (http://www.astaro.com)
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License, incorporated herein by reference.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fcntl.h>
+#include <linux/skbuff.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <net/sock.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+
+#include <linux/netfilter.h>
+#include <linux/netlink.h>
+#include <linux/netfilter/nfnetlink.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
+
+static char __initdata nfversion[] = "0.30";
+
+#if 0
+#define DEBUGP(format, args...) \
+ printk(KERN_DEBUG "%s(%d):%s(): " format, __FILE__, \
+ __LINE__, __FUNCTION__, ## args)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static struct sock *nfnl = NULL;
+static struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT];
+DECLARE_MUTEX(nfnl_sem);
+
+/* Take the nfnetlink lock; thin wrapper around nfnl_shlock().
+ * NOTE(review): presumably this acquires nfnl_sem -- confirm in the
+ * header that defines nfnl_shlock(). */
+void nfnl_lock(void)
+{
+ nfnl_shlock();
+}
+
+/* Release the nfnetlink lock; thin wrapper around nfnl_shunlock().
+ * Comments elsewhere in this file say nfnl_shunlock() re-enters packet
+ * processing. */
+void nfnl_unlock(void)
+{
+ nfnl_shunlock();
+}
+
+/* Register subsystem @n in the slot given by n->subsys_id.
+ * Returns 0 on success, -EBUSY if that slot is already taken. */
+int nfnetlink_subsys_register(struct nfnetlink_subsystem *n)
+{
+	int ret = 0;
+
+	DEBUGP("registering subsystem ID %u\n", n->subsys_id);
+
+	nfnl_lock();
+	if (subsys_table[n->subsys_id])
+		ret = -EBUSY;
+	else
+		subsys_table[n->subsys_id] = n;
+	nfnl_unlock();
+
+	return ret;
+}
+
+/* Clear the subsystem slot given by n->subsys_id, without checking that
+ * it currently holds @n.  Always returns 0. */
+int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n)
+{
+ DEBUGP("unregistering subsystem ID %u\n", n->subsys_id);
+
+ nfnl_lock();
+ subsys_table[n->subsys_id] = NULL;
+ nfnl_unlock();
+
+ return 0;
+}
+
+/* Look up the subsystem addressed by message type @type.  Returns NULL
+ * when the subsystem id is out of range or nothing is registered for it. */
+static inline struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type)
+{
+	u_int8_t subsys_id = NFNL_SUBSYS_ID(type);
+
+	if (subsys_id >= NFNL_SUBSYS_COUNT)
+		return NULL;
+
+	/* An empty slot is NULL already, so it can be returned as is. */
+	return subsys_table[subsys_id];
+}
+
+/* Return the callback of subsystem @ss selected by the message-type part
+ * of @type, or NULL if that index is not below ss->cb_count. */
+static inline struct nfnl_callback *
+nfnetlink_find_client(u_int16_t type, struct nfnetlink_subsystem *ss)
+{
+ u_int8_t cb_id = NFNL_MSG_TYPE(type);
+
+ if (cb_id >= ss->cb_count) {
+ DEBUGP("msgtype %u >= %u, returning\n", type, ss->cb_count);
+ return NULL;
+ }
+
+ return &ss->cb[cb_id];
+}
+
+/* Append one attribute of type @attrtype carrying @attrlen bytes of
+ * @data to @skb.  The space taken from the skb is the aligned attribute
+ * length; the bytes between the end of the payload and that aligned
+ * length are zeroed. */
+void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen,
+		const void *data)
+{
+	int len = NFA_LENGTH(attrlen);
+	int padded = NFA_ALIGN(len);
+	struct nfattr *attr;
+
+	attr = (struct nfattr *)skb_put(skb, padded);
+	attr->nfa_type = attrtype;
+	attr->nfa_len = len;
+	memcpy(NFA_DATA(attr), data, attrlen);
+	memset(NFA_DATA(attr) + attrlen, 0, padded - len);
+}
+
+/* Index the attributes found in the @len bytes starting at @nfa.
+ *
+ * @tb must have room for @maxattr pointers and is zeroed first.  An
+ * attribute of type t (1 <= t <= @maxattr) is stored in tb[t - 1]; when a
+ * type occurs more than once the last one wins.  Other types are
+ * ignored.  Always returns 0. */
+int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len)
+{
+	memset(tb, 0, sizeof(struct nfattr *) * maxattr);
+
+	for (; NFA_OK(nfa, len); nfa = NFA_NEXT(nfa, len)) {
+		unsigned type = NFA_TYPE(nfa);
+
+		if (!type || type > maxattr)
+			continue;
+		tb[type - 1] = nfa;
+	}
+
+	return 0;
+}
+
+/**
+ * nfnetlink_check_attributes - check and parse nfnetlink attributes
+ *
+ * subsys: nfnl subsystem for which this message is to be parsed
+ * nlmsghdr: netlink message to be checked/parsed
+ * cda: array of pointers, needs to be at least subsys->attr_count big
+ *
+ * Returns 0 with cda[] filled in (cda[t - 1] points at the attribute of
+ * type t, NULL where absent), or -EINVAL if the message type has no
+ * callback, the message is too short to hold the netlink header plus a
+ * struct nfgenmsg, or an attribute type exceeds the callback's
+ * attr_count.
+ */
+static int
+nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys,
+			   struct nlmsghdr *nlh, struct nfattr *cda[])
+{
+	/* nlmsg_len includes the netlink header, so the shortest valid
+	 * message is the aligned header plus the aligned nfgenmsg.  The
+	 * attributes start right behind that. */
+	int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
+	u_int16_t attr_count;
+	u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+
+	if (unlikely(cb_id >= subsys->cb_count)) {
+		DEBUGP("msgtype %u >= %u, returning\n",
+			cb_id, subsys->cb_count);
+		return -EINVAL;
+	}
+
+	if (unlikely(nlh->nlmsg_len < min_len))
+		return -EINVAL;
+
+	attr_count = subsys->cb[cb_id].attr_count;
+	memset(cda, 0, sizeof(struct nfattr *) * attr_count);
+
+	/* check attribute lengths. */
+	if (likely(nlh->nlmsg_len > min_len)) {
+		struct nfattr *attr = NFM_NFA(NLMSG_DATA(nlh));
+		/* Only the bytes behind header + nfgenmsg are attributes. */
+		int attrlen = nlh->nlmsg_len - min_len;
+
+		while (NFA_OK(attr, attrlen)) {
+			unsigned flavor = NFA_TYPE(attr);
+			if (flavor) {
+				if (flavor > attr_count)
+					return -EINVAL;
+				cda[flavor - 1] = attr;
+			}
+			attr = NFA_NEXT(attr, attrlen);
+		}
+	}
+
+	/* implicit: if nlmsg_len == min_len, we return 0, and an empty
+	 * (zeroed) cda[] array.  The message is valid, but empty. */
+
+	return 0;
+}
+
+/* Broadcast @skb to netlink group @group on the nfnetlink socket.
+ * When @echo is non-zero an extra reference is taken on the skb before
+ * the broadcast and the skb is afterwards also unicast to @pid with
+ * MSG_DONTWAIT.
+ *
+ * Returns the result of that unicast when @echo is set, 0 otherwise; the
+ * result of the broadcast itself is not reported. */
+int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
+{
+ /* Use an atomic allocation when called from interrupt context. */
+ gfp_t allocation = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
+ int err = 0;
+
+ NETLINK_CB(skb).dst_group = group;
+ if (echo)
+ atomic_inc(&skb->users);
+ netlink_broadcast(nfnl, skb, pid, group, allocation);
+ if (echo)
+ err = netlink_unicast(nfnl, skb, pid, MSG_DONTWAIT);
+
+ return err;
+}
+
+/* Unicast @skb to @pid over the nfnetlink socket; returns whatever
+ * netlink_unicast() returns. */
+int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags)
+{
+ return netlink_unicast(nfnl, skb, pid, flags);
+}
+
+/* Process one complete nfnetlink message.
+ *
+ * Returns 0 for messages that are ignored (not a request, or too short
+ * to contain a struct nfgenmsg).  Returns -1 with *errp set to -EINVAL
+ * when no subsystem or callback matches or the attributes are invalid,
+ * and -1 with *errp set to -EPERM when a required capability is missing.
+ * Otherwise the callback is invoked and its return value is both stored
+ * in *errp and returned. */
+static inline int nfnetlink_rcv_msg(struct sk_buff *skb,
+ struct nlmsghdr *nlh, int *errp)
+{
+ struct nfnl_callback *nc;
+ struct nfnetlink_subsystem *ss;
+ int type, err = 0;
+
+ DEBUGP("entered; subsys=%u, msgtype=%u\n",
+ NFNL_SUBSYS_ID(nlh->nlmsg_type),
+ NFNL_MSG_TYPE(nlh->nlmsg_type));
+
+ /* Only requests are handled by kernel now. */
+ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) {
+ DEBUGP("received non-request message\n");
+ return 0;
+ }
+
+ /* All the messages must at least contain nfgenmsg */
+ if (nlh->nlmsg_len <
+ NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct nfgenmsg)))) {
+ DEBUGP("received message was too short\n");
+ return 0;
+ }
+
+ type = nlh->nlmsg_type;
+ ss = nfnetlink_get_subsys(type);
+ if (!ss) {
+#ifdef CONFIG_KMOD
+ /* don't call nfnl_shunlock, since it would reenter
+ * with further packet processing */
+ up(&nfnl_sem);
+ /* Try to load a module for the subsystem, then look again. */
+ request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type));
+ nfnl_shlock();
+ ss = nfnetlink_get_subsys(type);
+ if (!ss)
+#endif
+ goto err_inval;
+ }
+
+ nc = nfnetlink_find_client(type, ss);
+ if (!nc) {
+ DEBUGP("unable to find client for type %d\n", type);
+ goto err_inval;
+ }
+
+ if (nc->cap_required &&
+ !cap_raised(NETLINK_CB(skb).eff_cap, nc->cap_required)) {
+ DEBUGP("permission denied for type %d\n", type);
+ *errp = -EPERM;
+ return -1;
+ }
+
+ {
+ /* One pointer slot per attribute type known to this callback. */
+ u_int16_t attr_count =
+ ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count;
+ struct nfattr *cda[attr_count];
+
+ memset(cda, 0, sizeof(struct nfattr *) * attr_count);
+
+ err = nfnetlink_check_attributes(ss, nlh, cda);
+ if (err < 0)
+ goto err_inval;
+
+ DEBUGP("calling handler\n");
+ err = nc->call(nfnl, skb, nlh, cda, errp);
+ *errp = err;
+ return err;
+ }
+
+err_inval:
+ DEBUGP("returning -EINVAL\n");
+ *errp = -EINVAL;
+ return -1;
+}
+
+/* Process one packet of messages.
+ *
+ * Walks the netlink messages in @skb, passing each to nfnetlink_rcv_msg()
+ * and pulling it off the skb afterwards.  A failed message is answered
+ * with netlink_ack() carrying the error; a successful one is acked only
+ * if it has NLM_F_ACK set.
+ *
+ * Returns -1 if nfnetlink_rcv_msg() returns non-zero without setting an
+ * error, 0 otherwise (including when a malformed header stops the walk). */
+static inline int nfnetlink_rcv_skb(struct sk_buff *skb)
+{
+ int err;
+ struct nlmsghdr *nlh;
+
+ while (skb->len >= NLMSG_SPACE(0)) {
+ u32 rlen;
+
+ nlh = (struct nlmsghdr *)skb->data;
+ if (nlh->nlmsg_len < sizeof(struct nlmsghdr)
+ || skb->len < nlh->nlmsg_len)
+ return 0;
+ /* Aligned message length, capped at what is left in the skb. */
+ rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (rlen > skb->len)
+ rlen = skb->len;
+ if (nfnetlink_rcv_msg(skb, nlh, &err)) {
+ if (!err)
+ return -1;
+ netlink_ack(skb, nlh, err);
+ } else
+ if (nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(skb, nlh, 0);
+ skb_pull(skb, rlen);
+ }
+
+ return 0;
+}
+
+/* Input callback of the nfnetlink kernel socket: drain @sk's receive
+ * queue, processing each skb with nfnetlink_rcv_skb().
+ *
+ * Returns at once if nfnl_shlock_nowait() reports failure.  An skb whose
+ * processing returns non-zero is put back at the head of the queue if it
+ * still has data, otherwise freed, and draining stops for this pass.  The
+ * outer loop repeats while the socket exists and its queue is
+ * non-empty. */
+static void nfnetlink_rcv(struct sock *sk, int len)
+{
+ do {
+ struct sk_buff *skb;
+
+ if (nfnl_shlock_nowait())
+ return;
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ if (nfnetlink_rcv_skb(skb)) {
+ if (skb->len)
+ skb_queue_head(&sk->sk_receive_queue,
+ skb);
+ else
+ kfree_skb(skb);
+ break;
+ }
+ kfree_skb(skb);
+ }
+
+ /* don't call nfnl_shunlock, since it would reenter
+ * with further packet processing */
+ up(&nfnl_sem);
+ } while(nfnl && nfnl->sk_receive_queue.qlen);
+}
+
+/* Module exit: log a message and release the nfnetlink kernel socket. */
+static void __exit nfnetlink_exit(void)
+{
+ printk("Removing netfilter NETLINK layer.\n");
+ sock_release(nfnl->sk_socket);
+ return;
+}
+
+/* Module init: create the NETLINK_NETFILTER kernel socket with
+ * nfnetlink_rcv() as its input callback.
+ * Returns 0 on success, -1 if the socket cannot be created. */
+static int __init nfnetlink_init(void)
+{
+ printk("Netfilter messages via NETLINK v%s.\n", nfversion);
+
+ nfnl = netlink_kernel_create(NETLINK_NETFILTER, NFNLGRP_MAX,
+ nfnetlink_rcv, THIS_MODULE);
+ if (!nfnl) {
+ printk(KERN_ERR "cannot initialize nfnetlink!\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+module_init(nfnetlink_init);
+module_exit(nfnetlink_exit);
+
+EXPORT_SYMBOL_GPL(nfnetlink_subsys_register);
+EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
+EXPORT_SYMBOL_GPL(nfnetlink_send);
+EXPORT_SYMBOL_GPL(nfnetlink_unicast);
+EXPORT_SYMBOL_GPL(nfattr_parse);
+EXPORT_SYMBOL_GPL(__nfa_fill);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
new file mode 100644
index 00000000000..efcd10f996b
--- /dev/null
+++ b/net/netfilter/nfnetlink_log.c
@@ -0,0 +1,1055 @@
+/*
+ * This is a module which is used for logging packets to userspace via
+ * nfnetlink.
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * Based on the old ipv4-only ipt_ULOG.c:
+ * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netlink.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_log.h>
+#include <linux/spinlock.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <net/sock.h>
+
+#include <asm/atomic.h>
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+#include "../bridge/br_private.h"
+#endif
+
+#define NFULNL_NLBUFSIZ_DEFAULT 4096 /* bytes */
+#define NFULNL_TIMEOUT_DEFAULT 100 /* in 1/100 s: flush every second */
+#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */
+
+/* Rate-limited printk().  Deliberately has no trailing semicolon, so that
+ * "PRINTR(...);" expands to exactly one statement and stays safe inside an
+ * unbraced if/else. */
+#define PRINTR(x, args...)	do { if (net_ratelimit()) \
+				     printk(x, ## args); } while (0)
+
+/* Debug tracing; change the "#if 0" to "#if 1" to compile it in. */
+#if 0
+#define UDEBUG(x, args ...)	printk(KERN_DEBUG "%s(%d):%s(): " x,	   \
+					__FILE__, __LINE__, __FUNCTION__,  \
+					## args)
+#else
+#define UDEBUG(x, ...)
+#endif
+
+/*
+ * One logging instance per bound group number.  Packets are batched into
+ * ->skb and sent to ->peer_pid when the queue threshold is reached, the
+ * buffer is full, or the flush timer fires.
+ */
+struct nfulnl_instance {
+ struct hlist_node hlist; /* global list of instances */
+ spinlock_t lock; /* held around changes to the queue state and config */
+ atomic_t use; /* use count */
+
+ unsigned int qlen; /* number of nlmsgs in skb */
+ struct sk_buff *skb; /* pre-allocated skb */
+ struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */
+ struct timer_list timer; /* flush timer, runs nfulnl_timer() */
+ int peer_pid; /* PID of the peer process */
+
+ /* configurable parameters */
+ unsigned int flushtimeout; /* timeout until queue flush, in 1/100 s */
+ unsigned int nlbufsiz; /* netlink buffer allocation size */
+ unsigned int qthreshold; /* threshold of the queue */
+ u_int32_t copy_range; /* max. payload bytes copied per packet */
+ u_int16_t group_num; /* number of this queue */
+ u_int8_t copy_mode; /* one of NFULNL_COPY_* */
+};
+
+/* protects instance_table; always taken with the _bh variants */
+static DEFINE_RWLOCK(instances_lock);
+
+#define INSTANCE_BUCKETS 16
+/* hash table of all instances, indexed by instance_hashfn(group_num) */
+static struct hlist_head instance_table[INSTANCE_BUCKETS];
+/* filled with random bytes at init time; instance_hashfn() does not use it */
+static unsigned int hash_init;
+
+/* Map a group number to its bucket in instance_table (low byte only). */
+static inline u_int8_t instance_hashfn(u_int16_t group_num)
+{
+	const u_int16_t low_byte = group_num & 0xff;
+
+	return low_byte % INSTANCE_BUCKETS;
+}
+
+/*
+ * Find the instance bound to @group_num, or NULL if there is none.
+ * Does not take instances_lock and does not take a reference; the callers
+ * in this file hold instances_lock around it.
+ */
+static struct nfulnl_instance *
+__instance_lookup(u_int16_t group_num)
+{
+	struct hlist_head *bucket = &instance_table[instance_hashfn(group_num)];
+	struct nfulnl_instance *cur;
+	struct hlist_node *node;
+
+	UDEBUG("entering (group_num=%u)\n", group_num);
+
+	hlist_for_each_entry(cur, node, bucket, hlist)
+		if (cur->group_num == group_num)
+			return cur;
+
+	return NULL;
+}
+
+/* Take an additional reference on @inst; pair with instance_put(). */
+static inline void instance_get(struct nfulnl_instance *inst)
+{
+	atomic_inc(&inst->use);
+}
+
+/*
+ * Look up the instance for @group_num under instances_lock and return it
+ * with its use count raised, or NULL if no such instance exists.  The
+ * caller must drop the reference with instance_put().
+ */
+static struct nfulnl_instance *
+instance_lookup_get(u_int16_t group_num)
+{
+	struct nfulnl_instance *found;
+
+	read_lock_bh(&instances_lock);
+	found = __instance_lookup(group_num);
+	if (found != NULL)
+		atomic_inc(&found->use);
+	read_unlock_bh(&instances_lock);
+
+	return found;
+}
+
+/*
+ * Drop one reference on @inst and free it once the count reaches zero.
+ * A NULL @inst is accepted and ignored.
+ */
+static void
+instance_put(struct nfulnl_instance *inst)
+{
+	if (!inst)
+		return;
+
+	if (!atomic_dec_and_test(&inst->use))
+		return;
+
+	UDEBUG("kfree(inst=%p)\n", inst);
+	kfree(inst);
+}
+
+static void nfulnl_timer(unsigned long data);
+
+/*
+ * Allocate a new instance for @group_num owned by netlink peer @pid and
+ * insert it into instance_table.
+ *
+ * Returns the new instance with a use count of 2 (one reference for the
+ * hash table, one for the caller, who must instance_put() it), or NULL if
+ * the group is already bound, the allocation fails, or no module reference
+ * can be taken.
+ */
+static struct nfulnl_instance *
+instance_create(u_int16_t group_num, int pid)
+{
+	struct nfulnl_instance *inst;
+
+	UDEBUG("entering (group_num=%u, pid=%d)\n", group_num,
+		pid);
+
+	write_lock_bh(&instances_lock);
+	if (__instance_lookup(group_num)) {
+		inst = NULL;
+		UDEBUG("aborting, instance already exists\n");
+		goto out_unlock;
+	}
+
+	/* GFP_ATOMIC: we hold instances_lock with BHs disabled */
+	inst = kmalloc(sizeof(*inst), GFP_ATOMIC);
+	if (!inst)
+		goto out_unlock;
+
+	memset(inst, 0, sizeof(*inst));
+	INIT_HLIST_NODE(&inst->hlist);
+	inst->lock = SPIN_LOCK_UNLOCKED;
+	/* needs to be two, since we _put() after creation */
+	atomic_set(&inst->use, 2);
+
+	init_timer(&inst->timer);
+	inst->timer.function = nfulnl_timer;
+	inst->timer.data = (unsigned long)inst;
+	/* don't start timer yet. (re)start it with every packet */
+
+	inst->peer_pid = pid;
+	inst->group_num = group_num;
+
+	inst->qthreshold 	= NFULNL_QTHRESH_DEFAULT;
+	inst->flushtimeout 	= NFULNL_TIMEOUT_DEFAULT;
+	inst->nlbufsiz 		= NFULNL_NLBUFSIZ_DEFAULT;
+	inst->copy_mode 	= NFULNL_COPY_PACKET;
+	inst->copy_range 	= 0xffff;
+
+	/* every instance pins the module until it is destroyed */
+	if (!try_module_get(THIS_MODULE))
+		goto out_free;
+
+	hlist_add_head(&inst->hlist,
+		       &instance_table[instance_hashfn(group_num)]);
+
+	UDEBUG("newly added node: %p, next=%p\n", &inst->hlist,
+		inst->hlist.next);
+
+	write_unlock_bh(&instances_lock);
+
+	return inst;
+
+out_free:
+	/* The instance was never published and its use count is still 2, so
+	 * a single instance_put() would never free it.  Free it directly. */
+	kfree(inst);
+out_unlock:
+	write_unlock_bh(&instances_lock);
+	return NULL;
+}
+
+static int __nfulnl_send(struct nfulnl_instance *inst);
+
+/*
+ * Tear down @inst: unlink it from instance_table, send or free its pending
+ * skb, drop one reference and release the module reference taken in
+ * instance_create().
+ *
+ * @lock: nonzero if instances_lock has to be taken here, zero if the caller
+ * already holds it for writing.
+ */
+static void
+_instance_destroy2(struct nfulnl_instance *inst, int lock)
+{
+ /* first pull it out of the global list */
+ if (lock)
+ write_lock_bh(&instances_lock);
+
+ UDEBUG("removing instance %p (queuenum=%u) from hash\n",
+ inst, inst->group_num);
+
+ hlist_del(&inst->hlist);
+
+ if (lock)
+ write_unlock_bh(&instances_lock);
+
+ /* then flush all pending packets from skb */
+
+ spin_lock_bh(&inst->lock);
+ if (inst->skb) {
+ if (inst->qlen)
+ __nfulnl_send(inst);
+ /* __nfulnl_send() clears inst->skb, so this only frees a buffer
+ * that held no queued messages */
+ if (inst->skb) {
+ kfree_skb(inst->skb);
+ inst->skb = NULL;
+ }
+ }
+ spin_unlock_bh(&inst->lock);
+
+ /* and finally put the refcount */
+ instance_put(inst);
+
+ module_put(THIS_MODULE);
+}
+
+/* Destroy @inst; the caller already holds instances_lock for writing. */
+static inline void __instance_destroy(struct nfulnl_instance *inst)
+{
+	const int take_lock = 0;
+
+	_instance_destroy2(inst, take_lock);
+}
+
+/* Destroy @inst, taking instances_lock around the hash table removal. */
+static inline void instance_destroy(struct nfulnl_instance *inst)
+{
+	const int take_lock = 1;
+
+	_instance_destroy2(inst, take_lock);
+}
+
+/*
+ * Set the copy mode and copy range of @inst under inst->lock.
+ *
+ * For NFULNL_COPY_NONE and NFULNL_COPY_META the range is forced to 0; for
+ * NFULNL_COPY_PACKET it is capped at 0xffff.  Returns 0, or -EINVAL for an
+ * unknown @mode (the instance is then left untouched).
+ */
+static int
+nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode,
+		  unsigned int range)
+{
+	int status = 0;
+
+	spin_lock_bh(&inst->lock);
+
+	if (mode == NFULNL_COPY_NONE || mode == NFULNL_COPY_META) {
+		inst->copy_mode = mode;
+		inst->copy_range = 0;
+	} else if (mode == NFULNL_COPY_PACKET) {
+		inst->copy_mode = mode;
+		/* we're using struct nfattr which has 16bit nfa_len */
+		inst->copy_range = (range > 0xffff) ? 0xffff : range;
+	} else {
+		status = -EINVAL;
+	}
+
+	spin_unlock_bh(&inst->lock);
+
+	return status;
+}
+
+/*
+ * Set the netlink buffer allocation size of @inst.  Accepts values from
+ * NFULNL_NLBUFSIZ_DEFAULT up to 131072 inclusive; anything else yields
+ * -ERANGE and leaves the instance unchanged.  Returns 0 on success.
+ */
+static int
+nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz)
+{
+	int status = -ERANGE;
+
+	spin_lock_bh(&inst->lock);
+	if (nlbufsiz >= NFULNL_NLBUFSIZ_DEFAULT && nlbufsiz <= 131072) {
+		inst->nlbufsiz = nlbufsiz;
+		status = 0;
+	}
+	spin_unlock_bh(&inst->lock);
+
+	return status;
+}
+
+/* Store @timeout as the flush timeout of @inst; always returns 0. */
+static int nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout)
+{
+	spin_lock_bh(&inst->lock);
+	inst->flushtimeout = timeout;
+	spin_unlock_bh(&inst->lock);
+
+	return 0;
+}
+
+/* Store @qthresh as the queue threshold of @inst; always returns 0. */
+static int nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh)
+{
+	spin_lock_bh(&inst->lock);
+	inst->qthreshold = qthresh;
+	spin_unlock_bh(&inst->lock);
+
+	return 0;
+}
+
+/*
+ * Allocate the skb that collects log messages.  First tries @inst_size
+ * bytes (room for a whole multipart message); if that fails, falls back to
+ * @pkt_size bytes, enough for the current packet only.  Returns NULL if
+ * both attempts fail.
+ */
+static struct sk_buff *nfulnl_alloc_skb(unsigned int inst_size,
+					unsigned int pkt_size)
+{
+	struct sk_buff *skb;
+
+	UDEBUG("entered (%u, %u)\n", inst_size, pkt_size);
+
+	/* alloc skb which should be big enough for a whole multipart
+	 * message.  WARNING: has to be <= 128k due to slab restrictions */
+	skb = alloc_skb(inst_size, GFP_ATOMIC);
+	if (skb)
+		return skb;
+
+	PRINTR("nfnetlink_log: can't alloc whole buffer (%u bytes)\n",
+		inst_size);
+
+	/* try to allocate only as much as we need for current packet */
+	skb = alloc_skb(pkt_size, GFP_ATOMIC);
+	if (!skb) {
+		PRINTR("nfnetlink_log: can't even alloc %u bytes\n",
+			pkt_size);
+	}
+
+	return skb;
+}
+
+/*
+ * Send the instance's pending skb to the peer and reset the queue state.
+ * Must be called with inst->lock held, with inst->skb set, and with the
+ * caller holding its own reference on @inst.
+ *
+ * Returns the result of nfnetlink_unicast() (negative on failure).  The
+ * skb is handed to nfnetlink_unicast() either way.
+ */
+static int
+__nfulnl_send(struct nfulnl_instance *inst)
+{
+	int status;
+
+	/* A pending timer holds a reference on the instance (taken when it
+	 * was armed).  If we cancel it here its handler never runs, so that
+	 * reference has to be dropped by us or it is leaked. */
+	if (del_timer(&inst->timer))
+		instance_put(inst);
+
+	/* terminate a multipart batch; lastnlh may be unset, don't oops */
+	if (inst->qlen > 1 && inst->lastnlh)
+		inst->lastnlh->nlmsg_type = NLMSG_DONE;
+
+	status = nfnetlink_unicast(inst->skb, inst->peer_pid, MSG_DONTWAIT);
+	if (status < 0) {
+		UDEBUG("netlink_unicast() failed\n");
+		/* FIXME: statistics */
+	}
+
+	inst->qlen = 0;
+	inst->skb = NULL;
+	inst->lastnlh = NULL;
+
+	return status;
+}
+
+/*
+ * Flush timer handler: send whatever has been batched so far.
+ * @data is the struct nfulnl_instance the timer belongs to; the timer holds
+ * a reference on it, which is dropped here.
+ */
+static void nfulnl_timer(unsigned long data)
+{
+	struct nfulnl_instance *inst = (struct nfulnl_instance *)data;
+
+	UDEBUG("timer function called, flushing buffer\n");
+
+	spin_lock_bh(&inst->lock);
+	/* the buffer may already have been flushed or freed elsewhere */
+	if (inst->skb)
+		__nfulnl_send(inst);
+	spin_unlock_bh(&inst->lock);
+
+	/* Drop the timer's reference only after unlocking: this may be the
+	 * last reference, and instance_put() would then free the memory that
+	 * contains the lock. */
+	instance_put(inst);
+}
+
+/*
+ * Append one NFULNL_MSG_PACKET netlink message describing @skb to the
+ * instance's pending buffer inst->skb.  Called with inst->lock held.
+ *
+ * @data_len:	number of payload bytes of @skb to copy (0 for none)
+ * @pf:		protocol family, stored in the nfgenmsg header
+ * @hooknum:	netfilter hook the packet was seen on
+ * @indev/@outdev: input/output device, either may be NULL
+ * @li:		log info of the rule (currently unused here)
+ * @prefix:	optional log prefix, truncated to NFULNL_PREFIXLEN
+ *
+ * NLMSG_PUT()/NFA_PUT() leave through the nlmsg_failure/nfattr_failure
+ * labels when the message does not fit.  Returns 0 on success, -1 on
+ * failure.
+ */
+static inline int
+__build_packet_message(struct nfulnl_instance *inst,
+			const struct sk_buff *skb,
+			unsigned int data_len,
+			unsigned int pf,
+			unsigned int hooknum,
+			const struct net_device *indev,
+			const struct net_device *outdev,
+			const struct nf_loginfo *li,
+			const char *prefix)
+{
+	unsigned char *old_tail;
+	struct nfulnl_msg_packet_hdr pmsg;
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	u_int32_t tmp_uint;
+
+	UDEBUG("entered\n");
+
+	old_tail = inst->skb->tail;
+	nlh = NLMSG_PUT(inst->skb, 0, 0,
+			NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET,
+			sizeof(struct nfgenmsg));
+	nfmsg = NLMSG_DATA(nlh);
+	nfmsg->nfgen_family = pf;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = htons(inst->group_num);
+
+	/* skb->protocol already is in network byte order; converting it
+	 * again would byte-swap it back on little-endian hosts */
+	pmsg.hw_protocol	= skb->protocol;
+	pmsg.hook		= hooknum;
+
+	NFA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg);
+
+	if (prefix) {
+		int slen = strlen(prefix);
+		if (slen > NFULNL_PREFIXLEN)
+			slen = NFULNL_PREFIXLEN;
+		NFA_PUT(inst->skb, NFULA_PREFIX, slen, prefix);
+	}
+
+	if (indev) {
+		tmp_uint = htonl(indev->ifindex);
+#ifndef CONFIG_BRIDGE_NETFILTER
+		NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, sizeof(tmp_uint),
+			&tmp_uint);
+#else
+		if (pf == PF_BRIDGE) {
+			/* Case 1: indev is physical input device, we need to
+			 * look for bridge group (when called from
+			 * netfilter_bridge) */
+			NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV,
+				sizeof(tmp_uint), &tmp_uint);
+			/* this is the bridge group "brX" */
+			tmp_uint = htonl(indev->br_port->br->dev->ifindex);
+			NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV,
+				sizeof(tmp_uint), &tmp_uint);
+		} else {
+			/* Case 2: indev is bridge group, we need to look for
+			 * physical device (when called from ipv4) */
+			NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV,
+				sizeof(tmp_uint), &tmp_uint);
+			if (skb->nf_bridge && skb->nf_bridge->physindev) {
+				tmp_uint =
+				    htonl(skb->nf_bridge->physindev->ifindex);
+				NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV,
+					sizeof(tmp_uint), &tmp_uint);
+			}
+		}
+#endif
+	}
+
+	if (outdev) {
+		tmp_uint = htonl(outdev->ifindex);
+#ifndef CONFIG_BRIDGE_NETFILTER
+		NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, sizeof(tmp_uint),
+			&tmp_uint);
+#else
+		if (pf == PF_BRIDGE) {
+			/* Case 1: outdev is physical output device, we need to
+			 * look for bridge group (when called from
+			 * netfilter_bridge) */
+			NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
+				sizeof(tmp_uint), &tmp_uint);
+			/* this is the bridge group "brX" */
+			tmp_uint = htonl(outdev->br_port->br->dev->ifindex);
+			NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV,
+				sizeof(tmp_uint), &tmp_uint);
+		} else {
+			/* Case 2: outdev is a bridge group, we need to look
+			 * for physical device (when called from ipv4) */
+			NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV,
+				sizeof(tmp_uint), &tmp_uint);
+			/* physoutdev may not be set; check it just like
+			 * physindev is checked above */
+			if (skb->nf_bridge && skb->nf_bridge->physoutdev) {
+				tmp_uint =
+				    htonl(skb->nf_bridge->physoutdev->ifindex);
+				NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
+					sizeof(tmp_uint), &tmp_uint);
+			}
+		}
+#endif
+	}
+
+	if (skb->nfmark) {
+		tmp_uint = htonl(skb->nfmark);
+		NFA_PUT(inst->skb, NFULA_MARK, sizeof(tmp_uint), &tmp_uint);
+	}
+
+	if (indev && skb->dev && skb->dev->hard_header_parse) {
+		struct nfulnl_msg_packet_hw phw;
+
+		phw.hw_addrlen =
+			skb->dev->hard_header_parse((struct sk_buff *)skb,
+						    phw.hw_addr);
+		phw.hw_addrlen = htons(phw.hw_addrlen);
+		NFA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw);
+	}
+
+	if (skb->tstamp.off_sec) {
+		struct nfulnl_msg_packet_timestamp ts;
+
+		ts.sec = cpu_to_be64(skb->tstamp.off_sec);
+		ts.usec = cpu_to_be64(skb->tstamp.off_usec);
+
+		NFA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts);
+	}
+
+	/* UID */
+	if (skb->sk) {
+		read_lock_bh(&skb->sk->sk_callback_lock);
+		if (skb->sk->sk_socket && skb->sk->sk_socket->file) {
+			u_int32_t uid = htonl(skb->sk->sk_socket->file->f_uid);
+			/* need to unlock here since NFA_PUT may goto */
+			read_unlock_bh(&skb->sk->sk_callback_lock);
+			NFA_PUT(inst->skb, NFULA_UID, sizeof(uid), &uid);
+		} else
+			read_unlock_bh(&skb->sk->sk_callback_lock);
+	}
+
+	if (data_len) {
+		struct nfattr *nfa;
+		int size = NFA_LENGTH(data_len);
+
+		if (skb_tailroom(inst->skb) < (int)NFA_SPACE(data_len)) {
+			printk(KERN_WARNING "nfnetlink_log: no tailroom!\n");
+			goto nlmsg_failure;
+		}
+
+		nfa = (struct nfattr *)skb_put(inst->skb, NFA_ALIGN(size));
+		nfa->nfa_type = NFULA_PAYLOAD;
+		nfa->nfa_len = size;
+
+		if (skb_copy_bits(skb, 0, NFA_DATA(nfa), data_len))
+			BUG();
+	}
+
+	nlh->nlmsg_len = inst->skb->tail - old_tail;
+	/* remember the last message of the batch; __nfulnl_send() uses this
+	 * to mark the end of a multipart batch with NLMSG_DONE */
+	inst->lastnlh = nlh;
+	return 0;
+
+nlmsg_failure:
+	UDEBUG("nlmsg_failure\n");
+nfattr_failure:
+	PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n");
+	return -1;
+}
+
+/* Ack the current message with (err) and return from the enclosing void
+ * function; expects "skb" and "nlh" to be in scope at the point of use.
+ * Not referenced in this part of the file. */
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+/* Log parameters used by nfulnl_log_packet() when the caller passes no
+ * loginfo or one that is not of type NF_LOG_TYPE_ULOG. */
+static struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_ULOG,
+ .u = {
+ .ulog = {
+ .copy_len = 0xffff,
+ .group = 0,
+ .qthreshold = 1,
+ },
+ },
+};
+
+/*
+ * log handler for internal netfilter logging api
+ *
+ * Looks up the instance for the rule's group (falling back to group 0),
+ * makes sure the instance has a buffer with enough room, appends a message
+ * for @skb and (re)arms the flush timer.  The packet is silently not
+ * logged if no instance exists or no buffer can be allocated.
+ *
+ * @li_user may be NULL or of a foreign type, in which case
+ * default_loginfo is used.
+ */
+static void
+nfulnl_log_packet(unsigned int pf,
+		  unsigned int hooknum,
+		  const struct sk_buff *skb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  const struct nf_loginfo *li_user,
+		  const char *prefix)
+{
+	unsigned int size, data_len;
+	struct nfulnl_instance *inst;
+	const struct nf_loginfo *li;
+	unsigned int qthreshold;
+	unsigned int nlbufsiz;
+
+	if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
+		li = li_user;
+	else
+		li = &default_loginfo;
+
+	inst = instance_lookup_get(li->u.ulog.group);
+	if (!inst)
+		inst = instance_lookup_get(0);
+	if (!inst) {
+		PRINTR("nfnetlink_log: trying to log packet, "
+			"but no instance for group %u\n", li->u.ulog.group);
+		return;
+	}
+
+	/* all macros expand to constant values at compile time */
+	/* FIXME: do we want to make the size calculation conditional based on
+	 * what is actually present?  way more branches and checks, but more
+	 * memory efficient... */
+	size =    NLMSG_SPACE(sizeof(struct nfgenmsg))
+		+ NFA_SPACE(sizeof(struct nfulnl_msg_packet_hdr))
+		+ NFA_SPACE(sizeof(u_int32_t))	/* ifindex */
+		+ NFA_SPACE(sizeof(u_int32_t))	/* ifindex */
+#ifdef CONFIG_BRIDGE_NETFILTER
+		+ NFA_SPACE(sizeof(u_int32_t))	/* ifindex */
+		+ NFA_SPACE(sizeof(u_int32_t))	/* ifindex */
+#endif
+		+ NFA_SPACE(sizeof(u_int32_t))	/* mark */
+		+ NFA_SPACE(sizeof(u_int32_t))	/* uid */
+		+ NFA_SPACE(NFULNL_PREFIXLEN)	/* prefix */
+		+ NFA_SPACE(sizeof(struct nfulnl_msg_packet_hw))
+		+ NFA_SPACE(sizeof(struct nfulnl_msg_packet_timestamp));
+
+	UDEBUG("initial size=%u\n", size);
+
+	spin_lock_bh(&inst->lock);
+
+	qthreshold = inst->qthreshold;
+	/* per-rule qthreshold overrides per-instance */
+	if (qthreshold > li->u.ulog.qthreshold)
+		qthreshold = li->u.ulog.qthreshold;
+
+	switch (inst->copy_mode) {
+	case NFULNL_COPY_META:
+	case NFULNL_COPY_NONE:
+		data_len = 0;
+		break;
+
+	case NFULNL_COPY_PACKET:
+		if (inst->copy_range == 0
+		    || inst->copy_range > skb->len)
+			data_len = skb->len;
+		else
+			data_len = inst->copy_range;
+
+		size += NFA_SPACE(data_len);
+		UDEBUG("copy_packet, therefore size now %u\n", size);
+		break;
+
+	default:
+		goto unlock_and_release;
+	}
+
+	if (size > inst->nlbufsiz)
+		nlbufsiz = size;
+	else
+		nlbufsiz = inst->nlbufsiz;
+
+	if (!inst->skb) {
+		if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) {
+			UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n",
+				inst->nlbufsiz, size);
+			goto alloc_failure;
+		}
+	} else if (inst->qlen >= qthreshold ||
+		   size > skb_tailroom(inst->skb)) {
+		/* either the queue len is too high or we don't have
+		 * enough room in the skb left. flush to userspace. */
+		UDEBUG("flushing old skb\n");
+
+		__nfulnl_send(inst);
+
+		if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) {
+			UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n",
+				inst->nlbufsiz, size);
+			goto alloc_failure;
+		}
+	}
+
+	UDEBUG("qlen %d, qthreshold %d\n", inst->qlen, qthreshold);
+	inst->qlen++;
+
+	__build_packet_message(inst, skb, data_len, pf,
+				hooknum, in, out, li, prefix);
+
+	/* timer_pending always called within inst->lock, so there
+	 * is no chance of a race here */
+	if (!timer_pending(&inst->timer)) {
+		/* the armed timer holds its own reference */
+		instance_get(inst);
+		inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100);
+		add_timer(&inst->timer);
+	}
+
+unlock_and_release:
+	spin_unlock_bh(&inst->lock);
+	/* Drop the reference taken by instance_lookup_get().  This has to
+	 * happen on every path, including success, or the use count grows
+	 * with every logged packet and the instance can never be freed. */
+	instance_put(inst);
+	return;
+
+alloc_failure:
+	UDEBUG("error allocating skb\n");
+	/* FIXME: statistics */
+	goto unlock_and_release;
+}
+
+/*
+ * Netlink notifier callback.  When a NETLINK_NETFILTER socket with a
+ * nonzero pid is released, destroy every instance owned by that pid.
+ * Always returns NOTIFY_DONE.
+ */
+static int
+nfulnl_rcv_nl_event(struct notifier_block *this,
+		      unsigned long event, void *ptr)
+{
+	struct netlink_notify *n = ptr;
+	int bucket;
+
+	if (event != NETLINK_URELEASE ||
+	    n->protocol != NETLINK_NETFILTER || !n->pid)
+		return NOTIFY_DONE;
+
+	/* destroy all instances for this pid */
+	write_lock_bh(&instances_lock);
+	for (bucket = 0; bucket < INSTANCE_BUCKETS; bucket++) {
+		struct hlist_head *head = &instance_table[bucket];
+		struct nfulnl_instance *inst;
+		struct hlist_node *pos, *next;
+
+		/* _safe variant: entries are unlinked while we iterate */
+		hlist_for_each_entry_safe(inst, pos, next, head, hlist) {
+			UDEBUG("node = %p\n", inst);
+			if (n->pid == inst->peer_pid)
+				__instance_destroy(inst);
+		}
+	}
+	write_unlock_bh(&instances_lock);
+
+	return NOTIFY_DONE;
+}
+
+/* netlink notifier block; registered and unregistered in init_or_cleanup() */
+static struct notifier_block nfulnl_rtnl_notifier = {
+ .notifier_call = nfulnl_rcv_nl_event,
+};
+
+/* Handler for message types userspace must not send: always -ENOTSUPP. */
+static int nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
+			      struct nlmsghdr *nlh, struct nfattr *nfqa[],
+			      int *errp)
+{
+	return -ENOTSUPP;
+}
+
+/* logger descriptor registered per protocol family via nf_log_register() */
+static struct nf_logger nfulnl_logger = {
+ .name = "nfnetlink_log",
+ .logfn = &nfulnl_log_packet,
+ .me = THIS_MODULE,
+};
+
+/* Per-attribute sizes for NFULNL_MSG_PACKET attributes, indexed by
+ * attribute type - 1.  Not referenced in this part of the file. */
+static const int nfula_min[NFULA_MAX] = {
+ [NFULA_PACKET_HDR-1] = sizeof(struct nfulnl_msg_packet_hdr),
+ [NFULA_MARK-1] = sizeof(u_int32_t),
+ [NFULA_TIMESTAMP-1] = sizeof(struct nfulnl_msg_packet_timestamp),
+ [NFULA_IFINDEX_INDEV-1] = sizeof(u_int32_t),
+ [NFULA_IFINDEX_OUTDEV-1]= sizeof(u_int32_t),
+ [NFULA_HWADDR-1] = sizeof(struct nfulnl_msg_packet_hw),
+ [NFULA_PAYLOAD-1] = 0,
+ [NFULA_PREFIX-1] = 0,
+ [NFULA_UID-1] = sizeof(u_int32_t),
+};
+
+/* Per-attribute sizes for NFULNL_MSG_CONFIG attributes, indexed by
+ * attribute type - 1; handed to nfattr_bad_size() in nfulnl_recv_config(). */
+static const int nfula_cfg_min[NFULA_CFG_MAX] = {
+ [NFULA_CFG_CMD-1] = sizeof(struct nfulnl_msg_config_cmd),
+ [NFULA_CFG_MODE-1] = sizeof(struct nfulnl_msg_config_mode),
+ [NFULA_CFG_TIMEOUT-1] = sizeof(u_int32_t),
+ [NFULA_CFG_QTHRESH-1] = sizeof(u_int32_t),
+ [NFULA_CFG_NLBUFSIZ-1] = sizeof(u_int32_t),
+};
+
+/*
+ * Handle an NFULNL_MSG_CONFIG message from userspace.
+ *
+ * An optional NFULA_CFG_CMD attribute binds/unbinds an instance for the
+ * group in nfmsg->res_id, or (un)registers this module as logger for the
+ * message's protocol family.  The remaining attributes (mode, timeout,
+ * buffer size, queue threshold) configure the instance of that group; only
+ * the peer that owns the instance may do so.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int
+nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
+		   struct nlmsghdr *nlh, struct nfattr *nfula[], int *errp)
+{
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	u_int16_t group_num = ntohs(nfmsg->res_id);
+	struct nfulnl_instance *inst;
+	int ret = 0;
+
+	UDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type));
+
+	if (nfattr_bad_size(nfula, NFULA_CFG_MAX, nfula_cfg_min)) {
+		UDEBUG("bad attribute size\n");
+		return -EINVAL;
+	}
+
+	inst = instance_lookup_get(group_num);
+	if (nfula[NFULA_CFG_CMD-1]) {
+		u_int8_t pf = nfmsg->nfgen_family;
+		struct nfulnl_msg_config_cmd *cmd;
+		cmd = NFA_DATA(nfula[NFULA_CFG_CMD-1]);
+		UDEBUG("found CFG_CMD for\n");
+
+		switch (cmd->command) {
+		case NFULNL_CFG_CMD_BIND:
+			if (inst) {
+				ret = -EBUSY;
+				goto out_put;
+			}
+
+			inst = instance_create(group_num,
+					       NETLINK_CB(skb).pid);
+			if (!inst) {
+				ret = -EINVAL;
+				goto out_put;
+			}
+			break;
+		case NFULNL_CFG_CMD_UNBIND:
+			if (!inst) {
+				ret = -ENODEV;
+				goto out_put;
+			}
+
+			if (inst->peer_pid != NETLINK_CB(skb).pid) {
+				ret = -EPERM;
+				goto out_put;
+			}
+
+			instance_destroy(inst);
+			break;
+		case NFULNL_CFG_CMD_PF_BIND:
+			UDEBUG("registering log handler for pf=%u\n", pf);
+			ret = nf_log_register(pf, &nfulnl_logger);
+			break;
+		case NFULNL_CFG_CMD_PF_UNBIND:
+			UDEBUG("unregistering log handler for pf=%u\n", pf);
+			/* This is a bug and a feature.  We cannot unregister
+			 * other handlers, like nfnetlink_inst can */
+			nf_log_unregister_pf(pf);
+			break;
+		default:
+			ret = -EINVAL;
+			break;
+		}
+	} else {
+		if (!inst) {
+			UDEBUG("no config command, and no instance for "
+				"group=%u pid=%u =>ENOENT\n",
+				group_num, NETLINK_CB(skb).pid);
+			ret = -ENOENT;
+			goto out_put;
+		}
+
+		if (inst->peer_pid != NETLINK_CB(skb).pid) {
+			UDEBUG("no config command, and wrong pid\n");
+			ret = -EPERM;
+			goto out_put;
+		}
+	}
+
+	/* PF_BIND, PF_UNBIND and unknown commands get here without an
+	 * instance.  The per-instance attributes below cannot be applied
+	 * then; report that instead of dereferencing a NULL instance, but
+	 * keep an error the command itself may already have produced. */
+	if (!inst) {
+		if (!ret && (nfula[NFULA_CFG_MODE-1] ||
+			     nfula[NFULA_CFG_TIMEOUT-1] ||
+			     nfula[NFULA_CFG_NLBUFSIZ-1] ||
+			     nfula[NFULA_CFG_QTHRESH-1]))
+			ret = -ENODEV;
+		goto out_put;
+	}
+
+	if (nfula[NFULA_CFG_MODE-1]) {
+		struct nfulnl_msg_config_mode *params;
+		params = NFA_DATA(nfula[NFULA_CFG_MODE-1]);
+
+		/* NOTE(review): nfulnl_set_mode() takes an unsigned int
+		 * range; if copy_range is a 32bit field in struct
+		 * nfulnl_msg_config_mode this should be ntohl() -- confirm
+		 * against the header. */
+		nfulnl_set_mode(inst, params->copy_mode,
+				ntohs(params->copy_range));
+	}
+
+	if (nfula[NFULA_CFG_TIMEOUT-1]) {
+		u_int32_t timeout =
+			*(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_TIMEOUT-1]);
+
+		nfulnl_set_timeout(inst, ntohl(timeout));
+	}
+
+	if (nfula[NFULA_CFG_NLBUFSIZ-1]) {
+		u_int32_t nlbufsiz =
+			*(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_NLBUFSIZ-1]);
+
+		nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz));
+	}
+
+	if (nfula[NFULA_CFG_QTHRESH-1]) {
+		/* the attribute is a 32bit big-endian value (see
+		 * nfula_cfg_min); reading only 16 bits of it and then
+		 * applying ntohl() yields a wrong threshold */
+		u_int32_t qthresh =
+			*(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_QTHRESH-1]);
+
+		nfulnl_set_qthresh(inst, ntohl(qthresh));
+	}
+
+out_put:
+	/* instance_put() accepts NULL */
+	instance_put(inst);
+	return ret;
+}
+
+/* message handlers of this subsystem, indexed by NFULNL_MSG_* type; both
+ * require CAP_NET_ADMIN */
+static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
+ [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp,
+ .attr_count = NFULA_MAX,
+ .cap_required = CAP_NET_ADMIN, },
+ [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config,
+ .attr_count = NFULA_CFG_MAX,
+ .cap_required = CAP_NET_ADMIN },
+};
+
+/* subsystem descriptor handed to nfnetlink_subsys_register() */
+static struct nfnetlink_subsystem nfulnl_subsys = {
+ .name = "log",
+ .subsys_id = NFNL_SUBSYS_ULOG,
+ .cb_count = NFULNL_MSG_MAX,
+ .cb = nfulnl_cb,
+};
+
+#ifdef CONFIG_PROC_FS
+/* per-open iteration state of the /proc seq_file (seq->private) */
+struct iter_state {
+ unsigned int bucket; /* instance_table bucket currently being walked */
+};
+
+/*
+ * Position the iterator on the first non-empty bucket and return its first
+ * node, or NULL if there is no iterator state or all buckets are empty.
+ */
+static struct hlist_node *get_first(struct seq_file *seq)
+{
+	struct iter_state *st = seq->private;
+
+	if (!st)
+		return NULL;
+
+	st->bucket = 0;
+	while (st->bucket < INSTANCE_BUCKETS) {
+		struct hlist_head *head = &instance_table[st->bucket];
+
+		if (!hlist_empty(head))
+			return head->first;
+		st->bucket++;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the node following @h, moving on to the next non-empty bucket
+ * when the current chain ends.  Returns NULL after the last bucket.
+ */
+static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
+{
+	struct iter_state *st = seq->private;
+	struct hlist_node *next;
+
+	for (next = h->next; next == NULL;
+	     next = instance_table[st->bucket].first) {
+		st->bucket++;
+		if (st->bucket >= INSTANCE_BUCKETS)
+			return NULL;
+	}
+
+	return next;
+}
+
+/* Return the @pos'th node (0-based) of the whole table, or NULL if the
+ * table holds fewer entries. */
+static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct hlist_node *node = get_first(seq);
+
+	if (node != NULL) {
+		while (pos != 0) {
+			node = get_next(seq, node);
+			if (node == NULL)
+				break;
+			pos--;
+		}
+	}
+
+	if (pos != 0)
+		return NULL;
+
+	return node;
+}
+
+/* seq_file ->start: take instances_lock for reading (released again in
+ * seq_stop()) and seek to entry *pos. */
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct hlist_node *node;
+
+	read_lock_bh(&instances_lock);
+	node = get_idx(seq, *pos);
+
+	return node;
+}
+
+/* seq_file ->next: advance the position and return the following entry. */
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	*pos += 1;
+
+	return get_next(s, v);
+}
+
+/* seq_file ->stop: drop the read lock taken in seq_start(). */
+static void seq_stop(struct seq_file *s, void *v)
+{
+	read_unlock_bh(&instances_lock);
+}
+
+/*
+ * seq_file ->show: print one line per instance with the columns
+ * group_num, peer_pid, qlen, copy_mode, copy_range, flushtimeout, use.
+ */
+static int seq_show(struct seq_file *s, void *v)
+{
+	const struct nfulnl_instance *inst = v;
+
+	return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n",
+			  inst->group_num,
+			  inst->peer_pid,
+			  inst->qlen,
+			  inst->copy_mode,
+			  inst->copy_range,
+			  inst->flushtimeout,
+			  atomic_read(&inst->use));
+}
+
+/* seq_file iterator over all instances; opened by nful_open() */
+static struct seq_operations nful_seq_ops = {
+ .start = seq_start,
+ .next = seq_next,
+ .stop = seq_stop,
+ .show = seq_show,
+};
+
+/*
+ * open() handler of the /proc file: allocate a zeroed iterator state and
+ * attach it to the seq_file as ->private.  Returns the result of
+ * seq_open(), or -ENOMEM if the state cannot be allocated.
+ */
+static int nful_open(struct inode *inode, struct file *file)
+{
+	struct iter_state *state;
+	struct seq_file *seq;
+	int ret;
+
+	state = kmalloc(sizeof(*state), GFP_KERNEL);
+	if (!state)
+		return -ENOMEM;
+	memset(state, 0, sizeof(*state));
+
+	ret = seq_open(file, &nful_seq_ops);
+	if (ret < 0) {
+		kfree(state);
+		return ret;
+	}
+
+	seq = file->private_data;
+	seq->private = state;
+
+	return ret;
+}
+
+/* file operations of the "nfnetlink_log" /proc entry; seq_release_private
+ * is the release hook paired with the state allocated in nful_open() */
+static struct file_operations nful_file_ops = {
+ .owner = THIS_MODULE,
+ .open = nful_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+#endif /* PROC_FS */
+
+/*
+ * Combined setup/teardown routine.
+ *
+ * @init != 0: initialise the hash table, register the netlink notifier and
+ *	       the nfnetlink subsystem and create the /proc entry.  Returns
+ *	       0 on success; on failure everything set up so far is undone
+ *	       and a negative errno is returned.
+ * @init == 0: undo all of the above; returns 0.
+ */
+static int
+init_or_cleanup(int init)
+{
+	int i, status = -ENOMEM;
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *proc_nful;
+#endif
+
+	if (!init) {
+		/* regular teardown is not an error */
+		status = 0;
+		goto cleanup;
+	}
+
+	for (i = 0; i < INSTANCE_BUCKETS; i++)
+		INIT_HLIST_HEAD(&instance_table[i]);
+
+	/* it's not really all that important to have a random value, so
+	 * we can do this from the init function, even if there hasn't
+	 * been that much entropy yet */
+	get_random_bytes(&hash_init, sizeof(hash_init));
+
+	netlink_register_notifier(&nfulnl_rtnl_notifier);
+	status = nfnetlink_subsys_register(&nfulnl_subsys);
+	if (status < 0) {
+		printk(KERN_ERR "log: failed to create netlink socket\n");
+		goto cleanup_netlink_notifier;
+	}
+
+#ifdef CONFIG_PROC_FS
+	proc_nful = create_proc_entry("nfnetlink_log", 0440,
+				      proc_net_netfilter);
+	if (!proc_nful) {
+		/* status still holds the 0 from the successful subsystem
+		 * registration; without this the module would appear to
+		 * load fine although everything has been torn down again */
+		status = -ENOMEM;
+		goto cleanup_subsys;
+	}
+	proc_nful->proc_fops = &nful_file_ops;
+#endif
+
+	return status;
+
+cleanup:
+	nf_log_unregister_logger(&nfulnl_logger);
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("nfnetlink_log", proc_net_netfilter);
+cleanup_subsys:
+#endif
+	nfnetlink_subsys_unregister(&nfulnl_subsys);
+cleanup_netlink_notifier:
+	netlink_unregister_notifier(&nfulnl_rtnl_notifier);
+	return status;
+}
+
+/* Module load: run the setup half of init_or_cleanup(). */
+static int __init init(void)
+{
+	const int do_init = 1;
+
+	return init_or_cleanup(do_init);
+}
+
+/* Module unload: run the teardown half of init_or_cleanup(). */
+static void __exit fini(void)
+{
+	const int do_init = 0;
+
+	init_or_cleanup(do_init);
+}
+
+/* module metadata */
+MODULE_DESCRIPTION("netfilter userspace logging");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG);
+
+/* module entry and exit points */
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
new file mode 100644
index 00000000000..eaa44c49567
--- /dev/null
+++ b/net/netfilter/nfnetlink_queue.c
@@ -0,0 +1,1127 @@
+/*
+ * This is a module which is used for queueing packets and communicating with
+ * userspace via nfnetlink.
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * Based on the old ipv4-only ip_queue.c:
+ * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
+ * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/proc_fs.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_queue.h>
+#include <linux/list.h>
+#include <net/sock.h>
+
+#include <asm/atomic.h>
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+#include "../bridge/br_private.h"
+#endif
+
+/* initial queue_maxlen of a newly created instance */
+#define NFQNL_QMAX_DEFAULT 1024
+
+/* Debug tracing; change the "#if 0" to "#if 1" to compile it in. */
+#if 0
+#define QDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \
+ __FILE__, __LINE__, __FUNCTION__, \
+ ## args)
+#else
+#define QDEBUG(x, ...)
+#endif
+
+/* One packet waiting for a verdict.  "list" has to stay the first member:
+ * __find_entry() casts a struct list_head pointer to this type. */
+struct nfqnl_queue_entry {
+ struct list_head list; /* link in nfqnl_instance.queue_list */
+ struct nf_info *info; /* handed back to nf_reinject() */
+ struct sk_buff *skb; /* the queued packet */
+ unsigned int id;
+};
+
+/* One queue instance per bound queue number. */
+struct nfqnl_instance {
+ struct hlist_node hlist; /* global list of queues */
+ atomic_t use; /* use count, see instance_put() */
+
+ int peer_pid; /* pid passed to instance_create() */
+ unsigned int queue_maxlen;
+ unsigned int copy_range;
+ unsigned int queue_total; /* entries currently on queue_list */
+ unsigned int queue_dropped;
+ unsigned int queue_user_dropped;
+
+ atomic_t id_sequence; /* 'sequence' of pkt ids */
+
+ u_int16_t queue_num; /* number of this queue */
+ u_int8_t copy_mode; /* one of NFQNL_COPY_* */
+
+ spinlock_t lock; /* held around queue_list manipulation */
+
+ struct list_head queue_list; /* packets in queue */
+};
+
+/* match callback for __find_entry(): nonzero means "this is the entry" */
+typedef int (*nfqnl_cmpfn)(struct nfqnl_queue_entry *, unsigned long);
+
+/* protects instance_table; always taken with the _bh variants */
+static DEFINE_RWLOCK(instances_lock);
+
+#define INSTANCE_BUCKETS 16
+/* hash table of all instances, indexed by instance_hashfn(queue_num) */
+static struct hlist_head instance_table[INSTANCE_BUCKETS];
+
+/* Map a queue number to its bucket: OR the high byte into the value, then
+ * reduce modulo the table size. */
+static inline u_int8_t instance_hashfn(u_int16_t queue_num)
+{
+	const unsigned int folded = (queue_num >> 8) | queue_num;
+
+	return folded % INSTANCE_BUCKETS;
+}
+
+/*
+ * Find the instance bound to @queue_num, or NULL if there is none.
+ * Does not take instances_lock and does not take a reference; the callers
+ * in this file hold instances_lock around it.
+ */
+static struct nfqnl_instance *
+__instance_lookup(u_int16_t queue_num)
+{
+	struct hlist_head *bucket = &instance_table[instance_hashfn(queue_num)];
+	struct nfqnl_instance *cur;
+	struct hlist_node *node;
+
+	hlist_for_each_entry(cur, node, bucket, hlist)
+		if (cur->queue_num == queue_num)
+			return cur;
+
+	return NULL;
+}
+
+/*
+ * Look up the instance for @queue_num under instances_lock and return it
+ * with its use count raised, or NULL if no such instance exists.  The
+ * caller must drop the reference with instance_put().
+ */
+static struct nfqnl_instance *
+instance_lookup_get(u_int16_t queue_num)
+{
+	struct nfqnl_instance *found;
+
+	read_lock_bh(&instances_lock);
+	found = __instance_lookup(queue_num);
+	if (found != NULL)
+		atomic_inc(&found->use);
+	read_unlock_bh(&instances_lock);
+
+	return found;
+}
+
+/*
+ * Drop one reference on @inst and free it once the count reaches zero.
+ * A NULL @inst is accepted and ignored.
+ */
+static void
+instance_put(struct nfqnl_instance *inst)
+{
+	if (!inst)
+		return;
+
+	if (!atomic_dec_and_test(&inst->use))
+		return;
+
+	QDEBUG("kfree(inst=%p)\n", inst);
+	kfree(inst);
+}
+
+/*
+ * Allocate a new queue instance for @queue_num owned by netlink peer @pid
+ * and insert it into instance_table.
+ *
+ * Returns the new instance with a use count of 2 (the caller is expected
+ * to instance_put() it once), or NULL if the queue number is already
+ * bound, the allocation fails, or no module reference can be taken.
+ */
+static struct nfqnl_instance *
+instance_create(u_int16_t queue_num, int pid)
+{
+	struct nfqnl_instance *inst = NULL;
+
+	QDEBUG("entering for queue_num=%u, pid=%d\n", queue_num, pid);
+
+	write_lock_bh(&instances_lock);
+
+	if (__instance_lookup(queue_num) != NULL) {
+		QDEBUG("aborting, instance already exists\n");
+		goto out_unlock;
+	}
+
+	/* GFP_ATOMIC: we hold instances_lock with BHs disabled */
+	inst = kmalloc(sizeof(*inst), GFP_ATOMIC);
+	if (inst == NULL)
+		goto out_unlock;
+
+	memset(inst, 0, sizeof(*inst));
+
+	inst->lock = SPIN_LOCK_UNLOCKED;
+	INIT_LIST_HEAD(&inst->queue_list);
+	atomic_set(&inst->id_sequence, 0);
+	/* needs to be two, since we _put() after creation */
+	atomic_set(&inst->use, 2);
+
+	inst->peer_pid = pid;
+	inst->queue_num = queue_num;
+	inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
+	inst->copy_mode = NFQNL_COPY_NONE;
+	/* NOTE(review): 0xfffff exceeds the 0xffff cap that
+	 * __nfqnl_set_mode() enforces -- possibly a typo; value kept. */
+	inst->copy_range = 0xfffff;
+
+	/* every instance pins the module until it is destroyed */
+	if (!try_module_get(THIS_MODULE)) {
+		kfree(inst);
+		inst = NULL;
+		goto out_unlock;
+	}
+
+	hlist_add_head(&inst->hlist,
+		       &instance_table[instance_hashfn(queue_num)]);
+
+	write_unlock_bh(&instances_lock);
+
+	QDEBUG("successfully created new instance\n");
+
+	return inst;
+
+out_unlock:
+	write_unlock_bh(&instances_lock);
+	return NULL;
+}
+
+static void nfqnl_flush(struct nfqnl_instance *queue, int verdict);
+
+/*
+ * Tear down @inst: unlink it from instance_table, drop all queued packets,
+ * release one reference and the module reference taken in
+ * instance_create().
+ *
+ * @lock: nonzero if instances_lock has to be taken here around the unlink.
+ */
+static void
+_instance_destroy2(struct nfqnl_instance *inst, int lock)
+{
+	/* step 1: remove it from the global hash table */
+	if (lock) {
+		write_lock_bh(&instances_lock);
+		QDEBUG("removing instance %p (queuenum=%u) from hash\n",
+			inst, inst->queue_num);
+		hlist_del(&inst->hlist);
+		write_unlock_bh(&instances_lock);
+	} else {
+		QDEBUG("removing instance %p (queuenum=%u) from hash\n",
+			inst, inst->queue_num);
+		hlist_del(&inst->hlist);
+	}
+
+	/* step 2: every skb still queued gets an NF_DROP verdict */
+	nfqnl_flush(inst, NF_DROP);
+
+	/* step 3: give up our reference and the module reference */
+	instance_put(inst);
+	module_put(THIS_MODULE);
+}
+
+/* Destroy @inst without taking instances_lock here. */
+static inline void __instance_destroy(struct nfqnl_instance *inst)
+{
+	const int take_lock = 0;
+
+	_instance_destroy2(inst, take_lock);
+}
+
+/* Destroy @inst, taking instances_lock around the hash table removal. */
+static inline void instance_destroy(struct nfqnl_instance *inst)
+{
+	const int take_lock = 1;
+
+	_instance_destroy2(inst, take_lock);
+}
+
+
+
+/*
+ * Reinject the packet of @entry with @verdict and free the entry.
+ */
+static void
+issue_verdict(struct nfqnl_queue_entry *entry, int verdict)
+{
+	QDEBUG("entering for entry %p, verdict %u\n", entry, verdict);
+
+	/* The TCP input path (and probably other code) expects to run in
+	 * softirq context, whereas we get here from a syscall.  It can
+	 * deadlock against locks taken from e.g. the timer softirq, so
+	 * softirq context is emulated by disabling bottom halves around
+	 * the reinjection. */
+	local_bh_disable();
+	nf_reinject(entry->skb, entry->info, verdict);
+	local_bh_enable();
+
+	kfree(entry);
+}
+
+/* Add @entry at the head of @queue's list and account for it.  Does no
+ * locking itself. */
+static inline void
+__enqueue_entry(struct nfqnl_instance *queue,
+		struct nfqnl_queue_entry *entry)
+{
+	queue->queue_total++;
+	list_add(&entry->list, &queue->queue_list);
+}
+
+/*
+ * Walk the queue from its tail and return the first entry accepted by
+ * cmpfn(entry, data).  With a NULL cmpfn the last entry of the list is
+ * returned.  Returns NULL if nothing matches or the queue is empty.
+ */
+static inline struct nfqnl_queue_entry *
+__find_entry(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn,
+	     unsigned long data)
+{
+	struct list_head *pos;
+
+	list_for_each_prev(pos, &queue->queue_list) {
+		/* "list" is the first member, so this equals a plain cast */
+		struct nfqnl_queue_entry *candidate =
+			list_entry(pos, struct nfqnl_queue_entry, list);
+
+		if (cmpfn == NULL)
+			return candidate;
+		if (cmpfn(candidate, data))
+			return candidate;
+	}
+
+	return NULL;
+}
+
+/* Unlink @entry from @q's list and account for it.  Does no locking
+ * itself. */
+static inline void
+__dequeue_entry(struct nfqnl_instance *q, struct nfqnl_queue_entry *entry)
+{
+	q->queue_total--;
+	list_del(&entry->list);
+}
+
+/*
+ * Look up an entry with __find_entry() and, if one is found, unlink it
+ * from the queue.  Returns the entry or NULL.
+ */
+static inline struct nfqnl_queue_entry *
+__find_dequeue_entry(struct nfqnl_instance *queue,
+		     nfqnl_cmpfn cmpfn, unsigned long data)
+{
+	struct nfqnl_queue_entry *found = __find_entry(queue, cmpfn, data);
+
+	if (found != NULL)
+		__dequeue_entry(queue, found);
+
+	return found;
+}
+
+
+/*
+ * Dequeue every entry of @queue and issue @verdict for each of them.
+ * Performs no locking of its own.
+ */
+static inline void
+__nfqnl_flush(struct nfqnl_instance *queue, int verdict)
+{
+	for (;;) {
+		struct nfqnl_queue_entry *next;
+
+		next = __find_dequeue_entry(queue, NULL, 0);
+		if (!next)
+			break;
+
+		issue_verdict(next, verdict);
+	}
+}
+
+/*
+ * Set the copy mode and copy range of @queue.
+ *
+ * NFQNL_COPY_NONE / NFQNL_COPY_META force the range to 0;
+ * NFQNL_COPY_PACKET stores @range clamped to 0xffff.  Any other mode
+ * leaves the queue untouched and returns -EINVAL.  Returns 0 on success.
+ */
+static inline int
+__nfqnl_set_mode(struct nfqnl_instance *queue,
+		 unsigned char mode, unsigned int range)
+{
+	switch (mode) {
+	case NFQNL_COPY_PACKET:
+		queue->copy_mode = mode;
+		/* struct nfattr carries a 16bit nfa_len, so cap the range */
+		queue->copy_range = (range > 0xffff) ? 0xffff : range;
+		return 0;
+
+	case NFQNL_COPY_NONE:
+	case NFQNL_COPY_META:
+		queue->copy_mode = mode;
+		queue->copy_range = 0;
+		return 0;
+
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Locked variant of __find_dequeue_entry(): the lookup and unlink run
+ * under queue->lock with bottom halves disabled.
+ */
+static struct nfqnl_queue_entry *
+find_dequeue_entry(struct nfqnl_instance *queue,
+		   nfqnl_cmpfn cmpfn, unsigned long data)
+{
+	struct nfqnl_queue_entry *found;
+
+	spin_lock_bh(&queue->lock);
+	found = __find_dequeue_entry(queue, cmpfn, data);
+	spin_unlock_bh(&queue->lock);
+
+	return found;
+}
+
+/*
+ * Empty @queue under queue->lock, issuing @verdict for every entry
+ * that was still pending.
+ */
+static void
+nfqnl_flush(struct nfqnl_instance *queue, int verdict)
+{
+	struct nfqnl_queue_entry *pending;
+
+	spin_lock_bh(&queue->lock);
+	while ((pending = __find_dequeue_entry(queue, NULL, 0)) != NULL)
+		issue_verdict(pending, verdict);
+	spin_unlock_bh(&queue->lock);
+}
+
+/*
+ * Build the NFQNL_MSG_PACKET netlink message that describes @entry to
+ * the userspace listener of @queue.
+ *
+ * The message carries the packet header attribute and, when available,
+ * input/output (and bridge physical) ifindexes, nfmark, hardware
+ * address, timestamp and - in NFQNL_COPY_PACKET mode - up to
+ * queue->copy_range bytes of payload.
+ *
+ * Returns the new skb, or NULL on failure with *errp holding the error.
+ */
+static struct sk_buff *
+nfqnl_build_packet_message(struct nfqnl_instance *queue,
+			   struct nfqnl_queue_entry *entry, int *errp)
+{
+	unsigned char *old_tail;
+	size_t size;
+	size_t data_len = 0;
+	struct sk_buff *skb;
+	struct nfqnl_msg_packet_hdr pmsg;
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	unsigned int tmp_uint;
+
+	QDEBUG("entered\n");
+
+	/* all macros expand to constant values at compile time */
+	size = NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hdr))
+		+ NLMSG_SPACE(sizeof(u_int32_t))	/* indev ifindex */
+		+ NLMSG_SPACE(sizeof(u_int32_t))	/* outdev ifindex */
+#ifdef CONFIG_BRIDGE_NETFILTER
+		+ NLMSG_SPACE(sizeof(u_int32_t))	/* physindev ifindex */
+		+ NLMSG_SPACE(sizeof(u_int32_t))	/* physoutdev ifindex */
+#endif
+		+ NLMSG_SPACE(sizeof(u_int32_t))	/* mark */
+		+ NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hw))
+		+ NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_timestamp));
+
+	/* copy_mode/copy_range are sampled under the queue lock */
+	spin_lock_bh(&queue->lock);
+
+	switch (queue->copy_mode) {
+	case NFQNL_COPY_META:
+	case NFQNL_COPY_NONE:
+		data_len = 0;
+		break;
+
+	case NFQNL_COPY_PACKET:
+		/* userspace must see a complete checksum in the payload */
+		if (entry->skb->ip_summed == CHECKSUM_HW &&
+		    (*errp = skb_checksum_help(entry->skb,
+					       entry->info->outdev == NULL))) {
+			spin_unlock_bh(&queue->lock);
+			return NULL;
+		}
+		/* copy_range 0 means "whole packet" */
+		if (queue->copy_range == 0
+		    || queue->copy_range > entry->skb->len)
+			data_len = entry->skb->len;
+		else
+			data_len = queue->copy_range;
+
+		size += NLMSG_SPACE(data_len);
+		break;
+
+	default:
+		*errp = -EINVAL;
+		spin_unlock_bh(&queue->lock);
+		return NULL;
+	}
+
+	spin_unlock_bh(&queue->lock);
+
+	skb = alloc_skb(size, GFP_ATOMIC);
+	if (!skb)
+		goto nlmsg_failure;
+
+	old_tail = skb->tail;
+	nlh = NLMSG_PUT(skb, 0, 0,
+			NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET,
+			sizeof(struct nfgenmsg));
+	nfmsg = NLMSG_DATA(nlh);
+	nfmsg->nfgen_family = entry->info->pf;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = htons(queue->queue_num);
+
+	pmsg.packet_id		= htonl(entry->id);
+	/* skb->protocol is already in network byte order; running it
+	 * through htons() again would byte-swap it on little endian */
+	pmsg.hw_protocol	= entry->skb->protocol;
+	pmsg.hook		= entry->info->hook;
+
+	NFA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg);
+
+	if (entry->info->indev) {
+		tmp_uint = htonl(entry->info->indev->ifindex);
+#ifndef CONFIG_BRIDGE_NETFILTER
+		NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint);
+#else
+		if (entry->info->pf == PF_BRIDGE) {
+			/* Case 1: indev is physical input device, we need to
+			 * look for bridge group (when called from
+			 * netfilter_bridge) */
+			NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, sizeof(tmp_uint),
+				&tmp_uint);
+			/* this is the bridge group "brX" */
+			tmp_uint = htonl(entry->info->indev->br_port->br->dev->ifindex);
+			NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint),
+				&tmp_uint);
+		} else {
+			/* Case 2: indev is bridge group, we need to look for
+			 * physical device (when called from ipv4) */
+			NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint),
+				&tmp_uint);
+			if (entry->skb->nf_bridge
+			    && entry->skb->nf_bridge->physindev) {
+				tmp_uint = htonl(entry->skb->nf_bridge->physindev->ifindex);
+				NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV,
+					sizeof(tmp_uint), &tmp_uint);
+			}
+		}
+#endif
+	}
+
+	if (entry->info->outdev) {
+		tmp_uint = htonl(entry->info->outdev->ifindex);
+#ifndef CONFIG_BRIDGE_NETFILTER
+		NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint);
+#else
+		if (entry->info->pf == PF_BRIDGE) {
+			/* Case 1: outdev is physical output device, we need to
+			 * look for bridge group (when called from
+			 * netfilter_bridge) */
+			NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, sizeof(tmp_uint),
+				&tmp_uint);
+			/* this is the bridge group "brX" */
+			tmp_uint = htonl(entry->info->outdev->br_port->br->dev->ifindex);
+			NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint),
+				&tmp_uint);
+		} else {
+			/* Case 2: outdev is bridge group, we need to look for
+			 * physical output device (when called from ipv4) */
+			NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint),
+				&tmp_uint);
+			if (entry->skb->nf_bridge
+			    && entry->skb->nf_bridge->physoutdev) {
+				tmp_uint = htonl(entry->skb->nf_bridge->physoutdev->ifindex);
+				NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV,
+					sizeof(tmp_uint), &tmp_uint);
+			}
+		}
+#endif
+	}
+
+	if (entry->skb->nfmark) {
+		tmp_uint = htonl(entry->skb->nfmark);
+		NFA_PUT(skb, NFQA_MARK, sizeof(u_int32_t), &tmp_uint);
+	}
+
+	if (entry->info->indev && entry->skb->dev
+	    && entry->skb->dev->hard_header_parse) {
+		struct nfqnl_msg_packet_hw phw;
+		int hw_len;
+
+		/* the whole struct is copied to userspace below, so do not
+		 * leak stack bytes through padding or through the part of
+		 * hw_addr that hard_header_parse() does not fill in */
+		memset(&phw, 0, sizeof(phw));
+
+		hw_len = entry->skb->dev->hard_header_parse(entry->skb,
+							    phw.hw_addr);
+		phw.hw_addrlen = htons(hw_len);
+		NFA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw);
+	}
+
+	if (entry->skb->tstamp.off_sec) {
+		struct nfqnl_msg_packet_timestamp ts;
+
+		ts.sec = cpu_to_be64(entry->skb->tstamp.off_sec);
+		ts.usec = cpu_to_be64(entry->skb->tstamp.off_usec);
+
+		NFA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts);
+	}
+
+	if (data_len) {
+		struct nfattr *nfa;
+		/* was named 'size', shadowing the allocation size above */
+		int nfa_len = NFA_LENGTH(data_len);
+
+		if (skb_tailroom(skb) < (int)NFA_SPACE(data_len)) {
+			printk(KERN_WARNING "nf_queue: no tailroom!\n");
+			goto nlmsg_failure;
+		}
+
+		/* payload attribute is filled in by hand so the packet
+		 * bytes can be copied straight into the message */
+		nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(nfa_len));
+		nfa->nfa_type = NFQA_PAYLOAD;
+		nfa->nfa_len = nfa_len;
+
+		if (skb_copy_bits(entry->skb, 0, NFA_DATA(nfa), data_len))
+			BUG();
+	}
+
+	nlh->nlmsg_len = skb->tail - old_tail;
+	return skb;
+
+	/* NLMSG_PUT() and NFA_PUT() jump to these labels on failure */
+nlmsg_failure:
+nfattr_failure:
+	if (skb)
+		kfree_skb(skb);
+	*errp = -EINVAL;
+	if (net_ratelimit())
+		printk(KERN_ERR "nf_queue: error creating packet message\n");
+	return NULL;
+}
+
+/*
+ * Queue handler output function: hand @skb to the userspace listener
+ * bound to queue @queuenum.
+ *
+ * A queue entry is allocated, the netlink message is built and unicast
+ * to the peer, and only then is the entry linked into the queue.
+ * Returns the (non-negative) nfnetlink_unicast() result on success or a
+ * negative errno; on error the entry is freed and nothing is queued.
+ */
+static int
+nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
+		     unsigned int queuenum, void *data)
+{
+	int status = -EINVAL;
+	struct sk_buff *nskb;
+	struct nfqnl_instance *queue;
+	struct nfqnl_queue_entry *entry;
+
+	QDEBUG("entered\n");
+
+	queue = instance_lookup_get(queuenum);
+	if (!queue) {
+		QDEBUG("no queue instance matching\n");
+		return -EINVAL;
+	}
+
+	if (queue->copy_mode == NFQNL_COPY_NONE) {
+		QDEBUG("mode COPY_NONE, aborting\n");
+		status = -EAGAIN;
+		goto err_out_put;
+	}
+
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL) {
+		if (net_ratelimit())
+			printk(KERN_ERR
+				"nf_queue: OOM in nfqnl_enqueue_packet()\n");
+		status = -ENOMEM;
+		goto err_out_put;
+	}
+
+	entry->info = info;
+	entry->skb = skb;
+	entry->id = atomic_inc_return(&queue->id_sequence);
+
+	nskb = nfqnl_build_packet_message(queue, entry, &status);
+	if (nskb == NULL)
+		goto err_out_free;
+
+	spin_lock_bh(&queue->lock);
+
+	if (!queue->peer_pid) {
+		/* nfqnl_build_packet_message() stores the result of
+		 * skb_checksum_help() in status, which is 0 on success;
+		 * make sure the caller sees a failure here */
+		status = -EINVAL;
+		goto err_out_free_nskb;
+	}
+
+	if (queue->queue_total >= queue->queue_maxlen) {
+		queue->queue_dropped++;
+		status = -ENOSPC;
+		if (net_ratelimit())
+			printk(KERN_WARNING "nf_queue: full at %d entries, "
+			       "dropping packets(s). Dropped: %d\n",
+			       queue->queue_total, queue->queue_dropped);
+		goto err_out_free_nskb;
+	}
+
+	/* nfnetlink_unicast will either free the nskb or add it to a socket */
+	status = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT);
+	if (status < 0) {
+		queue->queue_user_dropped++;
+		goto err_out_unlock;
+	}
+
+	__enqueue_entry(queue, entry);
+
+	spin_unlock_bh(&queue->lock);
+	instance_put(queue);
+	return status;
+
+err_out_free_nskb:
+	kfree_skb(nskb);
+
+err_out_unlock:
+	spin_unlock_bh(&queue->lock);
+
+err_out_free:
+	kfree(entry);
+err_out_put:
+	instance_put(queue);
+	return status;
+}
+
+/*
+ * Replace the contents of the queued packet e->skb with the @data_len
+ * bytes at @data, as supplied by userspace together with a verdict.
+ *
+ * The skb is trimmed when the new payload is shorter and grown when it
+ * is longer (reallocating via skb_copy_expand() if the tailroom is too
+ * small, in which case e->skb is replaced).  Growing beyond 0xFFFF
+ * bytes is refused.  The checksum state is reset to CHECKSUM_NONE.
+ *
+ * Returns 0 on success, -EINVAL or -ENOMEM on failure.
+ */
+static int
+nfqnl_mangle(void *data, int data_len, struct nfqnl_queue_entry *e)
+{
+	int diff;
+
+	diff = data_len - e->skb->len;
+	if (diff < 0)
+		skb_trim(e->skb, data_len);
+	else if (diff > 0) {
+		if (data_len > 0xFFFF)
+			return -EINVAL;
+		if (diff > skb_tailroom(e->skb)) {
+			struct sk_buff *newskb;
+
+			newskb = skb_copy_expand(e->skb,
+						 skb_headroom(e->skb),
+						 diff,
+						 GFP_ATOMIC);
+			if (newskb == NULL) {
+				/* prefix matches this module's other
+				 * messages (was "ip_queue") */
+				printk(KERN_WARNING "nf_queue: OOM "
+				       "in mangle, dropping packet\n");
+				return -ENOMEM;
+			}
+			/* keep the socket accounting of the original skb */
+			if (e->skb->sk)
+				skb_set_owner_w(newskb, e->skb->sk);
+			kfree_skb(e->skb);
+			e->skb = newskb;
+		}
+		skb_put(e->skb, diff);
+	}
+	if (!skb_make_writable(&e->skb, data_len))
+		return -ENOMEM;
+	memcpy(e->skb->data, data, data_len);
+	e->skb->ip_summed = CHECKSUM_NONE;
+	return 0;
+}
+
+/* nfqnl_cmpfn: non-zero when the entry's packet id equals @id */
+static inline int
+id_cmp(struct nfqnl_queue_entry *e, unsigned long id)
+{
+	return id == e->id;
+}
+
+/*
+ * Locked variant of __nfqnl_set_mode(): updates copy mode/range under
+ * queue->lock and returns that helper's result.
+ */
+static int
+nfqnl_set_mode(struct nfqnl_instance *queue,
+	       unsigned char mode, unsigned int range)
+{
+	int rc;
+
+	spin_lock_bh(&queue->lock);
+	rc = __nfqnl_set_mode(queue, mode, range);
+	spin_unlock_bh(&queue->lock);
+
+	return rc;
+}
+
+/*
+ * nfqnl_cmpfn: returns 1 when the entry's input or output device has
+ * interface index @ifindex, 0 otherwise.
+ */
+static int
+dev_cmp(struct nfqnl_queue_entry *entry, unsigned long ifindex)
+{
+	if (entry->info->indev &&
+	    entry->info->indev->ifindex == ifindex)
+		return 1;
+
+	if (entry->info->outdev &&
+	    entry->info->outdev->ifindex == ifindex)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Drop every queued packet, in every queue instance, whose input or
+ * output device has interface index @ifindex.
+ */
+static void
+nfqnl_dev_drop(int ifindex)
+{
+	int bucket;
+
+	QDEBUG("entering for ifindex %u\n", ifindex);
+
+	/* The read lock is not held for as long as it may look: the only
+	 * verdict issued from here is NF_DROP, which nf_reinject()
+	 * processes directly. */
+	read_lock_bh(&instances_lock);
+
+	for (bucket = 0; bucket < INSTANCE_BUCKETS; bucket++) {
+		struct hlist_node *pos;
+		struct nfqnl_instance *inst;
+
+		hlist_for_each_entry(inst, pos, &instance_table[bucket], hlist) {
+			for (;;) {
+				struct nfqnl_queue_entry *victim;
+
+				victim = find_dequeue_entry(inst, dev_cmp,
+							    ifindex);
+				if (victim == NULL)
+					break;
+
+				issue_verdict(victim, NF_DROP);
+			}
+		}
+	}
+
+	read_unlock_bh(&instances_lock);
+}
+
+/* Ack the message with (err) and return; expects 'skb' and 'nlh' to be
+ * in scope in the (void) caller. */
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+/*
+ * Netdevice notifier callback: when a device goes down, drop all
+ * queued packets that reference it.  Always returns NOTIFY_DONE.
+ */
+static int
+nfqnl_rcv_dev_event(struct notifier_block *this,
+		    unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	if (event != NETDEV_DOWN)
+		return NOTIFY_DONE;
+
+	/* Drop any packets associated with the downed device */
+	nfqnl_dev_drop(dev->ifindex);
+	return NOTIFY_DONE;
+}
+
+/* registered with register_netdevice_notifier() at module init */
+static struct notifier_block nfqnl_dev_notifier = {
+	.notifier_call	= nfqnl_rcv_dev_event,
+};
+
+/*
+ * Netlink notifier callback: when a NETLINK_NETFILTER socket with a
+ * non-zero pid is released, destroy every queue instance bound to that
+ * pid.  Always returns NOTIFY_DONE.
+ */
+static int
+nfqnl_rcv_nl_event(struct notifier_block *this,
+		   unsigned long event, void *ptr)
+{
+	struct netlink_notify *n = ptr;
+	int bucket;
+
+	if (event != NETLINK_URELEASE)
+		return NOTIFY_DONE;
+	if (n->protocol != NETLINK_NETFILTER || !n->pid)
+		return NOTIFY_DONE;
+
+	/* destroy all instances for this pid */
+	write_lock_bh(&instances_lock);
+	for (bucket = 0; bucket < INSTANCE_BUCKETS; bucket++) {
+		struct hlist_node *pos, *next;
+		struct nfqnl_instance *inst;
+
+		/* _safe variant: entries are unlinked while iterating */
+		hlist_for_each_entry_safe(inst, pos, next,
+					  &instance_table[bucket], hlist) {
+			if (n->pid != inst->peer_pid)
+				continue;
+			/* instances_lock is already held for writing */
+			__instance_destroy(inst);
+		}
+	}
+	write_unlock_bh(&instances_lock);
+
+	return NOTIFY_DONE;
+}
+
+/* registered with netlink_register_notifier() at module init */
+static struct notifier_block nfqnl_rtnl_notifier = {
+	.notifier_call	= nfqnl_rcv_nl_event,
+};
+
+/* minimum payload size of the attributes accepted in a verdict message */
+static const int nfqa_verdict_min[NFQA_MAX] = {
+	[NFQA_VERDICT_HDR-1]	= sizeof(struct nfqnl_msg_verdict_hdr),
+	[NFQA_MARK-1]		= sizeof(u_int32_t),
+	[NFQA_PAYLOAD-1]	= 0,
+};
+
+/*
+ * Handle NFQNL_MSG_VERDICT: userspace's decision on a queued packet.
+ *
+ * The sender must be the pid the queue is bound to.  The packet is
+ * looked up by the id in the verdict header and dequeued; an optional
+ * NFQA_PAYLOAD replaces the packet contents (the packet is dropped if
+ * that fails) and an optional NFQA_MARK sets the packet's nfmark before
+ * the verdict is issued.
+ *
+ * Returns 0 on success, or -EINVAL (bad attributes / verdict),
+ * -ENODEV (no such queue), -EPERM (wrong sender), -ENOENT (no such
+ * packet id).
+ */
+static int
+nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
+		   struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
+{
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	u_int16_t queue_num = ntohs(nfmsg->res_id);
+
+	struct nfqnl_msg_verdict_hdr *vhdr;
+	struct nfqnl_instance *queue;
+	unsigned int verdict;
+	struct nfqnl_queue_entry *entry;
+	int err;
+
+	if (nfattr_bad_size(nfqa, NFQA_MAX, nfqa_verdict_min)) {
+		QDEBUG("bad attribute size\n");
+		return -EINVAL;
+	}
+
+	queue = instance_lookup_get(queue_num);
+	if (!queue)
+		return -ENODEV;
+
+	if (queue->peer_pid != NETLINK_CB(skb).pid) {
+		err = -EPERM;
+		goto err_out_put;
+	}
+
+	if (!nfqa[NFQA_VERDICT_HDR-1]) {
+		err = -EINVAL;
+		goto err_out_put;
+	}
+
+	vhdr = NFA_DATA(nfqa[NFQA_VERDICT_HDR-1]);
+	verdict = ntohl(vhdr->verdict);
+
+	if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) {
+		err = -EINVAL;
+		goto err_out_put;
+	}
+
+	entry = find_dequeue_entry(queue, id_cmp, ntohl(vhdr->id));
+	if (entry == NULL) {
+		err = -ENOENT;
+		goto err_out_put;
+	}
+
+	if (nfqa[NFQA_PAYLOAD-1]) {
+		if (nfqnl_mangle(NFA_DATA(nfqa[NFQA_PAYLOAD-1]),
+				 NFA_PAYLOAD(nfqa[NFQA_PAYLOAD-1]), entry) < 0)
+			verdict = NF_DROP;
+	}
+
+	/* The mark belongs on the queued packet, not on 'skb', which is
+	 * the netlink message carrying the verdict.  Done after the
+	 * mangle step because that may have replaced entry->skb. */
+	if (nfqa[NFQA_MARK-1])
+		entry->skb->nfmark =
+			ntohl(*(u_int32_t *)NFA_DATA(nfqa[NFQA_MARK-1]));
+
+	issue_verdict(entry, verdict);
+	instance_put(queue);
+	return 0;
+
+err_out_put:
+	instance_put(queue);
+	return err;
+}
+
+/*
+ * Callback for message types that userspace may not send to the kernel
+ * (wired to NFQNL_MSG_PACKET): unconditionally refuses with -ENOTSUPP.
+ */
+static int
+nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
+		  struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
+{
+	const int unsupported = -ENOTSUPP;
+
+	return unsupported;
+}
+
+/* minimum payload size of the attributes accepted in a config message */
+static const int nfqa_cfg_min[NFQA_CFG_MAX] = {
+	[NFQA_CFG_CMD-1]	= sizeof(struct nfqnl_msg_config_cmd),
+	[NFQA_CFG_PARAMS-1]	= sizeof(struct nfqnl_msg_config_params),
+};
+
+/* queue handler registered per protocol family by NFQNL_CFG_CMD_PF_BIND */
+static struct nf_queue_handler nfqh = {
+	.name	= "nf_queue",
+	.outfn	= &nfqnl_enqueue_packet,
+};
+
+/*
+ * Handle NFQNL_MSG_CONFIG.
+ *
+ * An optional NFQA_CFG_CMD binds/unbinds the queue instance named by
+ * the message's res_id, or (un)registers this module as queue handler
+ * for a protocol family.  An optional NFQA_CFG_PARAMS then sets the
+ * copy mode and range of the instance.  Without a command, the instance
+ * must exist and belong to the sender.
+ *
+ * Returns 0 or a negative errno.
+ */
+static int
+nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
+		  struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
+{
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	u_int16_t queue_num = ntohs(nfmsg->res_id);
+	struct nfqnl_instance *queue;
+	int ret = 0;
+
+	QDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type));
+
+	if (nfattr_bad_size(nfqa, NFQA_CFG_MAX, nfqa_cfg_min)) {
+		QDEBUG("bad attribute size\n");
+		return -EINVAL;
+	}
+
+	/* may be NULL; a reference is held otherwise */
+	queue = instance_lookup_get(queue_num);
+	if (nfqa[NFQA_CFG_CMD-1]) {
+		struct nfqnl_msg_config_cmd *cmd;
+		cmd = NFA_DATA(nfqa[NFQA_CFG_CMD-1]);
+		QDEBUG("found CFG_CMD\n");
+
+		switch (cmd->command) {
+		case NFQNL_CFG_CMD_BIND:
+			if (queue) {
+				/* drop the lookup reference instead of
+				 * leaking it with a bare return */
+				ret = -EBUSY;
+				goto out_put;
+			}
+
+			queue = instance_create(queue_num, NETLINK_CB(skb).pid);
+			if (!queue)
+				return -EINVAL;
+			break;
+		case NFQNL_CFG_CMD_UNBIND:
+			if (!queue)
+				return -ENODEV;
+
+			if (queue->peer_pid != NETLINK_CB(skb).pid) {
+				ret = -EPERM;
+				goto out_put;
+			}
+
+			instance_destroy(queue);
+			break;
+		case NFQNL_CFG_CMD_PF_BIND:
+			QDEBUG("registering queue handler for pf=%u\n",
+				ntohs(cmd->pf));
+			ret = nf_register_queue_handler(ntohs(cmd->pf), &nfqh);
+			break;
+		case NFQNL_CFG_CMD_PF_UNBIND:
+			QDEBUG("unregistering queue handler for pf=%u\n",
+				ntohs(cmd->pf));
+			/* This is a bug and a feature.  We can unregister
+			 * other handlers(!) */
+			ret = nf_unregister_queue_handler(ntohs(cmd->pf));
+			break;
+		default:
+			ret = -EINVAL;
+			break;
+		}
+	} else {
+		if (!queue) {
+			QDEBUG("no config command, and no instance ENOENT\n");
+			ret = -ENOENT;
+			goto out_put;
+		}
+
+		if (queue->peer_pid != NETLINK_CB(skb).pid) {
+			QDEBUG("no config command, and wrong pid\n");
+			ret = -EPERM;
+			goto out_put;
+		}
+	}
+
+	if (nfqa[NFQA_CFG_PARAMS-1]) {
+		struct nfqnl_msg_config_params *params;
+
+		/* PF_BIND/PF_UNBIND (and unknown commands) can get here
+		 * without any instance; there is nothing to configure */
+		if (!queue) {
+			ret = -ENOENT;
+			goto out_put;
+		}
+		params = NFA_DATA(nfqa[NFQA_CFG_PARAMS-1]);
+
+		nfqnl_set_mode(queue, params->copy_mode,
+				ntohl(params->copy_range));
+	}
+
+out_put:
+	if (queue)
+		instance_put(queue);
+	return ret;
+}
+
+/*
+ * Per-message-type nfnetlink callbacks.  Every type requires
+ * CAP_NET_ADMIN; NFQNL_MSG_PACKET is rejected via nfqnl_recv_unsupp().
+ */
+static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
+	[NFQNL_MSG_PACKET] = {
+		.call		= nfqnl_recv_unsupp,
+		.attr_count	= NFQA_MAX,
+		.cap_required	= CAP_NET_ADMIN,
+	},
+	[NFQNL_MSG_VERDICT] = {
+		.call		= nfqnl_recv_verdict,
+		.attr_count	= NFQA_MAX,
+		.cap_required	= CAP_NET_ADMIN,
+	},
+	[NFQNL_MSG_CONFIG] = {
+		.call		= nfqnl_recv_config,
+		.attr_count	= NFQA_CFG_MAX,
+		.cap_required	= CAP_NET_ADMIN,
+	},
+};
+
+/* subsystem descriptor handed to nfnetlink_subsys_register() */
+static struct nfnetlink_subsystem nfqnl_subsys = {
+	.name		= "nf_queue",
+	.subsys_id	= NFNL_SUBSYS_QUEUE,
+	.cb_count	= NFQNL_MSG_MAX,
+	.cb		= nfqnl_cb,
+};
+
+#ifdef CONFIG_PROC_FS
+/* seq_file iteration cursor: hash bucket currently being walked */
+struct iter_state {
+	unsigned int bucket;
+};
+
+/*
+ * Return the first node of the first non-empty bucket of
+ * instance_table, leaving st->bucket at that bucket; NULL if the table
+ * is empty or the seq_file has no private state.
+ */
+static struct hlist_node *get_first(struct seq_file *seq)
+{
+	struct iter_state *st = seq->private;
+
+	if (!st)
+		return NULL;
+
+	st->bucket = 0;
+	while (st->bucket < INSTANCE_BUCKETS) {
+		struct hlist_head *head = &instance_table[st->bucket];
+
+		if (!hlist_empty(head))
+			return head->first;
+		st->bucket++;
+	}
+	return NULL;
+}
+
+/*
+ * Return the node following @h, moving on to the following buckets
+ * when the current chain is exhausted; NULL after the last bucket.
+ */
+static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
+{
+	struct iter_state *st = seq->private;
+	struct hlist_node *next = h->next;
+
+	while (next == NULL) {
+		st->bucket++;
+		if (st->bucket >= INSTANCE_BUCKETS)
+			return NULL;
+
+		next = instance_table[st->bucket].first;
+	}
+	return next;
+}
+
+/*
+ * Return the node at position @pos (0-based) of the whole table, or
+ * NULL when the table holds fewer than pos+1 nodes.
+ */
+static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct hlist_node *node = get_first(seq);
+
+	while (node && pos) {
+		node = get_next(seq, node);
+		if (node)
+			pos--;
+	}
+
+	if (pos)
+		return NULL;
+	return node;
+}
+
+/* seq_file ->start: take instances_lock for reading (released in
+ * seq_stop()) and position the iteration at *pos */
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct hlist_node *node;
+
+	read_lock_bh(&instances_lock);
+	node = get_idx(seq, *pos);
+	return node;
+}
+
+/* seq_file ->next: bump the position and step to the following node */
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct hlist_node *cur = v;
+
+	++*pos;
+	return get_next(s, cur);
+}
+
+/* seq_file ->stop: release the read lock taken in seq_start() */
+static void seq_stop(struct seq_file *s, void *v)
+{
+	read_unlock_bh(&instances_lock);
+}
+
+/*
+ * seq_file ->show: print one line per queue instance with, in order:
+ * queue number, peer pid, queued packets, copy mode, copy range,
+ * dropped (queue full), dropped (unicast failed), last packet id and
+ * use count.
+ */
+static int seq_show(struct seq_file *s, void *v)
+{
+	const struct nfqnl_instance *q = v;
+
+	return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n",
+			  q->queue_num,
+			  q->peer_pid,
+			  q->queue_total,
+			  q->copy_mode,
+			  q->copy_range,
+			  q->queue_dropped,
+			  q->queue_user_dropped,
+			  atomic_read(&q->id_sequence),
+			  atomic_read(&q->use));
+}
+
+/* iterator operations passed to seq_open() */
+static struct seq_operations nfqnl_seq_ops = {
+	.start	= seq_start,
+	.next	= seq_next,
+	.stop	= seq_stop,
+	.show	= seq_show,
+};
+
+/*
+ * ->open of the proc file: allocate a zeroed iter_state, open the
+ * seq_file and attach the state as its private data.  The state is
+ * freed again if seq_open() fails.
+ */
+static int nfqnl_open(struct inode *inode, struct file *file)
+{
+	struct iter_state *state;
+	int rc;
+
+	state = kmalloc(sizeof(*state), GFP_KERNEL);
+	if (state == NULL)
+		return -ENOMEM;
+	memset(state, 0, sizeof(*state));
+
+	rc = seq_open(file, &nfqnl_seq_ops);
+	if (rc < 0) {
+		kfree(state);
+		return rc;
+	}
+
+	/* seq_open() stored the seq_file in file->private_data */
+	((struct seq_file *)file->private_data)->private = state;
+	return rc;
+}
+
+/* file operations installed on the "nfnetlink_queue" proc entry */
+static struct file_operations nfqnl_file_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = nfqnl_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release_private,
+};
+
+#endif /* PROC_FS */
+
+/*
+ * Shared module setup/teardown.
+ *
+ * With @init non-zero: initialise the instance hash, register the
+ * netlink release notifier, the nfnetlink subsystem, the proc entry
+ * (if CONFIG_PROC_FS) and the netdevice notifier.  On failure the steps
+ * already taken are undone and a negative errno is returned.
+ *
+ * With @init zero: undo all of the above; the return value is not
+ * meaningful in that case.
+ */
+static int
+init_or_cleanup(int init)
+{
+	int i, status = -ENOMEM;
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *proc_nfqueue;
+#endif
+
+	if (!init)
+		goto cleanup;
+
+	for (i = 0; i < INSTANCE_BUCKETS; i++)
+		INIT_HLIST_HEAD(&instance_table[i]);
+
+	netlink_register_notifier(&nfqnl_rtnl_notifier);
+	status = nfnetlink_subsys_register(&nfqnl_subsys);
+	if (status < 0) {
+		printk(KERN_ERR "nf_queue: failed to create netlink socket\n");
+		goto cleanup_netlink_notifier;
+	}
+
+#ifdef CONFIG_PROC_FS
+	proc_nfqueue = create_proc_entry("nfnetlink_queue", 0440,
+					 proc_net_netfilter);
+	if (!proc_nfqueue) {
+		/* status still holds the successful result of the
+		 * subsystem registration; without this the module would
+		 * report a successful load after unwinding everything */
+		status = -ENOMEM;
+		goto cleanup_subsys;
+	}
+	proc_nfqueue->proc_fops = &nfqnl_file_ops;
+#endif
+
+	register_netdevice_notifier(&nfqnl_dev_notifier);
+
+	return status;
+
+cleanup:
+	nf_unregister_queue_handlers(&nfqh);
+	unregister_netdevice_notifier(&nfqnl_dev_notifier);
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("nfnetlink_queue", proc_net_netfilter);
+cleanup_subsys:
+#endif
+	nfnetlink_subsys_unregister(&nfqnl_subsys);
+cleanup_netlink_notifier:
+	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+	return status;
+}
+
+/* module entry point: run the setup half of init_or_cleanup() */
+static int __init init(void)
+{
+	const int do_init = 1;
+
+	return init_or_cleanup(do_init);
+}
+
+/* module exit point: run the teardown half of init_or_cleanup() */
+static void __exit fini(void)
+{
+	const int do_init = 0;
+
+	init_or_cleanup(do_init);
+}
+
+MODULE_DESCRIPTION("netfilter packet queue handler");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE);
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index ff774a06c89..5ca283537bc 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -13,7 +13,12 @@
* added netlink_proto_exit
* Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
* use nlk_sk, as sk->protinfo is on a diet 8)
- *
+ * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
+ * - inc module use count of module that owns
+ * the kernel socket in case userspace opens
+ * socket of same protocol
+ * - remove all module support, since netlink is
+ * mandatory if CONFIG_NET=y these days
*/
#include <linux/config.h>
@@ -55,21 +60,29 @@
#include <net/scm.h>
#define Nprintk(a...)
+#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
struct netlink_sock {
/* struct sock has to be the first member of netlink_sock */
struct sock sk;
u32 pid;
- unsigned int groups;
u32 dst_pid;
- unsigned int dst_groups;
+ u32 dst_group;
+ u32 flags;
+ u32 subscriptions;
+ u32 ngroups;
+ unsigned long *groups;
unsigned long state;
wait_queue_head_t wait;
struct netlink_callback *cb;
spinlock_t cb_lock;
void (*data_ready)(struct sock *sk, int bytes);
+ struct module *module;
};
+#define NETLINK_KERNEL_SOCKET 0x1
+#define NETLINK_RECV_PKTINFO 0x2
+
static inline struct netlink_sock *nlk_sk(struct sock *sk)
{
return (struct netlink_sock *)sk;
@@ -92,6 +105,9 @@ struct netlink_table {
struct nl_pid_hash hash;
struct hlist_head mc_list;
unsigned int nl_nonroot;
+ unsigned int groups;
+ struct module *module;
+ int registered;
};
static struct netlink_table *nl_table;
@@ -106,6 +122,11 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
static struct notifier_block *netlink_chain;
+/*
+ * Convert a 1-based multicast group number into its bit in a 32-bit
+ * nl_groups mask.  Group 0 (no group) and groups above 32, which such a
+ * mask cannot express, map to 0.
+ */
+static u32 netlink_group_mask(u32 group)
+{
+	if (group == 0 || group > 32)
+		return 0;
+
+	/* unsigned constant: group 32 would shift into the sign bit of a
+	 * plain int */
+	return 1U << (group - 1);
+}
+
static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid)
{
return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask];
@@ -122,6 +143,7 @@ static void netlink_sock_destruct(struct sock *sk)
BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
BUG_TRAP(!nlk_sk(sk)->cb);
+ BUG_TRAP(!nlk_sk(sk)->groups);
}
/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on SMP.
@@ -317,7 +339,7 @@ static void netlink_remove(struct sock *sk)
netlink_table_grab();
if (sk_del_node_init(sk))
nl_table[sk->sk_protocol].hash.entries--;
- if (nlk_sk(sk)->groups)
+ if (nlk_sk(sk)->subscriptions)
__sk_del_bind_node(sk);
netlink_table_ungrab();
}
@@ -328,19 +350,11 @@ static struct proto netlink_proto = {
.obj_size = sizeof(struct netlink_sock),
};
-static int netlink_create(struct socket *sock, int protocol)
+static int __netlink_create(struct socket *sock, int protocol)
{
struct sock *sk;
struct netlink_sock *nlk;
- sock->state = SS_UNCONNECTED;
-
- if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
- return -ESOCKTNOSUPPORT;
-
- if (protocol<0 || protocol >= MAX_LINKS)
- return -EPROTONOSUPPORT;
-
sock->ops = &netlink_ops;
sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1);
@@ -350,15 +364,56 @@ static int netlink_create(struct socket *sock, int protocol)
sock_init_data(sock, sk);
nlk = nlk_sk(sk);
-
spin_lock_init(&nlk->cb_lock);
init_waitqueue_head(&nlk->wait);
- sk->sk_destruct = netlink_sock_destruct;
+ sk->sk_destruct = netlink_sock_destruct;
sk->sk_protocol = protocol;
return 0;
}
+/*
+ * Create a userspace netlink socket for @protocol.
+ *
+ * If no kernel-side socket is registered for the protocol yet, the
+ * owning module is requested (CONFIG_KMOD).  When one is registered, a
+ * reference on its module is taken and recorded in the new socket's
+ * netlink_sock.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int netlink_create(struct socket *sock, int protocol)
+{
+	struct module *module = NULL;
+	struct netlink_sock *nlk;
+	int err;
+
+	sock->state = SS_UNCONNECTED;
+
+	if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
+		return -ESOCKTNOSUPPORT;
+
+	if (protocol<0 || protocol >= MAX_LINKS)
+		return -EPROTONOSUPPORT;
+
+	netlink_lock_table();
+#ifdef CONFIG_KMOD
+	if (!nl_table[protocol].registered) {
+		/* drop the table lock around the module load */
+		netlink_unlock_table();
+		request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
+		netlink_lock_table();
+	}
+#endif
+	if (nl_table[protocol].registered &&
+	    try_module_get(nl_table[protocol].module))
+		module = nl_table[protocol].module;
+	netlink_unlock_table();
+
+	/* The assignment must complete before the comparison:
+	 * "err = f() < 0" stored the boolean 1 and lost the errno. */
+	err = __netlink_create(sock, protocol);
+	if (err < 0) {
+		module_put(module);
+		return err;
+	}
+
+	nlk = nlk_sk(sock->sk);
+	nlk->module = module;
+	return 0;
+}
+
static int netlink_release(struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -387,14 +442,27 @@ static int netlink_release(struct socket *sock)
skb_queue_purge(&sk->sk_write_queue);
- if (nlk->pid && !nlk->groups) {
+ if (nlk->pid && !nlk->subscriptions) {
struct netlink_notify n = {
.protocol = sk->sk_protocol,
.pid = nlk->pid,
};
notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n);
}
-
+
+ if (nlk->module)
+ module_put(nlk->module);
+
+ if (nlk->flags & NETLINK_KERNEL_SOCKET) {
+ netlink_table_grab();
+ nl_table[sk->sk_protocol].module = NULL;
+ nl_table[sk->sk_protocol].registered = 0;
+ netlink_table_ungrab();
+ }
+
+ kfree(nlk->groups);
+ nlk->groups = NULL;
+
sock_put(sk);
return 0;
}
@@ -443,6 +511,41 @@ static inline int netlink_capable(struct socket *sock, unsigned int flag)
capable(CAP_NET_ADMIN);
}
+/*
+ * Record the new subscription count of @sk and keep the protocol's
+ * multicast list in sync: the socket is removed from the list when the
+ * count drops to zero and added when it becomes non-zero.
+ */
+static void
+netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	const int was_listening = nlk->subscriptions != 0;
+	const int will_listen = subscriptions != 0;
+
+	if (was_listening && !will_listen)
+		__sk_del_bind_node(sk);
+	else if (!was_listening && will_listen)
+		sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
+
+	nlk->subscriptions = subscriptions;
+}
+
+/*
+ * Allocate and zero the group membership bitmap of @sk, sized for the
+ * number of groups its protocol has in nl_table.
+ *
+ * Returns 0 on success, -ENOENT if the protocol is not registered, or
+ * -ENOMEM.
+ */
+static int netlink_alloc_groups(struct sock *sk)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	unsigned int ngroups;
+	int registered;
+	size_t bytes;
+
+	/* snapshot the protocol's state under the table lock */
+	netlink_lock_table();
+	ngroups = nl_table[sk->sk_protocol].groups;
+	registered = nl_table[sk->sk_protocol].registered;
+	netlink_unlock_table();
+
+	if (!registered)
+		return -ENOENT;
+
+	bytes = NLGRPSZ(ngroups);
+	nlk->groups = kmalloc(bytes, GFP_KERNEL);
+	if (nlk->groups == NULL)
+		return -ENOMEM;
+	memset(nlk->groups, 0, bytes);
+
+	nlk->ngroups = ngroups;
+	return 0;
+}
+
static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
struct sock *sk = sock->sk;
@@ -454,8 +557,15 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len
return -EINVAL;
/* Only superuser is allowed to listen multicasts */
- if (nladdr->nl_groups && !netlink_capable(sock, NL_NONROOT_RECV))
- return -EPERM;
+ if (nladdr->nl_groups) {
+ if (!netlink_capable(sock, NL_NONROOT_RECV))
+ return -EPERM;
+ if (nlk->groups == NULL) {
+ err = netlink_alloc_groups(sk);
+ if (err)
+ return err;
+ }
+ }
if (nlk->pid) {
if (nladdr->nl_pid != nlk->pid)
@@ -468,15 +578,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len
return err;
}
- if (!nladdr->nl_groups && !nlk->groups)
+ if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
return 0;
netlink_table_grab();
- if (nlk->groups && !nladdr->nl_groups)
- __sk_del_bind_node(sk);
- else if (!nlk->groups && nladdr->nl_groups)
- sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
- nlk->groups = nladdr->nl_groups;
+ netlink_update_subscriptions(sk, nlk->subscriptions +
+ hweight32(nladdr->nl_groups) -
+ hweight32(nlk->groups[0]));
+ nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups;
netlink_table_ungrab();
return 0;
@@ -493,7 +602,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr,
if (addr->sa_family == AF_UNSPEC) {
sk->sk_state = NETLINK_UNCONNECTED;
nlk->dst_pid = 0;
- nlk->dst_groups = 0;
+ nlk->dst_group = 0;
return 0;
}
if (addr->sa_family != AF_NETLINK)
@@ -509,7 +618,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr,
if (err == 0) {
sk->sk_state = NETLINK_CONNECTED;
nlk->dst_pid = nladdr->nl_pid;
- nlk->dst_groups = nladdr->nl_groups;
+ nlk->dst_group = ffs(nladdr->nl_groups);
}
return err;
@@ -527,10 +636,10 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr
if (peer) {
nladdr->nl_pid = nlk->dst_pid;
- nladdr->nl_groups = nlk->dst_groups;
+ nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
} else {
nladdr->nl_pid = nlk->pid;
- nladdr->nl_groups = nlk->groups;
+ nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;
}
return 0;
}
@@ -631,11 +740,8 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long t
int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol)
{
- struct netlink_sock *nlk;
int len = skb->len;
- nlk = nlk_sk(sk);
-
skb_queue_tail(&sk->sk_receive_queue, skb);
sk->sk_data_ready(sk, len);
sock_put(sk);
@@ -649,7 +755,7 @@ void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
}
static inline struct sk_buff *netlink_trim(struct sk_buff *skb,
- unsigned int __nocast allocation)
+ gfp_t allocation)
{
int delta;
@@ -718,7 +824,7 @@ struct netlink_broadcast_data {
int failure;
int congested;
int delivered;
- unsigned int allocation;
+ gfp_t allocation;
struct sk_buff *skb, *skb2;
};
@@ -731,7 +837,8 @@ static inline int do_one_broadcast(struct sock *sk,
if (p->exclude_sk == sk)
goto out;
- if (nlk->pid == p->pid || !(nlk->groups & p->group))
+ if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups ||
+ !test_bit(p->group - 1, nlk->groups))
goto out;
if (p->failure) {
@@ -770,7 +877,7 @@ out:
}
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
- u32 group, int allocation)
+ u32 group, gfp_t allocation)
{
struct netlink_broadcast_data info;
struct hlist_node *node;
@@ -827,7 +934,8 @@ static inline int do_one_set_err(struct sock *sk,
if (sk == p->exclude_sk)
goto out;
- if (nlk->pid == p->pid || !(nlk->groups & p->group))
+ if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups ||
+ !test_bit(p->group - 1, nlk->groups))
goto out;
sk->sk_err = p->code;
@@ -855,6 +963,99 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
read_unlock(&nl_table_lock);
}
+/*
+ * SOL_NETLINK setsockopt handler.
+ *
+ * NETLINK_PKTINFO toggles the NETLINK_RECV_PKTINFO flag;
+ * NETLINK_ADD_MEMBERSHIP / NETLINK_DROP_MEMBERSHIP join or leave the
+ * multicast group whose 1-based number is passed as the option value.
+ * The value is read as an int, and is 0 when optlen is too short.
+ *
+ * Returns 0 or -ENOPROTOOPT, -EFAULT, -EPERM, -EINVAL, or an error
+ * from netlink_alloc_groups().
+ */
+static int netlink_setsockopt(struct socket *sock, int level, int optname,
+			      char __user *optval, int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk = nlk_sk(sk);
+	unsigned int count;
+	int val = 0, err;
+	int was_member, joining;
+
+	if (level != SOL_NETLINK)
+		return -ENOPROTOOPT;
+
+	if (optlen >= sizeof(int) &&
+	    get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	if (optname == NETLINK_PKTINFO) {
+		if (val)
+			nlk->flags |= NETLINK_RECV_PKTINFO;
+		else
+			nlk->flags &= ~NETLINK_RECV_PKTINFO;
+		return 0;
+	}
+
+	if (optname != NETLINK_ADD_MEMBERSHIP &&
+	    optname != NETLINK_DROP_MEMBERSHIP)
+		return -ENOPROTOOPT;
+
+	joining = optname == NETLINK_ADD_MEMBERSHIP ? 1 : 0;
+
+	if (!netlink_capable(sock, NL_NONROOT_RECV))
+		return -EPERM;
+
+	/* the membership bitmap is allocated on first use */
+	if (nlk->groups == NULL) {
+		err = netlink_alloc_groups(sk);
+		if (err)
+			return err;
+	}
+
+	/* group numbers are 1-based */
+	if (!val || val - 1 >= nlk->ngroups)
+		return -EINVAL;
+
+	netlink_table_grab();
+	was_member = test_bit(val - 1, nlk->groups);
+	count = nlk->subscriptions - was_member + joining;
+	if (joining)
+		__set_bit(val - 1, nlk->groups);
+	else
+		__clear_bit(val - 1, nlk->groups);
+	netlink_update_subscriptions(sk, count);
+	netlink_table_ungrab();
+
+	return 0;
+}
+
+/*
+ * SOL_NETLINK getsockopt handler.
+ *
+ * Only NETLINK_PKTINFO is supported: it stores 1 or 0 as an int in
+ * @optval, depending on whether NETLINK_RECV_PKTINFO is set, and
+ * sizeof(int) in *@optlen.
+ *
+ * Returns 0 or -ENOPROTOOPT, -EFAULT, -EINVAL.
+ */
+static int netlink_getsockopt(struct socket *sock, int level, int optname,
+			      char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk = nlk_sk(sk);
+	int len, val, err;
+
+	if (level != SOL_NETLINK)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+	if (len < 0)
+		return -EINVAL;
+
+	switch (optname) {
+	case NETLINK_PKTINFO:
+		if (len < sizeof(int))
+			return -EINVAL;
+		len = sizeof(int);
+		val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0;
+		/* put_user() sizes the store from the pointer type, so
+		 * optval must be cast or only one byte is written; a
+		 * faulting user pointer must also be reported */
+		if (put_user(len, optlen) ||
+		    put_user(val, (int __user *)optval))
+			return -EFAULT;
+		err = 0;
+		break;
+	default:
+		err = -ENOPROTOOPT;
+	}
+	return err;
+}
+
+/*
+ * Attach a NETLINK_PKTINFO control message to @msg carrying the
+ * destination group recorded in @skb's netlink control block.
+ */
+static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
+{
+	struct nl_pktinfo pktinfo;
+
+	pktinfo.group = NETLINK_CB(skb).dst_group;
+
+	put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO,
+		 sizeof(pktinfo), &pktinfo);
+}
+
static inline void netlink_rcv_wake(struct sock *sk)
{
struct netlink_sock *nlk = nlk_sk(sk);
@@ -873,7 +1074,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
struct netlink_sock *nlk = nlk_sk(sk);
struct sockaddr_nl *addr=msg->msg_name;
u32 dst_pid;
- u32 dst_groups;
+ u32 dst_group;
struct sk_buff *skb;
int err;
struct scm_cookie scm;
@@ -891,12 +1092,12 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
if (addr->nl_family != AF_NETLINK)
return -EINVAL;
dst_pid = addr->nl_pid;
- dst_groups = addr->nl_groups;
- if (dst_groups && !netlink_capable(sock, NL_NONROOT_SEND))
+ dst_group = ffs(addr->nl_groups);
+ if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND))
return -EPERM;
} else {
dst_pid = nlk->dst_pid;
- dst_groups = nlk->dst_groups;
+ dst_group = nlk->dst_group;
}
if (!nlk->pid) {
@@ -914,9 +1115,8 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
goto out;
NETLINK_CB(skb).pid = nlk->pid;
- NETLINK_CB(skb).groups = nlk->groups;
NETLINK_CB(skb).dst_pid = dst_pid;
- NETLINK_CB(skb).dst_groups = dst_groups;
+ NETLINK_CB(skb).dst_group = dst_group;
NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context);
memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
@@ -938,9 +1138,9 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
goto out;
}
- if (dst_groups) {
+ if (dst_group) {
atomic_inc(&skb->users);
- netlink_broadcast(sk, skb, dst_pid, dst_groups, GFP_KERNEL);
+ netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL);
}
err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);
@@ -986,7 +1186,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
addr->nl_family = AF_NETLINK;
addr->nl_pad = 0;
addr->nl_pid = NETLINK_CB(skb).pid;
- addr->nl_groups = NETLINK_CB(skb).dst_groups;
+ addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group);
msg->msg_namelen = sizeof(*addr);
}
@@ -1001,6 +1201,8 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
netlink_dump(sk);
scm_recv(sock, msg, siocb->scm, flags);
+ if (nlk->flags & NETLINK_RECV_PKTINFO)
+ netlink_cmsg_recv_pktinfo(msg, skb);
out:
netlink_rcv_wake(sk);
@@ -1023,10 +1225,13 @@ static void netlink_data_ready(struct sock *sk, int len)
*/
struct sock *
-netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len))
+netlink_kernel_create(int unit, unsigned int groups,
+ void (*input)(struct sock *sk, int len),
+ struct module *module)
{
struct socket *sock;
struct sock *sk;
+ struct netlink_sock *nlk;
if (!nl_table)
return NULL;
@@ -1037,20 +1242,31 @@ netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len))
if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
return NULL;
- if (netlink_create(sock, unit) < 0) {
- sock_release(sock);
- return NULL;
- }
+ if (__netlink_create(sock, unit) < 0)
+ goto out_sock_release;
+
sk = sock->sk;
sk->sk_data_ready = netlink_data_ready;
if (input)
nlk_sk(sk)->data_ready = input;
- if (netlink_insert(sk, 0)) {
- sock_release(sock);
- return NULL;
- }
+ if (netlink_insert(sk, 0))
+ goto out_sock_release;
+
+ nlk = nlk_sk(sk);
+ nlk->flags |= NETLINK_KERNEL_SOCKET;
+
+ netlink_table_grab();
+ nl_table[unit].groups = groups < 32 ? 32 : groups;
+ nl_table[unit].module = module;
+ nl_table[unit].registered = 1;
+ netlink_table_ungrab();
+
return sk;
+
+out_sock_release:
+ sock_release(sock);
+ return NULL;
}
void netlink_set_nonroot(int protocol, unsigned int flags)
@@ -1288,7 +1504,7 @@ static int netlink_seq_show(struct seq_file *seq, void *v)
s,
s->sk_protocol,
nlk->pid,
- nlk->groups,
+ nlk->groups ? (u32)nlk->groups[0] : 0,
atomic_read(&s->sk_rmem_alloc),
atomic_read(&s->sk_wmem_alloc),
nlk->cb,
@@ -1362,8 +1578,8 @@ static struct proto_ops netlink_ops = {
.ioctl = sock_no_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
- .setsockopt = sock_no_setsockopt,
- .getsockopt = sock_no_getsockopt,
+ .setsockopt = netlink_setsockopt,
+ .getsockopt = netlink_getsockopt,
.sendmsg = netlink_sendmsg,
.recvmsg = netlink_recvmsg,
.mmap = sock_no_mmap,
@@ -1438,21 +1654,7 @@ out:
return err;
}
-static void __exit netlink_proto_exit(void)
-{
- sock_unregister(PF_NETLINK);
- proc_net_remove("netlink");
- kfree(nl_table);
- nl_table = NULL;
- proto_unregister(&netlink_proto);
-}
-
core_initcall(netlink_proto_init);
-module_exit(netlink_proto_exit);
-
-MODULE_LICENSE("GPL");
-
-MODULE_ALIAS_NETPROTO(PF_NETLINK);
EXPORT_SYMBOL(netlink_ack);
EXPORT_SYMBOL(netlink_broadcast);
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 31ed4a9a1d0..e5d82d711ca 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -39,7 +39,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/ip.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/arp.h>
#include <linux/init.h>
@@ -56,6 +56,7 @@ int sysctl_netrom_transport_requested_window_size = NR_DEFAULT_WINDOW;
int sysctl_netrom_transport_no_activity_timeout = NR_DEFAULT_IDLE;
int sysctl_netrom_routing_control = NR_DEFAULT_ROUTING;
int sysctl_netrom_link_fails_count = NR_DEFAULT_FAILS;
+int sysctl_netrom_reset_circuit = NR_DEFAULT_RESET;
static unsigned short circuit = 0x101;
@@ -459,12 +460,7 @@ static struct sock *nr_make_new(struct sock *osk)
sk->sk_sndbuf = osk->sk_sndbuf;
sk->sk_state = TCP_ESTABLISHED;
sk->sk_sleep = osk->sk_sleep;
-
- if (sock_flag(osk, SOCK_ZAPPED))
- sock_set_flag(sk, SOCK_ZAPPED);
-
- if (sock_flag(osk, SOCK_DBG))
- sock_set_flag(sk, SOCK_DBG);
+ sock_copy_flags(sk, osk);
skb_queue_head_init(&nr->ack_queue);
skb_queue_head_init(&nr->reseq_queue);
@@ -541,7 +537,8 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct nr_sock *nr = nr_sk(sk);
struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr;
struct net_device *dev;
- ax25_address *user, *source;
+ ax25_uid_assoc *user;
+ ax25_address *source;
lock_sock(sk);
if (!sock_flag(sk, SOCK_ZAPPED)) {
@@ -580,16 +577,19 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
} else {
source = &addr->fsa_ax25.sax25_call;
- if ((user = ax25_findbyuid(current->euid)) == NULL) {
+ user = ax25_findbyuid(current->euid);
+ if (user) {
+ nr->user_addr = user->call;
+ ax25_uid_put(user);
+ } else {
if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
release_sock(sk);
dev_put(dev);
return -EPERM;
}
- user = source;
+ nr->user_addr = *source;
}
- nr->user_addr = *user;
nr->source_addr = *source;
}
@@ -609,7 +609,8 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr,
struct sock *sk = sock->sk;
struct nr_sock *nr = nr_sk(sk);
struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr;
- ax25_address *user, *source = NULL;
+ ax25_address *source = NULL;
+ ax25_uid_assoc *user;
struct net_device *dev;
lock_sock(sk);
@@ -650,16 +651,19 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr,
}
source = (ax25_address *)dev->dev_addr;
- if ((user = ax25_findbyuid(current->euid)) == NULL) {
+ user = ax25_findbyuid(current->euid);
+ if (user) {
+ nr->user_addr = user->call;
+ ax25_uid_put(user);
+ } else {
if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) {
dev_put(dev);
release_sock(sk);
return -EPERM;
}
- user = source;
+ nr->user_addr = *source;
}
- nr->user_addr = *user;
nr->source_addr = *source;
nr->device = dev;
@@ -855,17 +859,16 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev)
frametype = skb->data[19] & 0x0F;
flags = skb->data[19] & 0xF0;
-#ifdef CONFIG_INET
/*
* Check for an incoming IP over NET/ROM frame.
*/
- if (frametype == NR_PROTOEXT && circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) {
+ if (frametype == NR_PROTOEXT &&
+ circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) {
skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN);
skb->h.raw = skb->data;
return nr_rx_ip(skb, dev);
}
-#endif
/*
* Find an existing socket connection, based on circuit ID, if it's
@@ -906,17 +909,17 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev)
if (frametype != NR_CONNREQ) {
/*
* Here it would be nice to be able to send a reset but
- * NET/ROM doesn't have one. The following hack would
- * have been a way to extend the protocol but apparently
- * it kills BPQ boxes... :-(
+ * NET/ROM doesn't have one. We've tried to extend the protocol
+ * by sending NR_CONNACK | NR_CHOKE_FLAGS replies but that
+ * apparently kills BPQ boxes... :-(
+ * So now we try to follow the established behaviour of
+ * G8PZT's Xrouter which is sending packets with command type 7
+ * as an extension of the protocol.
*/
-#if 0
- /*
- * Never reply to a CONNACK/CHOKE.
- */
- if (frametype != NR_CONNACK || flags != NR_CHOKE_FLAG)
- nr_transmit_refusal(skb, 1);
-#endif
+ if (sysctl_netrom_reset_circuit &&
+ (frametype != NR_RESET || flags != 0))
+ nr_transmit_reset(skb, 1);
+
return 0;
}
@@ -1185,9 +1188,7 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
}
case SIOCGSTAMP:
- ret = -EINVAL;
- if (sk != NULL)
- ret = sock_get_timestamp(sk, argp);
+ ret = sock_get_timestamp(sk, argp);
release_sock(sk);
return ret;
@@ -1259,6 +1260,7 @@ static int nr_info_show(struct seq_file *seq, void *v)
struct net_device *dev;
struct nr_sock *nr;
const char *devname;
+ char buf[11];
if (v == SEQ_START_TOKEN)
seq_puts(seq,
@@ -1274,11 +1276,11 @@ static int nr_info_show(struct seq_file *seq, void *v)
else
devname = dev->name;
- seq_printf(seq, "%-9s ", ax2asc(&nr->user_addr));
- seq_printf(seq, "%-9s ", ax2asc(&nr->dest_addr));
+ seq_printf(seq, "%-9s ", ax2asc(buf, &nr->user_addr));
+ seq_printf(seq, "%-9s ", ax2asc(buf, &nr->dest_addr));
seq_printf(seq,
"%-9s %-3s %02X/%02X %02X/%02X %2d %3d %3d %3d %3lu/%03lu %2lu/%02lu %3lu/%03lu %3lu/%03lu %2d/%02d %3d %5d %5d %ld\n",
- ax2asc(&nr->source_addr),
+ ax2asc(buf, &nr->source_addr),
devname,
nr->my_index,
nr->my_id,
@@ -1390,8 +1392,7 @@ static int __init nr_proto_init(void)
struct net_device *dev;
sprintf(name, "nr%d", i);
- dev = alloc_netdev(sizeof(struct net_device_stats), name,
- nr_setup);
+ dev = alloc_netdev(sizeof(struct nr_private), name, nr_setup);
if (!dev) {
printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device structure\n");
goto fail;
diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c
index 220bf7494f7..509afddae56 100644
--- a/net/netrom/nr_dev.c
+++ b/net/netrom/nr_dev.c
@@ -38,8 +38,6 @@
#include <net/ax25.h>
#include <net/netrom.h>
-#ifdef CONFIG_INET
-
/*
* Only allow IP over NET/ROM frames through if the netrom device is up.
*/
@@ -49,7 +47,7 @@ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev)
struct net_device_stats *stats = netdev_priv(dev);
if (!netif_running(dev)) {
- stats->rx_errors++;
+ stats->rx_dropped++;
return 0;
}
@@ -60,27 +58,23 @@ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev)
/* Spoof incoming device */
skb->dev = dev;
- skb->h.raw = skb->data;
+ skb->mac.raw = skb->nh.raw;
skb->nh.raw = skb->data;
skb->pkt_type = PACKET_HOST;
- ip_rcv(skb, skb->dev, NULL);
+ netif_rx(skb);
return 1;
}
+#ifdef CONFIG_INET
static int nr_rebuild_header(struct sk_buff *skb)
{
- struct net_device *dev = skb->dev;
- struct net_device_stats *stats = netdev_priv(dev);
- struct sk_buff *skbn;
unsigned char *bp = skb->data;
- int len;
- if (arp_find(bp + 7, skb)) {
+ if (arp_find(bp + 7, skb))
return 1;
- }
bp[6] &= ~AX25_CBIT;
bp[6] &= ~AX25_EBIT;
@@ -91,27 +85,7 @@ static int nr_rebuild_header(struct sk_buff *skb)
bp[6] |= AX25_EBIT;
bp[6] |= AX25_SSSID_SPARE;
- if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) {
- kfree_skb(skb);
- return 1;
- }
-
- if (skb->sk != NULL)
- skb_set_owner_w(skbn, skb->sk);
-
- kfree_skb(skb);
-
- len = skbn->len;
-
- if (!nr_route_frame(skbn, NULL)) {
- kfree_skb(skbn);
- stats->tx_errors++;
- }
-
- stats->tx_packets++;
- stats->tx_bytes += len;
-
- return 1;
+ return 0;
}
#else
@@ -186,15 +160,27 @@ static int nr_close(struct net_device *dev)
static int nr_xmit(struct sk_buff *skb, struct net_device *dev)
{
- struct net_device_stats *stats = netdev_priv(dev);
- dev_kfree_skb(skb);
- stats->tx_errors++;
+ struct nr_private *nr = netdev_priv(dev);
+ struct net_device_stats *stats = &nr->stats;
+ unsigned int len = skb->len;
+
+ if (!nr_route_frame(skb, NULL)) {
+ kfree_skb(skb);
+ stats->tx_errors++;
+ return 0;
+ }
+
+ stats->tx_packets++;
+ stats->tx_bytes += len;
+
return 0;
}
static struct net_device_stats *nr_get_stats(struct net_device *dev)
{
- return netdev_priv(dev);
+ struct nr_private *nr = netdev_priv(dev);
+
+ return &nr->stats;
}
void nr_setup(struct net_device *dev)
@@ -209,12 +195,11 @@ void nr_setup(struct net_device *dev)
dev->hard_header_len = NR_NETWORK_LEN + NR_TRANSPORT_LEN;
dev->addr_len = AX25_ADDR_LEN;
dev->type = ARPHRD_NETROM;
- dev->tx_queue_len = 40;
dev->rebuild_header = nr_rebuild_header;
dev->set_mac_address = nr_set_mac_address;
/* New-style flags. */
- dev->flags = 0;
+ dev->flags = IFF_NOARP;
dev->get_stats = nr_get_stats;
}
diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c
index 9c44b379412..004e8599b8f 100644
--- a/net/netrom/nr_in.c
+++ b/net/netrom/nr_in.c
@@ -22,8 +22,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
-#include <net/ip.h> /* For ip_rcv */
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
@@ -99,6 +98,11 @@ static int nr_state1_machine(struct sock *sk, struct sk_buff *skb,
nr_disconnect(sk, ECONNREFUSED);
break;
+	case NR_RESET:
+		/* Honour a peer's reset only when the reset sysctl is enabled. */
+		if (sysctl_netrom_reset_circuit)
+			nr_disconnect(sk, ECONNRESET);
+		break;
+
default:
break;
}
@@ -125,6 +129,11 @@ static int nr_state2_machine(struct sock *sk, struct sk_buff *skb,
nr_disconnect(sk, 0);
break;
+	case NR_RESET:
+		/* Honour a peer's reset only when the reset sysctl is enabled. */
+		if (sysctl_netrom_reset_circuit)
+			nr_disconnect(sk, ECONNRESET);
+		break;
+
default:
break;
}
@@ -255,6 +264,11 @@ static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype
}
break;
+	case NR_RESET:
+		/* Honour a peer's reset only when the reset sysctl is enabled. */
+		if (sysctl_netrom_reset_circuit)
+			nr_disconnect(sk, ECONNRESET);
+		break;
+
default:
break;
}
diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c
index 165b2abce11..e856ae1b360 100644
--- a/net/netrom/nr_loopback.c
+++ b/net/netrom/nr_loopback.c
@@ -17,7 +17,7 @@
static void nr_loopback_timer(unsigned long);
static struct sk_buff_head loopback_queue;
-static struct timer_list loopback_timer = TIMER_INITIALIZER(nr_loopback_timer, 0, 0);
+static DEFINE_TIMER(loopback_timer, nr_loopback_timer, 0, 0);
void __init nr_loopback_init(void)
{
diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c
index 7a86b36cba5..b3b9097c87c 100644
--- a/net/netrom/nr_route.c
+++ b/net/netrom/nr_route.c
@@ -881,6 +881,7 @@ static void nr_node_stop(struct seq_file *seq, void *v)
static int nr_node_show(struct seq_file *seq, void *v)
{
+ char buf[11];
int i;
if (v == SEQ_START_TOKEN)
@@ -890,7 +891,7 @@ static int nr_node_show(struct seq_file *seq, void *v)
struct nr_node *nr_node = v;
nr_node_lock(nr_node);
seq_printf(seq, "%-9s %-7s %d %d",
- ax2asc(&nr_node->callsign),
+ ax2asc(buf, &nr_node->callsign),
(nr_node->mnemonic[0] == '\0') ? "*" : nr_node->mnemonic,
nr_node->which + 1,
nr_node->count);
@@ -964,6 +965,7 @@ static void nr_neigh_stop(struct seq_file *seq, void *v)
static int nr_neigh_show(struct seq_file *seq, void *v)
{
+ char buf[11];
int i;
if (v == SEQ_START_TOKEN)
@@ -973,7 +975,7 @@ static int nr_neigh_show(struct seq_file *seq, void *v)
seq_printf(seq, "%05d %-9s %-4s %3d %d %3d %3d",
nr_neigh->number,
- ax2asc(&nr_neigh->callsign),
+ ax2asc(buf, &nr_neigh->callsign),
nr_neigh->dev ? nr_neigh->dev->name : "???",
nr_neigh->quality,
nr_neigh->locked,
@@ -983,7 +985,7 @@ static int nr_neigh_show(struct seq_file *seq, void *v)
if (nr_neigh->digipeat != NULL) {
for (i = 0; i < nr_neigh->digipeat->ndigi; i++)
seq_printf(seq, " %s",
- ax2asc(&nr_neigh->digipeat->calls[i]));
+ ax2asc(buf, &nr_neigh->digipeat->calls[i]));
}
seq_puts(seq, "\n");
diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c
index 0627347b14b..bcb9946b4f5 100644
--- a/net/netrom/nr_subr.c
+++ b/net/netrom/nr_subr.c
@@ -21,7 +21,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
@@ -77,7 +77,7 @@ void nr_requeue_frames(struct sock *sk)
if (skb_prev == NULL)
skb_queue_head(&sk->sk_write_queue, skb);
else
- skb_append(skb_prev, skb);
+ skb_append(skb_prev, skb, &sk->sk_write_queue);
skb_prev = skb;
}
}
@@ -210,10 +210,9 @@ void nr_write_internal(struct sock *sk, int frametype)
}
/*
- * This routine is called when a Connect Acknowledge with the Choke Flag
- * set is needed to refuse a connection.
+ * This routine is called to send an error reply.
*/
-void nr_transmit_refusal(struct sk_buff *skb, int mine)
+void __nr_transmit_reply(struct sk_buff *skb, int mine, unsigned char cmdflags)
{
struct sk_buff *skbn;
unsigned char *dptr;
@@ -254,7 +253,7 @@ void nr_transmit_refusal(struct sk_buff *skb, int mine)
*dptr++ = 0;
}
- *dptr++ = NR_CONNACK | NR_CHOKE_FLAG;
+ *dptr++ = cmdflags;
*dptr++ = 0;
if (!nr_route_frame(skbn, NULL))
diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c
index faabda8088b..75b72d389ba 100644
--- a/net/netrom/nr_timer.c
+++ b/net/netrom/nr_timer.c
@@ -22,7 +22,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
diff --git a/net/netrom/sysctl_net_netrom.c b/net/netrom/sysctl_net_netrom.c
index c9ed50382ea..6bb8dda849d 100644
--- a/net/netrom/sysctl_net_netrom.c
+++ b/net/netrom/sysctl_net_netrom.c
@@ -30,6 +30,7 @@ static int min_idle[] = {0 * HZ};
static int max_idle[] = {65535 * HZ};
static int min_route[] = {0}, max_route[] = {1};
static int min_fails[] = {1}, max_fails[] = {10};
+static int min_reset[] = {0}, max_reset[] = {1};
static struct ctl_table_header *nr_table_header;
@@ -155,6 +156,17 @@ static ctl_table nr_table[] = {
.extra1 = &min_fails,
.extra2 = &max_fails
},
+ {
+ .ctl_name = NET_NETROM_RESET,
+ .procname = "reset",
+ .data = &sysctl_netrom_reset_circuit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_reset,
+ .extra2 = &max_reset
+ },
{ .ctl_name = 0 }
};
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c9d5980aa4d..499ae3df4a4 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -36,6 +36,11 @@
* Michal Ostrowski : Module initialization cleanup.
* Ulises Alonso : Frame number limit removal and
* packet_set_ring memory leak.
+ * Eric Biederman : Allow for > 8 byte hardware addresses.
+ * The convention is that longer addresses
+ * will simply extend the hardware address
+ * byte arrays at the end of sockaddr_ll
+ * and packet_mreq.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -161,7 +166,17 @@ struct packet_mclist
int count;
unsigned short type;
unsigned short alen;
- unsigned char addr[8];
+ unsigned char addr[MAX_ADDR_LEN];
+};
+/* identical to struct packet_mreq except it has
+ * a longer address field.
+ */
+struct packet_mreq_max
+{
+ int mr_ifindex;
+ unsigned short mr_type;
+ unsigned short mr_alen;
+ unsigned char mr_address[MAX_ADDR_LEN];
};
#endif
#ifdef CONFIG_PACKET_MMAP
@@ -241,7 +256,7 @@ static struct proto_ops packet_ops;
#ifdef CONFIG_SOCK_PACKET
static struct proto_ops packet_ops_spkt;
-static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct sock *sk;
struct sockaddr_pkt *spkt;
@@ -441,7 +456,7 @@ static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned
we will not harm anyone.
*/
-static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct sock *sk;
struct sockaddr_ll *sll;
@@ -546,7 +561,7 @@ drop:
}
#ifdef CONFIG_PACKET_MMAP
-static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct sock *sk;
struct packet_sock *po;
@@ -635,12 +650,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct pack
h->tp_snaplen = snaplen;
h->tp_mac = macoff;
h->tp_net = netoff;
- if (skb->stamp.tv_sec == 0) {
- do_gettimeofday(&skb->stamp);
+ if (skb->tstamp.off_sec == 0) {
+ __net_timestamp(skb);
sock_enable_timestamp(sk);
}
- h->tp_sec = skb->stamp.tv_sec;
- h->tp_usec = skb->stamp.tv_usec;
+ h->tp_sec = skb->tstamp.off_sec;
+ h->tp_usec = skb->tstamp.off_usec;
sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
sll->sll_halen = 0;
@@ -716,6 +731,8 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
err = -EINVAL;
if (msg->msg_namelen < sizeof(struct sockaddr_ll))
goto out;
+ if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
+ goto out;
ifindex = saddr->sll_ifindex;
proto = saddr->sll_protocol;
addr = saddr->sll_addr;
@@ -1045,6 +1062,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
struct sock *sk = sock->sk;
struct sk_buff *skb;
int copied, err;
+ struct sockaddr_ll *sll;
err = -EINVAL;
if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
@@ -1057,16 +1075,6 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
#endif
/*
- * If the address length field is there to be filled in, we fill
- * it in now.
- */
-
- if (sock->type == SOCK_PACKET)
- msg->msg_namelen = sizeof(struct sockaddr_pkt);
- else
- msg->msg_namelen = sizeof(struct sockaddr_ll);
-
- /*
* Call the generic datagram receiver. This handles all sorts
* of horrible races and re-entrancy so we can forget about it
* in the protocol layers.
@@ -1087,6 +1095,17 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
goto out;
/*
+ * If the address length field is there to be filled in, we fill
+ * it in now.
+ */
+
+ sll = (struct sockaddr_ll*)skb->cb;
+ if (sock->type == SOCK_PACKET)
+ msg->msg_namelen = sizeof(struct sockaddr_pkt);
+ else
+ msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
+
+ /*
* You lose any data beyond the buffer you gave. If it worries a
* user program they can ask the device for its MTU anyway.
*/
@@ -1166,7 +1185,7 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
sll->sll_halen = 0;
}
- *uaddr_len = sizeof(*sll);
+ *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
return 0;
}
@@ -1199,7 +1218,7 @@ static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, i
}
}
-static int packet_mc_add(struct sock *sk, struct packet_mreq *mreq)
+static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
struct packet_sock *po = pkt_sk(sk);
struct packet_mclist *ml, *i;
@@ -1249,7 +1268,7 @@ done:
return err;
}
-static int packet_mc_drop(struct sock *sk, struct packet_mreq *mreq)
+static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
struct packet_mclist *ml, **mlp;
@@ -1315,11 +1334,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
case PACKET_ADD_MEMBERSHIP:
case PACKET_DROP_MEMBERSHIP:
{
- struct packet_mreq mreq;
- if (optlen<sizeof(mreq))
+ struct packet_mreq_max mreq;
+ int len = optlen;
+ memset(&mreq, 0, sizeof(mreq));
+ if (len < sizeof(struct packet_mreq))
return -EINVAL;
- if (copy_from_user(&mreq,optval,sizeof(mreq)))
+ if (len > sizeof(mreq))
+ len = sizeof(mreq);
+ if (copy_from_user(&mreq,optval,len))
return -EFAULT;
+ if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
+ return -EINVAL;
if (optname == PACKET_ADD_MEMBERSHIP)
ret = packet_mc_add(sk, &mreq);
else
@@ -1535,8 +1560,7 @@ static unsigned int packet_poll(struct file * file, struct socket *sock,
static void packet_mm_open(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
- struct inode *inode = file->f_dentry->d_inode;
- struct socket * sock = SOCKET_I(inode);
+ struct socket * sock = file->private_data;
struct sock *sk = sock->sk;
if (sk)
@@ -1546,8 +1570,7 @@ static void packet_mm_open(struct vm_area_struct *vma)
static void packet_mm_close(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
- struct inode *inode = file->f_dentry->d_inode;
- struct socket * sock = SOCKET_I(inode);
+ struct socket * sock = file->private_data;
struct sock *sk = sock->sk;
if (sk)
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 7eb6a5bf93e..829fdbc4400 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -41,7 +41,7 @@
#include <net/rose.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/ip.h>
#include <net/arp.h>
@@ -556,12 +556,7 @@ static struct sock *rose_make_new(struct sock *osk)
sk->sk_sndbuf = osk->sk_sndbuf;
sk->sk_state = TCP_ESTABLISHED;
sk->sk_sleep = osk->sk_sleep;
-
- if (sock_flag(osk, SOCK_ZAPPED))
- sock_set_flag(sk, SOCK_ZAPPED);
-
- if (sock_flag(osk, SOCK_DBG))
- sock_set_flag(sk, SOCK_DBG);
+ sock_copy_flags(sk, osk);
init_timer(&rose->timer);
init_timer(&rose->idletimer);
@@ -631,7 +626,8 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct rose_sock *rose = rose_sk(sk);
struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr;
struct net_device *dev;
- ax25_address *user, *source;
+ ax25_address *source;
+ ax25_uid_assoc *user;
int n;
if (!sock_flag(sk, SOCK_ZAPPED))
@@ -656,14 +652,17 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
source = &addr->srose_call;
- if ((user = ax25_findbyuid(current->euid)) == NULL) {
+ user = ax25_findbyuid(current->euid);
+ if (user) {
+ rose->source_call = user->call;
+ ax25_uid_put(user);
+ } else {
if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE))
return -EACCES;
- user = source;
+ rose->source_call = *source;
}
rose->source_addr = addr->srose_addr;
- rose->source_call = *user;
rose->device = dev;
rose->source_ndigis = addr->srose_ndigis;
@@ -690,8 +689,8 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
struct rose_sock *rose = rose_sk(sk);
struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr;
unsigned char cause, diagnostic;
- ax25_address *user;
struct net_device *dev;
+ ax25_uid_assoc *user;
int n;
if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
@@ -741,12 +740,14 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
if ((dev = rose_dev_first()) == NULL)
return -ENETUNREACH;
- if ((user = ax25_findbyuid(current->euid)) == NULL)
+ user = ax25_findbyuid(current->euid);
+ if (!user)
return -EINVAL;
memcpy(&rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN);
- rose->source_call = *user;
+ rose->source_call = user->call;
rose->device = dev;
+ ax25_uid_put(user);
rose_insert_socket(sk); /* Finish the bind */
}
@@ -1242,7 +1243,7 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
if (amount < 0)
amount = 0;
- return put_user(amount, (unsigned int __user *)argp);
+ return put_user(amount, (unsigned int __user *) argp);
}
case TIOCINQ: {
@@ -1251,13 +1252,11 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
/* These two are safe on a single CPU system as only user tasks fiddle here */
if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
amount = skb->len;
- return put_user(amount, (unsigned int __user *)argp);
+ return put_user(amount, (unsigned int __user *) argp);
}
case SIOCGSTAMP:
- if (sk != NULL)
- return sock_get_timestamp(sk, (struct timeval __user *)argp);
- return -EINVAL;
+ return sock_get_timestamp(sk, (struct timeval __user *) argp);
case SIOCGIFADDR:
case SIOCSIFADDR:
@@ -1362,6 +1361,8 @@ static void rose_info_stop(struct seq_file *seq, void *v)
static int rose_info_show(struct seq_file *seq, void *v)
{
+ char buf[11];
+
if (v == SEQ_START_TOKEN)
seq_puts(seq,
"dest_addr dest_call src_addr src_call dev lci neigh st vs vr va t t1 t2 t3 hb idle Snd-Q Rcv-Q inode\n");
@@ -1379,12 +1380,12 @@ static int rose_info_show(struct seq_file *seq, void *v)
seq_printf(seq, "%-10s %-9s ",
rose2asc(&rose->dest_addr),
- ax2asc(&rose->dest_call));
+ ax2asc(buf, &rose->dest_call));
if (ax25cmp(&rose->source_call, &null_ax25_address) == 0)
callsign = "??????-?";
else
- callsign = ax2asc(&rose->source_call);
+ callsign = ax2asc(buf, &rose->source_call);
seq_printf(seq,
"%-10s %-9s %-5s %3.3X %05d %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %ld\n",
@@ -1471,22 +1472,25 @@ static const char banner[] = KERN_INFO "F6FBB/G4KLX ROSE for Linux. Version 0.62
static int __init rose_proto_init(void)
{
int i;
- int rc = proto_register(&rose_proto, 0);
+ int rc;
+
+ if (rose_ndevs > 0x7FFFFFFF/sizeof(struct net_device *)) {
+ printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter to large\n");
+ rc = -EINVAL;
+ goto out;
+ }
+ rc = proto_register(&rose_proto, 0);
if (rc != 0)
goto out;
rose_callsign = null_ax25_address;
- if (rose_ndevs > 0x7FFFFFFF/sizeof(struct net_device *)) {
- printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter to large\n");
- return -1;
- }
-
dev_rose = kmalloc(rose_ndevs * sizeof(struct net_device *), GFP_KERNEL);
if (dev_rose == NULL) {
printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate device structure\n");
- return -1;
+ rc = -ENOMEM;
+ goto out_proto_unregister;
}
memset(dev_rose, 0x00, rose_ndevs * sizeof(struct net_device*));
@@ -1499,10 +1503,12 @@ static int __init rose_proto_init(void)
name, rose_setup);
if (!dev) {
printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate memory\n");
+ rc = -ENOMEM;
goto fail;
}
- if (register_netdev(dev)) {
- printk(KERN_ERR "ROSE: netdevice regeistration failed\n");
+ rc = register_netdev(dev);
+ if (rc) {
+ printk(KERN_ERR "ROSE: netdevice registration failed\n");
free_netdev(dev);
goto fail;
}
@@ -1535,8 +1541,9 @@ fail:
free_netdev(dev_rose[i]);
}
kfree(dev_rose);
+out_proto_unregister:
proto_unregister(&rose_proto);
- return -ENOMEM;
+ goto out;
}
module_init(rose_proto_init);
diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c
index a8ed9a1d09f..d297af737d1 100644
--- a/net/rose/rose_dev.c
+++ b/net/rose/rose_dev.c
@@ -149,6 +149,6 @@ void rose_setup(struct net_device *dev)
dev->set_mac_address = rose_set_mac_address;
/* New-style flags. */
- dev->flags = 0;
+ dev->flags = IFF_NOARP;
dev->get_stats = rose_get_stats;
}
diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c
index ef475a1bb1b..8348d33f1ef 100644
--- a/net/rose/rose_in.c
+++ b/net/rose/rose_in.c
@@ -26,8 +26,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/ip.h> /* For ip_rcv */
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/system.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index ff73ebb912b..b18fe504301 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -24,7 +24,7 @@
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/fcntl.h>
@@ -727,7 +727,7 @@ int rose_rt_ioctl(unsigned int cmd, void __user *arg)
}
if (rose_route.mask > 10) /* Mask can't be more than 10 digits */
return -EINVAL;
- if (rose_route.ndigis > 8) /* No more than 8 digipeats */
+ if (rose_route.ndigis > AX25_MAX_DIGIS)
return -EINVAL;
err = rose_add_node(&rose_route, dev);
dev_put(dev);
@@ -851,6 +851,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
unsigned char cause, diagnostic;
struct net_device *dev;
int len, res = 0;
+ char buf[11];
#if 0
if (call_in_firewall(PF_ROSE, skb->dev, skb->data, NULL, &skb) != FW_ACCEPT)
@@ -876,7 +877,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
if (rose_neigh == NULL) {
printk("rose_route : unknown neighbour or device %s\n",
- ax2asc(&ax25->dest_addr));
+ ax2asc(buf, &ax25->dest_addr));
goto out;
}
@@ -994,8 +995,10 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
* 1. The frame isn't for us,
* 2. It isn't "owned" by any existing route.
*/
- if (frametype != ROSE_CALL_REQUEST) /* XXX */
- return 0;
+ if (frametype != ROSE_CALL_REQUEST) { /* XXX */
+ res = 0;
+ goto out;
+ }
len = (((skb->data[3] >> 4) & 0x0F) + 1) / 2;
len += (((skb->data[3] >> 0) & 0x0F) + 1) / 2;
@@ -1176,6 +1179,7 @@ static void rose_neigh_stop(struct seq_file *seq, void *v)
static int rose_neigh_show(struct seq_file *seq, void *v)
{
+ char buf[11];
int i;
if (v == SEQ_START_TOKEN)
@@ -1187,7 +1191,7 @@ static int rose_neigh_show(struct seq_file *seq, void *v)
/* if (!rose_neigh->loopback) { */
seq_printf(seq, "%05d %-9s %-4s %3d %3d %3s %3s %3lu %3lu",
rose_neigh->number,
- (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(&rose_neigh->callsign),
+ (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(buf, &rose_neigh->callsign),
rose_neigh->dev ? rose_neigh->dev->name : "???",
rose_neigh->count,
rose_neigh->use,
@@ -1198,7 +1202,7 @@ static int rose_neigh_show(struct seq_file *seq, void *v)
if (rose_neigh->digipeat != NULL) {
for (i = 0; i < rose_neigh->digipeat->ndigi; i++)
- seq_printf(seq, " %s", ax2asc(&rose_neigh->digipeat->calls[i]));
+ seq_printf(seq, " %s", ax2asc(buf, &rose_neigh->digipeat->calls[i]));
}
seq_puts(seq, "\n");
@@ -1258,6 +1262,8 @@ static void rose_route_stop(struct seq_file *seq, void *v)
static int rose_route_show(struct seq_file *seq, void *v)
{
+ char buf[11];
+
if (v == SEQ_START_TOKEN)
seq_puts(seq,
"lci address callsign neigh <-> lci address callsign neigh\n");
@@ -1269,7 +1275,7 @@ static int rose_route_show(struct seq_file *seq, void *v)
"%3.3X %-10s %-9s %05d ",
rose_route->lci1,
rose2asc(&rose_route->src_addr),
- ax2asc(&rose_route->src_call),
+ ax2asc(buf, &rose_route->src_call),
rose_route->neigh1->number);
else
seq_puts(seq,
@@ -1280,7 +1286,7 @@ static int rose_route_show(struct seq_file *seq, void *v)
"%3.3X %-10s %-9s %05d\n",
rose_route->lci2,
rose2asc(&rose_route->dest_addr),
- ax2asc(&rose_route->dest_call),
+ ax2asc(buf, &rose_route->dest_call),
rose_route->neigh2->number);
else
seq_puts(seq,
diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c
index 7db7e1cedc3..36a77944622 100644
--- a/net/rose/rose_subr.c
+++ b/net/rose/rose_subr.c
@@ -21,7 +21,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/system.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
@@ -74,7 +74,7 @@ void rose_requeue_frames(struct sock *sk)
if (skb_prev == NULL)
skb_queue_head(&sk->sk_write_queue, skb);
else
- skb_append(skb_prev, skb);
+ skb_append(skb_prev, skb, &sk->sk_write_queue);
skb_prev = skb;
}
}
@@ -337,13 +337,13 @@ static int rose_parse_ccitt(unsigned char *p, struct rose_facilities_struct *fac
memcpy(&facilities->source_addr, p + 7, ROSE_ADDR_LEN);
memcpy(callsign, p + 12, l - 10);
callsign[l - 10] = '\0';
- facilities->source_call = *asc2ax(callsign);
+ asc2ax(&facilities->source_call, callsign);
}
if (*p == FAC_CCITT_SRC_NSAP) {
memcpy(&facilities->dest_addr, p + 7, ROSE_ADDR_LEN);
memcpy(callsign, p + 12, l - 10);
callsign[l - 10] = '\0';
- facilities->dest_call = *asc2ax(callsign);
+ asc2ax(&facilities->dest_call, callsign);
}
p += l + 2;
n += l + 2;
@@ -400,6 +400,7 @@ static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose)
{
unsigned char *p = buffer + 1;
char *callsign;
+ char buf[11];
int len, nb;
/* National Facilities */
@@ -456,7 +457,7 @@ static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose)
*p++ = FAC_CCITT_DEST_NSAP;
- callsign = ax2asc(&rose->dest_call);
+ callsign = ax2asc(buf, &rose->dest_call);
*p++ = strlen(callsign) + 10;
*p++ = (strlen(callsign) + 9) * 2; /* ??? */
@@ -471,7 +472,7 @@ static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose)
*p++ = FAC_CCITT_SRC_NSAP;
- callsign = ax2asc(&rose->source_call);
+ callsign = ax2asc(buf, &rose->source_call);
*p++ = strlen(callsign) + 10;
*p++ = (strlen(callsign) + 9) * 2; /* ??? */
diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c
index 84dd4403f79..b6c8f38cc26 100644
--- a/net/rose/rose_timer.c
+++ b/net/rose/rose_timer.c
@@ -22,7 +22,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/system.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
@@ -138,6 +138,7 @@ static void rose_heartbeat_expiry(unsigned long param)
is accepted() it isn't 'dead' so doesn't get removed. */
if (sock_flag(sk, SOCK_DESTROY) ||
(sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) {
+ bh_unlock_sock(sk);
rose_destroy_socket(sk);
return;
}
diff --git a/net/rxrpc/call.c b/net/rxrpc/call.c
index 5cfd4cadee4..c4aeb7d4026 100644
--- a/net/rxrpc/call.c
+++ b/net/rxrpc/call.c
@@ -1923,7 +1923,7 @@ int rxrpc_call_write_data(struct rxrpc_call *call,
size_t sioc,
struct kvec *siov,
u8 rxhdr_flags,
- int alloc_flags,
+ gfp_t alloc_flags,
int dup_data,
size_t *size_sent)
{
diff --git a/net/rxrpc/connection.c b/net/rxrpc/connection.c
index 61463c74f8c..2ba14a75dbb 100644
--- a/net/rxrpc/connection.c
+++ b/net/rxrpc/connection.c
@@ -522,7 +522,7 @@ int rxrpc_conn_newmsg(struct rxrpc_connection *conn,
uint8_t type,
int dcount,
struct kvec diov[],
- int alloc_flags,
+ gfp_t alloc_flags,
struct rxrpc_message **_msg)
{
struct rxrpc_message *msg;
diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c
index 9bce7794130..122c086ee2d 100644
--- a/net/rxrpc/transport.c
+++ b/net/rxrpc/transport.c
@@ -330,7 +330,7 @@ static int rxrpc_incoming_msg(struct rxrpc_transport *trans,
msg->trans = trans;
msg->state = RXRPC_MSG_RECEIVED;
- msg->stamp = pkt->stamp;
+ skb_get_timestamp(pkt, &msg->stamp);
if (msg->stamp.tv_sec == 0) {
do_gettimeofday(&msg->stamp);
if (pkt->sk)
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 59d3e71f8b8..7f34e7fd767 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -2,13 +2,15 @@
# Traffic control configuration.
#
-menuconfig NET_SCHED
+menu "QoS and/or fair queueing"
+
+config NET_SCHED
bool "QoS and/or fair queueing"
---help---
When the kernel has several packets to send out over a network
device, it has to decide which ones to send first, which ones to
- delay, and which ones to drop. This is the job of the packet
- scheduler, and several different algorithms for how to do this
+ delay, and which ones to drop. This is the job of the queueing
+ disciplines, several different algorithms for how to do this
"fairly" have been proposed.
If you say N here, you will get the standard packet scheduler, which
@@ -23,13 +25,13 @@ menuconfig NET_SCHED
To administer these schedulers, you'll need the user-level utilities
from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
That package also contains some documentation; for more, check out
- <http://snafu.freedom.org/linux2.2/iproute-notes.html>.
+ <http://linux-net.osdl.org/index.php/Iproute2>.
This Quality of Service (QoS) support will enable you to use
Differentiated Services (diffserv) and Resource Reservation Protocol
- (RSVP) on your Linux router if you also say Y to "QoS support",
- "Packet classifier API" and to some classifiers below. Documentation
- and software is at <http://diffserv.sourceforge.net/>.
+ (RSVP) on your Linux router if you also say Y to the corresponding
+ classifiers below. Documentation and software is at
+ <http://diffserv.sourceforge.net/>.
If you say Y here and to "/proc file system" below, you will be able
to read status information about packet schedulers from the file
@@ -42,7 +44,7 @@ choice
prompt "Packet scheduler clock source"
depends on NET_SCHED
default NET_SCH_CLK_JIFFIES
- help
+ ---help---
Packet schedulers need a monotonic clock that increments at a static
rate. The kernel provides several suitable interfaces, each with
different properties:
@@ -56,7 +58,7 @@ choice
config NET_SCH_CLK_JIFFIES
bool "Timer interrupt"
- help
+ ---help---
Say Y here if you want to use the timer interrupt (jiffies) as clock
source. This clock source is fast, synchronized on all processors and
handles cpu clock frequency changes, but its resolution is too low
@@ -64,7 +66,7 @@ config NET_SCH_CLK_JIFFIES
config NET_SCH_CLK_GETTIMEOFDAY
bool "gettimeofday"
- help
+ ---help---
Say Y here if you want to use gettimeofday as clock source. This clock
source has high resolution, is synchronized on all processors and
handles cpu clock frequency changes, but it is slow.
@@ -72,10 +74,12 @@ config NET_SCH_CLK_GETTIMEOFDAY
Choose this if you need a high resolution clock source but can't use
the CPU's cycle counter.
+# don't allow on SMP x86 because they can have unsynchronized TSCs.
+# gettimeofday is a good alternative
config NET_SCH_CLK_CPU
bool "CPU cycle counter"
- depends on X86_TSC || X86_64 || ALPHA || SPARC64 || PPC64 || IA64
- help
+ depends on ((X86_TSC || X86_64) && !SMP) || ALPHA || SPARC64 || PPC64 || IA64
+ ---help---
Say Y here if you want to use the CPU's cycle counter as clock source.
This is a cheap and high resolution clock source, but on some
architectures it is not synchronized on all processors and doesn't
@@ -93,134 +97,129 @@ config NET_SCH_CLK_CPU
endchoice
+comment "Queueing/Scheduling"
+ depends on NET_SCHED
+
config NET_SCH_CBQ
- tristate "CBQ packet scheduler"
+ tristate "Class Based Queueing (CBQ)"
depends on NET_SCHED
---help---
Say Y here if you want to use the Class-Based Queueing (CBQ) packet
- scheduling algorithm for some of your network devices. This
- algorithm classifies the waiting packets into a tree-like hierarchy
- of classes; the leaves of this tree are in turn scheduled by
- separate algorithms (called "disciplines" in this context).
+ scheduling algorithm. This algorithm classifies the waiting packets
+ into a tree-like hierarchy of classes; the leaves of this tree are
+ in turn scheduled by separate algorithms.
- See the top of <file:net/sched/sch_cbq.c> for references about the
- CBQ algorithm.
+ See the top of <file:net/sched/sch_cbq.c> for more details.
CBQ is a commonly used scheduler, so if you're unsure, you should
say Y here. Then say Y to all the queueing algorithms below that you
- want to use as CBQ disciplines. Then say Y to "Packet classifier
- API" and say Y to all the classifiers you want to use; a classifier
- is a routine that allows you to sort your outgoing traffic into
- classes based on a certain criterion.
+ want to use as leaf disciplines.
To compile this code as a module, choose M here: the
module will be called sch_cbq.
config NET_SCH_HTB
- tristate "HTB packet scheduler"
+ tristate "Hierarchical Token Bucket (HTB)"
depends on NET_SCHED
---help---
Say Y here if you want to use the Hierarchical Token Buckets (HTB)
- packet scheduling algorithm for some of your network devices. See
+ packet scheduling algorithm. See
<http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and
in-depth articles.
- HTB is very similar to the CBQ regarding its goals however is has
+ HTB is very similar to CBQ regarding its goals however it has
different properties and different algorithm.
To compile this code as a module, choose M here: the
module will be called sch_htb.
config NET_SCH_HFSC
- tristate "HFSC packet scheduler"
+ tristate "Hierarchical Fair Service Curve (HFSC)"
depends on NET_SCHED
---help---
Say Y here if you want to use the Hierarchical Fair Service Curve
- (HFSC) packet scheduling algorithm for some of your network devices.
+ (HFSC) packet scheduling algorithm.
To compile this code as a module, choose M here: the
module will be called sch_hfsc.
-#tristate ' H-PFQ packet scheduler' CONFIG_NET_SCH_HPFQ
config NET_SCH_ATM
- tristate "ATM pseudo-scheduler"
+ tristate "ATM Virtual Circuits (ATM)"
depends on NET_SCHED && ATM
---help---
Say Y here if you want to use the ATM pseudo-scheduler. This
- provides a framework for invoking classifiers (aka "filters"), which
- in turn select classes of this queuing discipline. Each class maps
- the flow(s) it is handling to a given virtual circuit (see the top of
- <file:net/sched/sch_atm.c>).
+ provides a framework for invoking classifiers, which in turn
+ select classes of this queuing discipline. Each class maps
+ the flow(s) it is handling to a given virtual circuit.
+
+ See the top of <file:net/sched/sch_atm.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_atm.
config NET_SCH_PRIO
- tristate "The simplest PRIO pseudoscheduler"
+ tristate "Multi Band Priority Queueing (PRIO)"
depends on NET_SCHED
- help
+ ---help---
Say Y here if you want to use an n-band priority queue packet
- "scheduler" for some of your network devices or as a leaf discipline
- for the CBQ scheduling algorithm. If unsure, say Y.
+ scheduler.
To compile this code as a module, choose M here: the
module will be called sch_prio.
config NET_SCH_RED
- tristate "RED queue"
+ tristate "Random Early Detection (RED)"
depends on NET_SCHED
- help
+ ---help---
Say Y here if you want to use the Random Early Detection (RED)
- packet scheduling algorithm for some of your network devices (see
- the top of <file:net/sched/sch_red.c> for details and references
- about the algorithm).
+ packet scheduling algorithm.
+
+ See the top of <file:net/sched/sch_red.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_red.
config NET_SCH_SFQ
- tristate "SFQ queue"
+ tristate "Stochastic Fairness Queueing (SFQ)"
depends on NET_SCHED
---help---
Say Y here if you want to use the Stochastic Fairness Queueing (SFQ)
- packet scheduling algorithm for some of your network devices or as a
- leaf discipline for the CBQ scheduling algorithm (see the top of
- <file:net/sched/sch_sfq.c> for details and references about the SFQ
- algorithm).
+ packet scheduling algorithm.
+
+ See the top of <file:net/sched/sch_sfq.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_sfq.
config NET_SCH_TEQL
- tristate "TEQL queue"
+ tristate "True Link Equalizer (TEQL)"
depends on NET_SCHED
---help---
Say Y here if you want to use the True Link Equalizer (TLE) packet
- scheduling algorithm for some of your network devices or as a leaf
- discipline for the CBQ scheduling algorithm. This queueing
- discipline allows the combination of several physical devices into
- one virtual device. (see the top of <file:net/sched/sch_teql.c> for
- details).
+ scheduling algorithm. This queueing discipline allows the combination
+ of several physical devices into one virtual device.
+
+ See the top of <file:net/sched/sch_teql.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_teql.
config NET_SCH_TBF
- tristate "TBF queue"
+ tristate "Token Bucket Filter (TBF)"
depends on NET_SCHED
- help
- Say Y here if you want to use the Simple Token Bucket Filter (TBF)
- packet scheduling algorithm for some of your network devices or as a
- leaf discipline for the CBQ scheduling algorithm (see the top of
- <file:net/sched/sch_tbf.c> for a description of the TBF algorithm).
+ ---help---
+ Say Y here if you want to use the Token Bucket Filter (TBF) packet
+ scheduling algorithm.
+
+ See the top of <file:net/sched/sch_tbf.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_tbf.
config NET_SCH_GRED
- tristate "GRED queue"
+ tristate "Generic Random Early Detection (GRED)"
depends on NET_SCHED
- help
+ ---help---
Say Y here if you want to use the Generic Random Early Detection
(GRED) packet scheduling algorithm for some of your network devices
(see the top of <file:net/sched/sch_red.c> for details and
@@ -230,9 +229,9 @@ config NET_SCH_GRED
module will be called sch_gred.
config NET_SCH_DSMARK
- tristate "Diffserv field marker"
+ tristate "Differentiated Services marker (DSMARK)"
depends on NET_SCHED
- help
+ ---help---
Say Y if you want to schedule packets according to the
Differentiated Services architecture proposed in RFC 2475.
Technical information on this method, with pointers to associated
@@ -242,9 +241,9 @@ config NET_SCH_DSMARK
module will be called sch_dsmark.
config NET_SCH_NETEM
- tristate "Network emulator"
+ tristate "Network emulator (NETEM)"
depends on NET_SCHED
- help
+ ---help---
Say Y if you want to emulate network delay, loss, and packet
re-ordering. This is often useful to simulate networks when
testing applications or protocols.
@@ -257,58 +256,23 @@ config NET_SCH_NETEM
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
depends on NET_SCHED
- help
- If you say Y here, you will be able to police incoming bandwidth
- and drop packets when this bandwidth exceeds your desired rate.
+ ---help---
+ Say Y here if you want to use classifiers for incoming packets.
If unsure, say Y.
To compile this code as a module, choose M here: the
module will be called sch_ingress.
-config NET_QOS
- bool "QoS support"
+comment "Classification"
depends on NET_SCHED
- ---help---
- Say Y here if you want to include Quality Of Service scheduling
- features, which means that you will be able to request certain
- rate-of-flow limits for your network devices.
-
- This Quality of Service (QoS) support will enable you to use
- Differentiated Services (diffserv) and Resource Reservation Protocol
- (RSVP) on your Linux router if you also say Y to "Packet classifier
- API" and to some classifiers below. Documentation and software is at
- <http://diffserv.sourceforge.net/>.
-
- Note that the answer to this question won't directly affect the
- kernel: saying N will just cause the configurator to skip all
- the questions about QoS support.
-
-config NET_ESTIMATOR
- bool "Rate estimator"
- depends on NET_QOS
- help
- In order for Quality of Service scheduling to work, the current
- rate-of-flow for a network device has to be estimated; if you say Y
- here, the kernel will do just that.
config NET_CLS
- bool "Packet classifier API"
- depends on NET_SCHED
- ---help---
- The CBQ scheduling algorithm requires that network packets which are
- scheduled to be sent out over a network device be classified
- according to some criterion. If you say Y here, you will get a
- choice of several different packet classifiers with the following
- questions.
-
- This will enable you to use Differentiated Services (diffserv) and
- Resource Reservation Protocol (RSVP) on your Linux router.
- Documentation and software is at
- <http://diffserv.sourceforge.net/>.
+ boolean
config NET_CLS_BASIC
- tristate "Basic classifier"
- depends on NET_CLS
+ tristate "Elementary classification (BASIC)"
+ depends NET_SCHED
+ select NET_CLS
---help---
Say Y here if you want to be able to classify packets using
only extended matches and actions.
@@ -317,24 +281,25 @@ config NET_CLS_BASIC
module will be called cls_basic.
config NET_CLS_TCINDEX
- tristate "TC index classifier"
- depends on NET_CLS
- help
- If you say Y here, you will be able to classify outgoing packets
- according to the tc_index field of the skb. You will want this
- feature if you want to implement Differentiated Services using
- sch_dsmark. If unsure, say Y.
+ tristate "Traffic-Control Index (TCINDEX)"
+ depends NET_SCHED
+ select NET_CLS
+ ---help---
+ Say Y here if you want to be able to classify packets based on
+ traffic control indices. You will want this feature if you want
+ to implement Differentiated Services together with DSMARK.
To compile this code as a module, choose M here: the
module will be called cls_tcindex.
config NET_CLS_ROUTE4
- tristate "Routing table based classifier"
- depends on NET_CLS
+ tristate "Routing decision (ROUTE)"
+ depends NET_SCHED
select NET_CLS_ROUTE
- help
- If you say Y here, you will be able to classify outgoing packets
- according to the route table entry they matched. If unsure, say Y.
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets
+ according to the route table entry they matched.
To compile this code as a module, choose M here: the
module will be called cls_route.
@@ -344,58 +309,45 @@ config NET_CLS_ROUTE
default n
config NET_CLS_FW
- tristate "Firewall based classifier"
- depends on NET_CLS
- help
- If you say Y here, you will be able to classify outgoing packets
- according to firewall criteria you specified.
+ tristate "Netfilter mark (FW)"
+ depends NET_SCHED
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets
+ according to netfilter/firewall marks.
To compile this code as a module, choose M here: the
module will be called cls_fw.
config NET_CLS_U32
- tristate "U32 classifier"
- depends on NET_CLS
- help
- If you say Y here, you will be able to classify outgoing packets
- according to their destination address. If unsure, say Y.
+ tristate "Universal 32bit comparisons w/ hashing (U32)"
+ depends NET_SCHED
+ select NET_CLS
+ ---help---
+ Say Y here to be able to classify packets using a universal
+ 32bit pieces based comparison scheme.
To compile this code as a module, choose M here: the
module will be called cls_u32.
config CLS_U32_PERF
- bool "U32 classifier performance counters"
+ bool "Performance counters support"
depends on NET_CLS_U32
- help
- gathers stats that could be used to tune u32 classifier performance.
- Requires a new iproute2
- You MUST NOT turn this on if you dont have an update iproute2.
-
-config NET_CLS_IND
- bool "classify input device (slows things u32/fw) "
- depends on NET_CLS_U32 || NET_CLS_FW
- help
- This option will be killed eventually when a
- metadata action appears because it slows things a little
- Available only for u32 and fw classifiers.
- Requires a new iproute2
- You MUST NOT turn this on if you dont have an update iproute2.
+ ---help---
+ Say Y here to make u32 gather additional statistics useful for
+ fine tuning u32 classifiers.
config CLS_U32_MARK
- bool "Use nfmark as a key in U32 classifier"
+ bool "Netfilter marks support"
depends on NET_CLS_U32 && NETFILTER
- help
- This allows you to match mark in a u32 filter.
- Example:
- tc filter add dev eth0 protocol ip parent 1:0 prio 5 u32 \
- match mark 0x0090 0xffff \
- match ip dst 4.4.4.4 \
- flowid 1:90
- You must use a new iproute2 to use this feature.
+ ---help---
+ Say Y here to be able to use netfilter marks as u32 key.
config NET_CLS_RSVP
- tristate "Special RSVP classifier"
- depends on NET_CLS && NET_QOS
+ tristate "IPv4 Resource Reservation Protocol (RSVP)"
+ depends on NET_SCHED
+ select NET_CLS
+ select NET_ESTIMATOR
---help---
The Resource Reservation Protocol (RSVP) permits end systems to
request a minimum and maximum data flow rate for a connection; this
@@ -408,31 +360,33 @@ config NET_CLS_RSVP
module will be called cls_rsvp.
config NET_CLS_RSVP6
- tristate "Special RSVP classifier for IPv6"
- depends on NET_CLS && NET_QOS
+ tristate "IPv6 Resource Reservation Protocol (RSVP6)"
+ depends on NET_SCHED
+ select NET_CLS
+ select NET_ESTIMATOR
---help---
The Resource Reservation Protocol (RSVP) permits end systems to
request a minimum and maximum data flow rate for a connection; this
is important for real time data such as streaming sound or video.
Say Y here if you want to be able to classify outgoing packets based
- on their RSVP requests and you are using the new Internet Protocol
- IPv6 as opposed to the older and more common IPv4.
+ on their RSVP requests and you are using IPv6.
To compile this code as a module, choose M here: the
module will be called cls_rsvp6.
config NET_EMATCH
bool "Extended Matches"
- depends on NET_CLS
+ depends NET_SCHED
+ select NET_CLS
---help---
Say Y here if you want to use extended matches on top of classifiers
and select the extended matches below.
Extended matches are small classification helpers not worth writing
- a separate classifier.
+ a separate classifier for.
- You must have a recent version of the iproute2 tools in order to use
+ A recent version of the iproute2 package is required to use
extended matches.
config NET_EMATCH_STACK
@@ -466,7 +420,7 @@ config NET_EMATCH_NBYTE
module will be called em_nbyte.
config NET_EMATCH_U32
- tristate "U32 hashing key"
+ tristate "U32 key"
depends on NET_EMATCH
---help---
Say Y here if you want to be able to classify packets using
@@ -491,78 +445,123 @@ config NET_EMATCH_TEXT
depends on NET_EMATCH
select TEXTSEARCH
select TEXTSEARCH_KMP
+ select TEXTSEARCH_BM
select TEXTSEARCH_FSM
---help---
- Say Y here if you want to be ablt to classify packets based on
+ Say Y here if you want to be able to classify packets based on
textsearch comparisons.
To compile this code as a module, choose M here: the
module will be called em_text.
config NET_CLS_ACT
- bool "Packet ACTION"
- depends on EXPERIMENTAL && NET_CLS && NET_QOS
+ bool "Actions"
+ depends on EXPERIMENTAL && NET_SCHED
+ select NET_ESTIMATOR
---help---
- This option requires you have a new iproute2. It enables
- tc extensions which can be used with tc classifiers.
- You MUST NOT turn this on if you dont have an update iproute2.
+ Say Y here if you want to use traffic control actions. Actions
+ get attached to classifiers and are invoked after a successful
+ classification. They are used to overwrite the classification
+ result, instantly drop or redirect packets, etc.
+
+ A recent version of the iproute2 package is required to use
+ actions.
config NET_ACT_POLICE
- tristate "Policing Actions"
+ tristate "Traffic Policing"
depends on NET_CLS_ACT
---help---
- If you are using a newer iproute2 select this one, otherwise use one
- below to select a policer.
- You MUST NOT turn this on if you dont have an update iproute2.
+ Say Y here if you want to do traffic policing, i.e. strict
+ bandwidth limiting. This action replaces the existing policing
+ module.
+
+ To compile this code as a module, choose M here: the
+ module will be called police.
config NET_ACT_GACT
- tristate "generic Actions"
+ tristate "Generic actions"
depends on NET_CLS_ACT
---help---
- You must have new iproute2 to use this feature.
- This adds simple filtering actions like drop, accept etc.
+ Say Y here to take generic actions such as dropping and
+ accepting packets.
+
+ To compile this code as a module, choose M here: the
+ module will be called gact.
config GACT_PROB
- bool "generic Actions probability"
+ bool "Probability support"
depends on NET_ACT_GACT
---help---
- Allows generic actions to be randomly or deterministically used.
+ Say Y here to use the generic action randomly or deterministically.
config NET_ACT_MIRRED
- tristate "Packet In/Egress redirecton/mirror Actions"
+ tristate "Redirecting and Mirroring"
depends on NET_CLS_ACT
---help---
- requires new iproute2
- This allows packets to be mirrored or redirected to netdevices
+ Say Y here to allow packets to be mirrored or redirected to
+ other devices.
+
+ To compile this code as a module, choose M here: the
+ module will be called mirred.
config NET_ACT_IPT
- tristate "iptables Actions"
+ tristate "IPtables targets"
depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
---help---
- requires new iproute2
- This allows iptables targets to be used by tc filters
+ Say Y here to be able to invoke iptables targets after successful
+ classification.
+
+ To compile this code as a module, choose M here: the
+ module will be called ipt.
config NET_ACT_PEDIT
- tristate "Generic Packet Editor Actions"
+ tristate "Packet Editing"
depends on NET_CLS_ACT
---help---
- requires new iproute2
- This allows for packets to be generically edited
+ Say Y here if you want to mangle the content of packets.
-config NET_CLS_POLICE
- bool "Traffic policing (needed for in/egress)"
- depends on NET_CLS && NET_QOS && NET_CLS_ACT!=y
- help
- Say Y to support traffic policing (bandwidth limits). Needed for
- ingress and egress rate limiting.
+ To compile this code as a module, choose M here: the
+ module will be called pedit.
config NET_ACT_SIMP
- tristate "Simple action"
+ tristate "Simple Example (Debug)"
depends on NET_CLS_ACT
---help---
- You must have new iproute2 to use this feature.
- This adds a very simple action for demonstration purposes
- The idea is to give action authors a basic example to look at.
- All this action will do is print on the console the configured
- policy string followed by _ then packet count.
+ Say Y here to add a simple action for demonstration purposes.
+ It is meant as an example and for debugging purposes. It will
+ print a configured policy string followed by the packet count
+ to the console for every packet that passes by.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called simple.
+
+config NET_CLS_POLICE
+ bool "Traffic Policing (obsolete)"
+ depends on NET_SCHED && NET_CLS_ACT!=y
+ select NET_ESTIMATOR
+ ---help---
+ Say Y here if you want to do traffic policing, i.e. strict
+ bandwidth limiting. This option is obsoleted by the traffic
+ policer implemented as action, it stays here for compatibility
+ reasons.
+
+config NET_CLS_IND
+ bool "Incoming device classification"
+ depends on NET_SCHED && (NET_CLS_U32 || NET_CLS_FW)
+ ---help---
+ Say Y here to extend the u32 and fw classifier to support
+ classification based on the incoming device. This option is
+ likely to disappear in favour of the metadata ematch.
+
+config NET_ESTIMATOR
+ bool "Rate estimator"
+ depends on NET_SCHED
+ ---help---
+ Say Y here to allow using rate estimators to estimate the current
+ rate-of-flow for network devices, queues, etc. This module is
+ automatically selected if needed but can be selected manually for
+ statistical purposes.
+endmenu
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 249c61936ea..8aebe8f6d27 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -165,7 +165,7 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action *act,
while ((a = act) != NULL) {
repeat:
if (a->ops && a->ops->act) {
- ret = a->ops->act(&skb, a);
+ ret = a->ops->act(&skb, a, res);
if (TC_MUNGED & skb->tc_verd) {
/* copied already, allow trampling */
skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
@@ -179,11 +179,6 @@ repeat:
act = a->next;
}
exec_done:
- if (skb->tc_classid > 0) {
- res->classid = skb->tc_classid;
- res->class = 0;
- skb->tc_classid = 0;
- }
return ret;
}
@@ -598,7 +593,7 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid)
nlh->nlmsg_flags |= NLM_F_ROOT;
module_put(a->ops->owner);
kfree(a);
- err = rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ err = rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
if (err > 0)
return 0;
@@ -661,7 +656,7 @@ tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event)
/* now do the delete */
tcf_action_destroy(head, 0);
- ret = rtnetlink_send(skb, pid, RTMGRP_TC,
+ ret = rtnetlink_send(skb, pid, RTNLGRP_TC,
n->nlmsg_flags&NLM_F_ECHO);
if (ret > 0)
return 0;
@@ -703,9 +698,9 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
x->rta_len = skb->tail - (u8*)x;
nlh->nlmsg_len = skb->tail - b;
- NETLINK_CB(skb).dst_groups = RTMGRP_TC;
+ NETLINK_CB(skb).dst_group = RTNLGRP_TC;
- err = rtnetlink_send(skb, pid, RTMGRP_TC, flags&NLM_F_ECHO);
+ err = rtnetlink_send(skb, pid, RTNLGRP_TC, flags&NLM_F_ECHO);
if (err > 0)
err = 0;
return err;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3b5714ef4d1..b4d89fbb378 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -367,7 +367,7 @@ static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
return -EINVAL;
}
- return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}
struct tcf_dump_args
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 00eae5f9a01..cf68a59fdc5 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -393,10 +393,10 @@ META_COLLECTOR(int_sk_route_caps)
dst->value = skb->sk->sk_route_caps;
}
-META_COLLECTOR(int_sk_hashent)
+META_COLLECTOR(int_sk_hash)
{
SKIP_NONLOCAL(skb);
- dst->value = skb->sk->sk_hashent;
+ dst->value = skb->sk->sk_hash;
}
META_COLLECTOR(int_sk_lingertime)
@@ -515,7 +515,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
[META_ID(SK_FORWARD_ALLOCS)] = META_FUNC(int_sk_fwd_alloc),
[META_ID(SK_ALLOCS)] = META_FUNC(int_sk_alloc),
[META_ID(SK_ROUTE_CAPS)] = META_FUNC(int_sk_route_caps),
- [META_ID(SK_HASHENT)] = META_FUNC(int_sk_hashent),
+ [META_ID(SK_HASH)] = META_FUNC(int_sk_hash),
[META_ID(SK_LINGERTIME)] = META_FUNC(int_sk_lingertime),
[META_ID(SK_ACK_BACKLOG)] = META_FUNC(int_sk_ack_bl),
[META_ID(SK_MAX_ACK_BACKLOG)] = META_FUNC(int_sk_max_ack_bl),
diff --git a/net/sched/gact.c b/net/sched/gact.c
index a811c89fef7..d1c6d542912 100644
--- a/net/sched/gact.c
+++ b/net/sched/gact.c
@@ -135,7 +135,7 @@ tcf_gact_cleanup(struct tc_action *a, int bind)
}
static int
-tcf_gact(struct sk_buff **pskb, struct tc_action *a)
+tcf_gact(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
{
struct tcf_gact *p = PRIV(a, gact);
struct sk_buff *skb = *pskb;
diff --git a/net/sched/ipt.c b/net/sched/ipt.c
index b114d994d52..f50136eed21 100644
--- a/net/sched/ipt.c
+++ b/net/sched/ipt.c
@@ -201,7 +201,7 @@ tcf_ipt_cleanup(struct tc_action *a, int bind)
}
static int
-tcf_ipt(struct sk_buff **pskb, struct tc_action *a)
+tcf_ipt(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
{
int ret = 0, result = 0;
struct tcf_ipt *p = PRIV(a, ipt);
diff --git a/net/sched/mirred.c b/net/sched/mirred.c
index f309ce33680..20d06916dc0 100644
--- a/net/sched/mirred.c
+++ b/net/sched/mirred.c
@@ -158,7 +158,7 @@ tcf_mirred_cleanup(struct tc_action *a, int bind)
}
static int
-tcf_mirred(struct sk_buff **pskb, struct tc_action *a)
+tcf_mirred(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
{
struct tcf_mirred *p = PRIV(a, mirred);
struct net_device *dev;
diff --git a/net/sched/pedit.c b/net/sched/pedit.c
index 678be6a645f..767d24f4610 100644
--- a/net/sched/pedit.c
+++ b/net/sched/pedit.c
@@ -130,7 +130,7 @@ tcf_pedit_cleanup(struct tc_action *a, int bind)
}
static int
-tcf_pedit(struct sk_buff **pskb, struct tc_action *a)
+tcf_pedit(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
{
struct tcf_pedit *p = PRIV(a, pedit);
struct sk_buff *skb = *pskb;
diff --git a/net/sched/police.c b/net/sched/police.c
index c03545faf52..eb39fb2f39b 100644
--- a/net/sched/police.c
+++ b/net/sched/police.c
@@ -284,7 +284,8 @@ static int tcf_act_police_cleanup(struct tc_action *a, int bind)
return 0;
}
-static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a)
+static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a,
+ struct tcf_result *res)
{
psched_time_t now;
struct sk_buff *skb = *pskb;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b9a069af4a0..31570b9a6e9 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -816,7 +816,7 @@ static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
}
if (skb->len)
- return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
err_out:
kfree_skb(skb);
@@ -1040,7 +1040,7 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
return -EINVAL;
}
- return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}
struct qdisc_dump_args
@@ -1194,7 +1194,7 @@ EXPORT_SYMBOL(psched_time_base);
* with 32-bit get_cycles(). Safe up to 4GHz CPU.
*/
static void psched_tick(unsigned long);
-static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0);
+static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);
static void psched_tick(unsigned long dummy)
{
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 8edefd5d095..99ceb91f015 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -238,6 +238,20 @@ static void dev_watchdog_down(struct net_device *dev)
spin_unlock_bh(&dev->xmit_lock);
}
+void netif_carrier_on(struct net_device *dev)
+{
+ if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
+ linkwatch_fire_event(dev);
+ if (netif_running(dev))
+ __netdev_watchdog_up(dev);
+}
+
+void netif_carrier_off(struct net_device *dev)
+{
+ if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
+ linkwatch_fire_event(dev);
+}
+
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
under all circumstances. It is difficult to invent anything faster or
cheaper.
@@ -438,6 +452,7 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
if (!ops->init || ops->init(sch, NULL) == 0)
return sch;
+ qdisc_destroy(sch);
errout:
return NULL;
}
@@ -599,6 +614,8 @@ void dev_shutdown(struct net_device *dev)
}
EXPORT_SYMBOL(__netdev_watchdog_up);
+EXPORT_SYMBOL(netif_carrier_on);
+EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
diff --git a/net/sched/simple.c b/net/sched/simple.c
index 3ab4c675ab5..8a6ae4f491e 100644
--- a/net/sched/simple.c
+++ b/net/sched/simple.c
@@ -44,7 +44,7 @@ static DEFINE_RWLOCK(simp_lock);
#include <net/pkt_act.h>
#include <net/act_generic.h>
-static int tcf_simp(struct sk_buff **pskb, struct tc_action *a)
+static int tcf_simp(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
{
struct sk_buff *skb = *pskb;
struct tcf_defact *p = PRIV(a, defact);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5b24ae0650d..12b0f582a66 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -71,7 +71,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
const struct sctp_endpoint *ep,
const struct sock *sk,
sctp_scope_t scope,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
struct sctp_sock *sp;
int i;
@@ -273,7 +273,7 @@ fail_init:
struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
const struct sock *sk,
sctp_scope_t scope,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
struct sctp_association *asoc;
@@ -479,7 +479,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
/* Add a transport address to an association. */
struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
const union sctp_addr *addr,
- const unsigned int __nocast gfp,
+ const gfp_t gfp,
const int peer_state)
{
struct sctp_transport *peer;
@@ -1231,7 +1231,7 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned len)
* local endpoint and the remote peer.
*/
int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
sctp_scope_t scope;
int flags;
@@ -1254,7 +1254,7 @@ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
/* Build the association's bind address list from the cookie. */
int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
struct sctp_cookie *cookie,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length);
int var_size3 = cookie->raw_addr_list_len;
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index f71549710f2..2b962627f63 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -53,7 +53,7 @@
/* Forward declarations for internal helpers. */
static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *,
- sctp_scope_t scope, unsigned int __nocast gfp,
+ sctp_scope_t scope, gfp_t gfp,
int flags);
static void sctp_bind_addr_clean(struct sctp_bind_addr *);
@@ -64,7 +64,7 @@ static void sctp_bind_addr_clean(struct sctp_bind_addr *);
*/
int sctp_bind_addr_copy(struct sctp_bind_addr *dest,
const struct sctp_bind_addr *src,
- sctp_scope_t scope, unsigned int __nocast gfp,
+ sctp_scope_t scope, gfp_t gfp,
int flags)
{
struct sctp_sockaddr_entry *addr;
@@ -146,7 +146,7 @@ void sctp_bind_addr_free(struct sctp_bind_addr *bp)
/* Add an address to the bind address list in the SCTP_bind_addr structure. */
int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
struct sctp_sockaddr_entry *addr;
@@ -200,7 +200,7 @@ int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr)
*/
union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp,
int *addrs_len,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
union sctp_params addrparms;
union sctp_params retval;
@@ -252,7 +252,7 @@ end_raw:
* address parameters).
*/
int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
- int addrs_len, __u16 port, unsigned int __nocast gfp)
+ int addrs_len, __u16 port, gfp_t gfp)
{
union sctp_addr_param *rawaddr;
struct sctp_paramhdr *param;
@@ -350,7 +350,7 @@ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp,
/* Copy out addresses from the global local address list. */
static int sctp_copy_one_addr(struct sctp_bind_addr *dest,
union sctp_addr *addr,
- sctp_scope_t scope, unsigned int __nocast gfp,
+ sctp_scope_t scope, gfp_t gfp,
int flags)
{
int error = 0;
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 61da2937e64..83ef411772f 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -62,7 +62,7 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg)
}
/* Allocate and initialize datamsg. */
-SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(unsigned int __nocast gfp)
+SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp)
{
struct sctp_datamsg *msg;
msg = kmalloc(sizeof(struct sctp_datamsg), gfp);
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index e47ac0d1a6d..96984f7a2d6 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -68,7 +68,7 @@ static void sctp_endpoint_bh_rcv(struct sctp_endpoint *ep);
*/
static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
struct sock *sk,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
struct sctp_sock *sp = sctp_sk(sk);
memset(ep, 0, sizeof(struct sctp_endpoint));
@@ -138,8 +138,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
/* Create a sctp_endpoint with all that boring stuff initialized.
* Returns NULL if there isn't enough memory.
*/
-struct sctp_endpoint *sctp_endpoint_new(struct sock *sk,
- unsigned int __nocast gfp)
+struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, gfp_t gfp)
{
struct sctp_endpoint *ep;
@@ -193,8 +192,7 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep)
sctp_unhash_endpoint(ep);
/* Free up the HMAC transform. */
- if (sctp_sk(ep->base.sk)->hmac)
- sctp_crypto_free_tfm(sctp_sk(ep->base.sk)->hmac);
+ sctp_crypto_free_tfm(sctp_sk(ep->base.sk)->hmac);
/* Cleanup. */
sctp_inq_free(&ep->base.inqueue);
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 742be9171b7..28f32243397 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -236,8 +236,8 @@ int sctp_rcv(struct sk_buff *skb)
}
/* SCTP seems to always need a timestamp right now (FIXME) */
- if (skb->stamp.tv_sec == 0) {
- do_gettimeofday(&skb->stamp);
+ if (skb->tstamp.off_sec == 0) {
+ __net_timestamp(skb);
sock_enable_timestamp(sk);
}
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index e9b2fd480d6..fa3be2b8fb5 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -66,8 +66,8 @@
#include <linux/seq_file.h>
#include <net/protocol.h>
-#include <net/tcp.h>
#include <net/ndisc.h>
+#include <net/ip.h>
#include <net/ipv6.h>
#include <net/transp_v6.h>
#include <net/addrconf.h>
@@ -641,10 +641,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
else
newinet->pmtudisc = IP_PMTUDISC_WANT;
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet6_sock_nr);
- atomic_inc(&inet_sock_nr);
-#endif
+ sk_refcnt_debug_inc(newsk);
if (newsk->sk_prot->init(newsk)) {
sk_common_release(newsk);
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 98d49ec9b74..6e4dc28874d 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -57,6 +57,7 @@ static struct snmp_mib sctp_snmp_list[] = {
SNMP_MIB_ITEM("SctpReasmUsrMsgs", SCTP_MIB_REASMUSRMSGS),
SNMP_MIB_ITEM("SctpOutSCTPPacks", SCTP_MIB_OUTSCTPPACKS),
SNMP_MIB_ITEM("SctpInSCTPPacks", SCTP_MIB_INSCTPPACKS),
+ SNMP_MIB_SENTINEL
};
/* Return the current value of a particular entry in the mib by adding its
@@ -68,9 +69,7 @@ fold_field(void *mib[], int nr)
unsigned long res = 0;
int i;
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
+ for_each_cpu(i) {
res +=
*((unsigned long *) (((void *) per_cpu_ptr(mib[0], i)) +
sizeof (unsigned long) * nr));
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index ce9245e71fc..26de4d3e1bd 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -62,7 +62,7 @@
/* Global data structures. */
struct sctp_globals sctp_globals;
struct proc_dir_entry *proc_net_sctp;
-DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics);
+DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics) __read_mostly;
struct idr sctp_assocs_id;
DEFINE_SPINLOCK(sctp_assocs_id_lock);
@@ -78,8 +78,8 @@ static struct sctp_pf *sctp_pf_inet_specific;
static struct sctp_af *sctp_af_v4_specific;
static struct sctp_af *sctp_af_v6_specific;
-kmem_cache_t *sctp_chunk_cachep;
-kmem_cache_t *sctp_bucket_cachep;
+kmem_cache_t *sctp_chunk_cachep __read_mostly;
+kmem_cache_t *sctp_bucket_cachep __read_mostly;
extern int sctp_snmp_proc_init(void);
extern int sctp_snmp_proc_exit(void);
@@ -147,7 +147,7 @@ static void sctp_v4_copy_addrlist(struct list_head *addrlist,
struct sctp_sockaddr_entry *addr;
rcu_read_lock();
- if ((in_dev = __in_dev_get(dev)) == NULL) {
+ if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
rcu_read_unlock();
return;
}
@@ -219,7 +219,7 @@ static void sctp_free_local_addr_list(void)
/* Copy the local addresses which are valid for 'scope' into 'bp'. */
int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope,
- unsigned int __nocast gfp, int copy_flags)
+ gfp_t gfp, int copy_flags)
{
struct sctp_sockaddr_entry *addr;
int error = 0;
@@ -593,9 +593,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
newinet->mc_index = 0;
newinet->mc_list = NULL;
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet_sock_nr);
-#endif
+ sk_refcnt_debug_inc(newsk);
if (newsk->sk_prot->init(newsk)) {
sk_common_release(newsk);
@@ -1244,6 +1242,10 @@ SCTP_STATIC __exit void sctp_exit(void)
module_init(sctp_init);
module_exit(sctp_exit);
+/*
+ * __stringify doesn't like enums, so use IPPROTO_SCTP value (132) directly.
+ */
+MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132");
MODULE_AUTHOR("Linux Kernel SCTP developers <lksctp-developers@lists.sourceforge.net>");
MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)");
MODULE_LICENSE("GPL");
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 00d32b7c826..660c61bdf16 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -78,7 +78,7 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
static int sctp_process_param(struct sctp_association *asoc,
union sctp_params param,
const union sctp_addr *peer_addr,
- unsigned int __nocast gfp);
+ gfp_t gfp);
/* What was the inbound interface for this chunk? */
int sctp_chunk_iif(const struct sctp_chunk *chunk)
@@ -174,7 +174,7 @@ void sctp_init_cause(struct sctp_chunk *chunk, __u16 cause_code,
*/
struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
const struct sctp_bind_addr *bp,
- unsigned int __nocast gfp, int vparam_len)
+ gfp_t gfp, int vparam_len)
{
sctp_inithdr_t init;
union sctp_params addrs;
@@ -261,7 +261,7 @@ nodata:
struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
const struct sctp_chunk *chunk,
- unsigned int __nocast gfp, int unkparam_len)
+ gfp_t gfp, int unkparam_len)
{
sctp_inithdr_t initack;
struct sctp_chunk *retval;
@@ -554,7 +554,7 @@ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
dp.ppid = sinfo->sinfo_ppid;
/* Set the flags for an unordered send. */
- if (sinfo->sinfo_flags & MSG_UNORDERED) {
+ if (sinfo->sinfo_flags & SCTP_UNORDERED) {
flags |= SCTP_DATA_UNORDERED;
dp.ssn = 0;
} else
@@ -1234,7 +1234,7 @@ void sctp_chunk_assign_tsn(struct sctp_chunk *chunk)
/* Create a CLOSED association to use with an incoming packet. */
struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
struct sctp_chunk *chunk,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
struct sctp_association *asoc;
struct sk_buff *skb;
@@ -1349,7 +1349,7 @@ nodata:
struct sctp_association *sctp_unpack_cookie(
const struct sctp_endpoint *ep,
const struct sctp_association *asoc,
- struct sctp_chunk *chunk, unsigned int __nocast gfp,
+ struct sctp_chunk *chunk, gfp_t gfp,
int *error, struct sctp_chunk **errp)
{
struct sctp_association *retval = NULL;
@@ -1362,6 +1362,7 @@ struct sctp_association *sctp_unpack_cookie(
char *key;
sctp_scope_t scope;
struct sk_buff *skb = chunk->skb;
+ struct timeval tv;
headersize = sizeof(sctp_chunkhdr_t) + SCTP_SECRET_SIZE;
bodysize = ntohs(chunk->chunk_hdr->length) - headersize;
@@ -1434,7 +1435,8 @@ no_hmac:
* an association, there is no need to check cookie's expiration
* for init collision case of lost COOKIE ACK.
*/
- if (!asoc && tv_lt(bear_cookie->expiration, skb->stamp)) {
+ skb_get_timestamp(skb, &tv);
+ if (!asoc && tv_lt(bear_cookie->expiration, tv)) {
__u16 len;
/*
* Section 3.3.10.3 Stale Cookie Error (3)
@@ -1447,10 +1449,9 @@ no_hmac:
len = ntohs(chunk->chunk_hdr->length);
*errp = sctp_make_op_error_space(asoc, chunk, len);
if (*errp) {
- suseconds_t usecs = (skb->stamp.tv_sec -
+ suseconds_t usecs = (tv.tv_sec -
bear_cookie->expiration.tv_sec) * 1000000L +
- skb->stamp.tv_usec -
- bear_cookie->expiration.tv_usec;
+ tv.tv_usec - bear_cookie->expiration.tv_usec;
usecs = htonl(usecs);
sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE,
@@ -1813,7 +1814,7 @@ int sctp_verify_init(const struct sctp_association *asoc,
*/
int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
const union sctp_addr *peer_addr,
- sctp_init_chunk_t *peer_init, unsigned int __nocast gfp)
+ sctp_init_chunk_t *peer_init, gfp_t gfp)
{
union sctp_params param;
struct sctp_transport *transport;
@@ -1984,7 +1985,7 @@ nomem:
static int sctp_process_param(struct sctp_association *asoc,
union sctp_params param,
const union sctp_addr *peer_addr,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
union sctp_addr addr;
int i;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 39c970b5b19..f84173ea8ec 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -63,7 +63,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
void *event_arg,
sctp_disposition_t status,
sctp_cmd_seq_t *commands,
- unsigned int __nocast gfp);
+ gfp_t gfp);
static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
sctp_state_t state,
struct sctp_endpoint *ep,
@@ -71,7 +71,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
void *event_arg,
sctp_disposition_t status,
sctp_cmd_seq_t *commands,
- unsigned int __nocast gfp);
+ gfp_t gfp);
/********************************************************************
* Helper functions
@@ -498,7 +498,7 @@ static int sctp_cmd_process_init(sctp_cmd_seq_t *commands,
struct sctp_association *asoc,
struct sctp_chunk *chunk,
sctp_init_chunk_t *peer_init,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
int error;
@@ -853,7 +853,7 @@ int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype,
struct sctp_endpoint *ep,
struct sctp_association *asoc,
void *event_arg,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
sctp_cmd_seq_t commands;
const sctp_sm_table_entry_t *state_fn;
@@ -898,7 +898,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
void *event_arg,
sctp_disposition_t status,
sctp_cmd_seq_t *commands,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
int error;
@@ -986,7 +986,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
void *event_arg,
sctp_disposition_t status,
sctp_cmd_seq_t *commands,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
int error = 0;
int force;
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 86073df418f..505c7de10c5 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -2414,6 +2414,17 @@ sctp_disposition_t sctp_sf_do_9_2_shutdown(const struct sctp_endpoint *ep,
skb_pull(chunk->skb, sizeof(sctp_shutdownhdr_t));
chunk->subh.shutdown_hdr = sdh;
+ /* API 5.3.1.5 SCTP_SHUTDOWN_EVENT
+ * When a peer sends a SHUTDOWN, SCTP delivers this notification to
+ * inform the application that it should cease sending data.
+ */
+ ev = sctp_ulpevent_make_shutdown_event(asoc, 0, GFP_ATOMIC);
+ if (!ev) {
+ disposition = SCTP_DISPOSITION_NOMEM;
+ goto out;
+ }
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));
+
/* Upon the reception of the SHUTDOWN, the peer endpoint shall
* - enter the SHUTDOWN-RECEIVED state,
* - stop accepting new data from its SCTP user
@@ -2439,17 +2450,6 @@ sctp_disposition_t sctp_sf_do_9_2_shutdown(const struct sctp_endpoint *ep,
sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_CTSN,
SCTP_U32(chunk->subh.shutdown_hdr->cum_tsn_ack));
- /* API 5.3.1.5 SCTP_SHUTDOWN_EVENT
- * When a peer sends a SHUTDOWN, SCTP delivers this notification to
- * inform the application that it should cease sending data.
- */
- ev = sctp_ulpevent_make_shutdown_event(asoc, 0, GFP_ATOMIC);
- if (!ev) {
- disposition = SCTP_DISPOSITION_NOMEM;
- goto out;
- }
- sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));
-
out:
return disposition;
}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 091a66f06a3..b529af5e6f2 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1010,6 +1010,19 @@ static int __sctp_connect(struct sock* sk,
err = -EAGAIN;
goto out_free;
}
+ } else {
+ /*
+ * If an unprivileged user inherits a 1-many
+ * style socket with open associations on a
+ * privileged port, it MAY be permitted to
+ * accept new associations, but it SHOULD NOT
+ * be permitted to open new associations.
+ */
+ if (ep->base.bind_addr.port < PROT_SOCK &&
+ !capable(CAP_NET_BIND_SERVICE)) {
+ err = -EACCES;
+ goto out_free;
+ }
}
scope = sctp_scope(&to);
@@ -1389,27 +1402,27 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
SCTP_DEBUG_PRINTK("msg_len: %zu, sinfo_flags: 0x%x\n",
msg_len, sinfo_flags);
- /* MSG_EOF or MSG_ABORT cannot be set on a TCP-style socket. */
- if (sctp_style(sk, TCP) && (sinfo_flags & (MSG_EOF | MSG_ABORT))) {
+ /* SCTP_EOF or SCTP_ABORT cannot be set on a TCP-style socket. */
+ if (sctp_style(sk, TCP) && (sinfo_flags & (SCTP_EOF | SCTP_ABORT))) {
err = -EINVAL;
goto out_nounlock;
}
- /* If MSG_EOF is set, no data can be sent. Disallow sending zero
- * length messages when MSG_EOF|MSG_ABORT is not set.
- * If MSG_ABORT is set, the message length could be non zero with
+ /* If SCTP_EOF is set, no data can be sent. Disallow sending zero
+ * length messages when SCTP_EOF|SCTP_ABORT is not set.
+ * If SCTP_ABORT is set, the message length could be non zero with
* the msg_iov set to the user abort reason.
*/
- if (((sinfo_flags & MSG_EOF) && (msg_len > 0)) ||
- (!(sinfo_flags & (MSG_EOF|MSG_ABORT)) && (msg_len == 0))) {
+ if (((sinfo_flags & SCTP_EOF) && (msg_len > 0)) ||
+ (!(sinfo_flags & (SCTP_EOF|SCTP_ABORT)) && (msg_len == 0))) {
err = -EINVAL;
goto out_nounlock;
}
- /* If MSG_ADDR_OVER is set, there must be an address
+ /* If SCTP_ADDR_OVER is set, there must be an address
* specified in msg_name.
*/
- if ((sinfo_flags & MSG_ADDR_OVER) && (!msg->msg_name)) {
+ if ((sinfo_flags & SCTP_ADDR_OVER) && (!msg->msg_name)) {
err = -EINVAL;
goto out_nounlock;
}
@@ -1458,14 +1471,14 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
goto out_unlock;
}
- if (sinfo_flags & MSG_EOF) {
+ if (sinfo_flags & SCTP_EOF) {
SCTP_DEBUG_PRINTK("Shutting down association: %p\n",
asoc);
sctp_primitive_SHUTDOWN(asoc, NULL);
err = 0;
goto out_unlock;
}
- if (sinfo_flags & MSG_ABORT) {
+ if (sinfo_flags & SCTP_ABORT) {
SCTP_DEBUG_PRINTK("Aborting association: %p\n", asoc);
sctp_primitive_ABORT(asoc, msg);
err = 0;
@@ -1477,7 +1490,7 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
if (!asoc) {
SCTP_DEBUG_PRINTK("There is no association yet.\n");
- if (sinfo_flags & (MSG_EOF | MSG_ABORT)) {
+ if (sinfo_flags & (SCTP_EOF | SCTP_ABORT)) {
err = -EINVAL;
goto out_unlock;
}
@@ -1515,6 +1528,19 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
err = -EAGAIN;
goto out_unlock;
}
+ } else {
+ /*
+ * If an unprivileged user inherits a one-to-many
+ * style socket with open associations on a privileged
+ * port, it MAY be permitted to accept new associations,
+ * but it SHOULD NOT be permitted to open new
+ * associations.
+ */
+ if (ep->base.bind_addr.port < PROT_SOCK &&
+ !capable(CAP_NET_BIND_SERVICE)) {
+ err = -EACCES;
+ goto out_unlock;
+ }
}
scope = sctp_scope(&to);
@@ -1611,10 +1637,10 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
/* If an address is passed with the sendto/sendmsg call, it is used
* to override the primary destination address in the TCP model, or
- * when MSG_ADDR_OVER flag is set in the UDP model.
+ * when SCTP_ADDR_OVER flag is set in the UDP model.
*/
if ((sctp_style(sk, TCP) && msg_name) ||
- (sinfo_flags & MSG_ADDR_OVER)) {
+ (sinfo_flags & SCTP_ADDR_OVER)) {
chunk_tp = sctp_assoc_lookup_paddr(asoc, &to);
if (!chunk_tp) {
err = -EINVAL;
@@ -2306,16 +2332,14 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, int optl
return -EINVAL;
if (get_user(val, (int __user *)optval))
return -EFAULT;
- if ((val < 8) || (val > SCTP_MAX_CHUNK_LEN))
+ if ((val != 0) && ((val < 8) || (val > SCTP_MAX_CHUNK_LEN)))
return -EINVAL;
sp->user_frag = val;
- if (val) {
- /* Update the frag_point of the existing associations. */
- list_for_each(pos, &(sp->ep->asocs)) {
- asoc = list_entry(pos, struct sctp_association, asocs);
- asoc->frag_point = sctp_frag_point(sp, asoc->pmtu);
- }
+ /* Update the frag_point of the existing associations. */
+ list_for_each(pos, &(sp->ep->asocs)) {
+ asoc = list_entry(pos, struct sctp_association, asocs);
+ asoc->frag_point = sctp_frag_point(sp, asoc->pmtu);
}
return 0;
@@ -2384,14 +2408,14 @@ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optva
static int sctp_setsockopt_adaption_layer(struct sock *sk, char __user *optval,
int optlen)
{
- __u32 val;
+ struct sctp_setadaption adaption;
- if (optlen < sizeof(__u32))
+ if (optlen != sizeof(struct sctp_setadaption))
return -EINVAL;
- if (copy_from_user(&val, optval, sizeof(__u32)))
+ if (copy_from_user(&adaption, optval, optlen))
return -EFAULT;
- sctp_sk(sk)->adaption_ind = val;
+ sctp_sk(sk)->adaption_ind = adaption.ssb_adaption_ind;
return 0;
}
@@ -3159,8 +3183,9 @@ static int sctp_getsockopt_initmsg(struct sock *sk, int len, char __user *optval
return 0;
}
-static int sctp_getsockopt_peer_addrs_num(struct sock *sk, int len,
- char __user *optval, int __user *optlen)
+static int sctp_getsockopt_peer_addrs_num_old(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
{
sctp_assoc_t id;
struct sctp_association *asoc;
@@ -3185,23 +3210,28 @@ static int sctp_getsockopt_peer_addrs_num(struct sock *sk, int len,
return cnt;
}
-static int sctp_getsockopt_peer_addrs(struct sock *sk, int len,
- char __user *optval, int __user *optlen)
+/*
+ * Old API for getting list of peer addresses. Does not work for 32-bit
+ * programs running on a 64-bit kernel
+ */
+static int sctp_getsockopt_peer_addrs_old(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
{
struct sctp_association *asoc;
struct list_head *pos;
int cnt = 0;
- struct sctp_getaddrs getaddrs;
+ struct sctp_getaddrs_old getaddrs;
struct sctp_transport *from;
void __user *to;
union sctp_addr temp;
struct sctp_sock *sp = sctp_sk(sk);
int addrlen;
- if (len != sizeof(struct sctp_getaddrs))
+ if (len != sizeof(struct sctp_getaddrs_old))
return -EINVAL;
- if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs)))
+ if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs_old)))
return -EFAULT;
if (getaddrs.addr_num <= 0) return -EINVAL;
@@ -3225,15 +3255,69 @@ static int sctp_getsockopt_peer_addrs(struct sock *sk, int len,
if (cnt >= getaddrs.addr_num) break;
}
getaddrs.addr_num = cnt;
- if (copy_to_user(optval, &getaddrs, sizeof(struct sctp_getaddrs)))
+ if (copy_to_user(optval, &getaddrs, sizeof(struct sctp_getaddrs_old)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int sctp_getsockopt_peer_addrs(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ struct sctp_association *asoc;
+ struct list_head *pos;
+ int cnt = 0;
+ struct sctp_getaddrs getaddrs;
+ struct sctp_transport *from;
+ void __user *to;
+ union sctp_addr temp;
+ struct sctp_sock *sp = sctp_sk(sk);
+ int addrlen;
+ size_t space_left;
+ int bytes_copied;
+
+ if (len < sizeof(struct sctp_getaddrs))
+ return -EINVAL;
+
+ if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs)))
+ return -EFAULT;
+
+ /* For UDP-style sockets, id specifies the association to query. */
+ asoc = sctp_id2assoc(sk, getaddrs.assoc_id);
+ if (!asoc)
+ return -EINVAL;
+
+ to = optval + offsetof(struct sctp_getaddrs,addrs);
+ space_left = len - sizeof(struct sctp_getaddrs) -
+ offsetof(struct sctp_getaddrs,addrs);
+
+ list_for_each(pos, &asoc->peer.transport_addr_list) {
+ from = list_entry(pos, struct sctp_transport, transports);
+ memcpy(&temp, &from->ipaddr, sizeof(temp));
+ sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp);
+ addrlen = sctp_get_af_specific(sk->sk_family)->sockaddr_len;
+ if(space_left < addrlen)
+ return -ENOMEM;
+ temp.v4.sin_port = htons(temp.v4.sin_port);
+ if (copy_to_user(to, &temp, addrlen))
+ return -EFAULT;
+ to += addrlen;
+ cnt++;
+ space_left -= addrlen;
+ }
+
+ if (put_user(cnt, &((struct sctp_getaddrs __user *)optval)->addr_num))
+ return -EFAULT;
+ bytes_copied = ((char __user *)to) - optval;
+ if (put_user(bytes_copied, optlen))
return -EFAULT;
return 0;
}
-static int sctp_getsockopt_local_addrs_num(struct sock *sk, int len,
- char __user *optval,
- int __user *optlen)
+static int sctp_getsockopt_local_addrs_num_old(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
{
sctp_assoc_t id;
struct sctp_bind_addr *bp;
@@ -3306,8 +3390,8 @@ done:
/* Helper function that copies local addresses to user and returns the number
* of addresses copied.
*/
-static int sctp_copy_laddrs_to_user(struct sock *sk, __u16 port, int max_addrs,
- void __user *to)
+static int sctp_copy_laddrs_to_user_old(struct sock *sk, __u16 port, int max_addrs,
+ void __user *to)
{
struct list_head *pos;
struct sctp_sockaddr_entry *addr;
@@ -3341,14 +3425,54 @@ static int sctp_copy_laddrs_to_user(struct sock *sk, __u16 port, int max_addrs,
return cnt;
}
-static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
- char __user *optval, int __user *optlen)
+/* Helper for the variable-length local-address getsockopt: walk the global
+ * local address list and copy each usable address to the user buffer.
+ *
+ * sk         - socket being queried; its family selects which addresses
+ *              are reported and how they are represented.
+ * port       - port number (host byte order) stored in every copied address.
+ * to         - in/out cursor into the user buffer; advanced past each
+ *              address that is copied.
+ * space_left - bytes still available at *to.
+ *
+ * Returns the number of addresses copied, -ENOMEM if the next address does
+ * not fit in the remaining space, or -EFAULT if the copy to user space
+ * fails.  sctp_local_addr_lock is released on every path.
+ *
+ * NOTE(review): copy_to_user() is called with sctp_local_addr_lock held and
+ * interrupts disabled; copy_to_user() may fault and sleep.  Staging the
+ * addresses in a kernel buffer and copying after the unlock would avoid
+ * that, but needs a change on the caller side as well.
+ */
+static int sctp_copy_laddrs_to_user(struct sock *sk, __u16 port,
+				    void __user **to, size_t space_left)
+{
+	struct list_head *pos;
+	struct sctp_sockaddr_entry *addr;
+	unsigned long flags;
+	union sctp_addr temp;
+	int cnt = 0;
+	int addrlen;
+
+	sctp_spin_lock_irqsave(&sctp_local_addr_lock, flags);
+	list_for_each(pos, &sctp_local_addr_list) {
+		addr = list_entry(pos, struct sctp_sockaddr_entry, list);
+		/* Do not report IPv6 addresses on a PF_INET socket. */
+		if ((PF_INET == sk->sk_family) &&
+		    (AF_INET6 == addr->a.sa.sa_family))
+			continue;
+		/* Work on a copy so the list entry itself is never modified. */
+		memcpy(&temp, &addr->a, sizeof(temp));
+		sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk),
+								&temp);
+		addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len;
+		if (space_left < addrlen) {
+			/* Leave through the common exit so the lock is dropped. */
+			cnt = -ENOMEM;
+			break;
+		}
+		temp.v4.sin_port = htons(port);
+		if (copy_to_user(*to, &temp, addrlen)) {
+			cnt = -EFAULT;
+			break;
+		}
+		*to += addrlen;
+		cnt++;
+		space_left -= addrlen;
+	}
+	sctp_spin_unlock_irqrestore(&sctp_local_addr_lock, flags);
+
+	return cnt;
+}
+
+/* Old API for getting list of local addresses. Does not work for 32-bit
+ * programs running on a 64-bit kernel
+ */
+static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
{
struct sctp_bind_addr *bp;
struct sctp_association *asoc;
struct list_head *pos;
int cnt = 0;
- struct sctp_getaddrs getaddrs;
+ struct sctp_getaddrs_old getaddrs;
struct sctp_sockaddr_entry *addr;
void __user *to;
union sctp_addr temp;
@@ -3357,10 +3481,10 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
rwlock_t *addr_lock;
int err = 0;
- if (len != sizeof(struct sctp_getaddrs))
+ if (len != sizeof(struct sctp_getaddrs_old))
return -EINVAL;
- if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs)))
+ if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs_old)))
return -EFAULT;
if (getaddrs.addr_num <= 0) return -EINVAL;
@@ -3392,8 +3516,9 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
addr = list_entry(bp->address_list.next,
struct sctp_sockaddr_entry, list);
if (sctp_is_any(&addr->a)) {
- cnt = sctp_copy_laddrs_to_user(sk, bp->port,
- getaddrs.addr_num, to);
+ cnt = sctp_copy_laddrs_to_user_old(sk, bp->port,
+ getaddrs.addr_num,
+ to);
if (cnt < 0) {
err = cnt;
goto unlock;
@@ -3419,7 +3544,7 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
copy_getaddrs:
getaddrs.addr_num = cnt;
- if (copy_to_user(optval, &getaddrs, sizeof(struct sctp_getaddrs)))
+ if (copy_to_user(optval, &getaddrs, sizeof(struct sctp_getaddrs_old)))
err = -EFAULT;
unlock:
@@ -3427,6 +3552,99 @@ unlock:
return err;
}
+/* Get the list of locally bound addresses (variable-length API).
+ *
+ * optval points to a struct sctp_getaddrs followed by room for the
+ * addresses; len is the total size of that buffer.  On success addr_num
+ * in the user structure is set to the number of addresses written and
+ * *optlen to the number of bytes used, counted from the start of optval.
+ *
+ * Returns 0 on success, -EINVAL for a buffer with no room for addresses or
+ * an unknown association id, -ENOMEM when the addresses do not fit in the
+ * buffer, and -EFAULT when user memory cannot be accessed.  The address
+ * lock taken here is released on every path.
+ */
+static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
+				       char __user *optval, int __user *optlen)
+{
+	struct sctp_bind_addr *bp;
+	struct sctp_association *asoc;
+	struct list_head *pos;
+	int cnt = 0;
+	struct sctp_getaddrs getaddrs;
+	struct sctp_sockaddr_entry *addr;
+	void __user *to;
+	union sctp_addr temp;
+	struct sctp_sock *sp = sctp_sk(sk);
+	int addrlen;
+	rwlock_t *addr_lock;
+	int err = 0;
+	size_t space_left;
+	int bytes_copied;
+
+	if (len <= sizeof(struct sctp_getaddrs))
+		return -EINVAL;
+
+	if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs)))
+		return -EFAULT;
+
+	/*
+	 *  For UDP-style sockets, id specifies the association to query.
+	 *  If the id field is set to the value '0' then the locally bound
+	 *  addresses are returned without regard to any particular
+	 *  association.
+	 */
+	if (0 == getaddrs.assoc_id) {
+		bp = &sctp_sk(sk)->ep->base.bind_addr;
+		addr_lock = &sctp_sk(sk)->ep->base.addr_lock;
+	} else {
+		asoc = sctp_id2assoc(sk, getaddrs.assoc_id);
+		if (!asoc)
+			return -EINVAL;
+		bp = &asoc->base.bind_addr;
+		addr_lock = &asoc->base.addr_lock;
+	}
+
+	/* Addresses are written starting at the 'addrs' member, so the room
+	 * available for them is everything in the buffer past that offset.
+	 */
+	to = optval + offsetof(struct sctp_getaddrs,addrs);
+	space_left = len - offsetof(struct sctp_getaddrs,addrs);
+
+	sctp_read_lock(addr_lock);
+
+	/* If the endpoint is bound to 0.0.0.0 or ::0, get the valid
+	 * addresses from the global local address list.
+	 */
+	if (sctp_list_single_entry(&bp->address_list)) {
+		addr = list_entry(bp->address_list.next,
+				  struct sctp_sockaddr_entry, list);
+		if (sctp_is_any(&addr->a)) {
+			cnt = sctp_copy_laddrs_to_user(sk, bp->port,
+						       &to, space_left);
+			if (cnt < 0) {
+				err = cnt;
+				goto unlock;
+			}
+			goto copy_getaddrs;
+		}
+	}
+
+	list_for_each(pos, &bp->address_list) {
+		addr = list_entry(pos, struct sctp_sockaddr_entry, list);
+		/* Work on a copy so the bind address entry is never modified. */
+		memcpy(&temp, &addr->a, sizeof(temp));
+		sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp);
+		addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len;
+		if (space_left < addrlen) {
+			/* FIXME: is -ENOMEM the right error for a user
+			 * buffer that is too small?
+			 */
+			err = -ENOMEM;
+			goto unlock;
+		}
+		temp.v4.sin_port = htons(temp.v4.sin_port);
+		if (copy_to_user(to, &temp, addrlen)) {
+			err = -EFAULT;
+			goto unlock;
+		}
+		to += addrlen;
+		cnt++;
+		space_left -= addrlen;
+	}
+
+copy_getaddrs:
+	/* Failures below must still drop addr_lock, so they fall through to
+	 * the unlock label instead of returning directly.
+	 */
+	if (put_user(cnt, &((struct sctp_getaddrs __user *)optval)->addr_num)) {
+		err = -EFAULT;
+		goto unlock;
+	}
+	bytes_copied = ((char __user *)to) - optval;
+	if (put_user(bytes_copied, optlen))
+		err = -EFAULT;
+
+unlock:
+	sctp_read_unlock(addr_lock);
+	return err;
+}
+
/* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR)
*
* Requests that the local SCTP stack use the enclosed peer address as
@@ -3478,17 +3696,15 @@ static int sctp_getsockopt_primary_addr(struct sock *sk, int len,
static int sctp_getsockopt_adaption_layer(struct sock *sk, int len,
char __user *optval, int __user *optlen)
{
- __u32 val;
+ struct sctp_setadaption adaption;
- if (len < sizeof(__u32))
+ if (len != sizeof(struct sctp_setadaption))
return -EINVAL;
- len = sizeof(__u32);
- val = sctp_sk(sk)->adaption_ind;
- if (put_user(len, optlen))
- return -EFAULT;
- if (copy_to_user(optval, &val, len))
+ adaption.ssb_adaption_ind = sctp_sk(sk)->adaption_ind;
+ if (copy_to_user(optval, &adaption, len))
return -EFAULT;
+
return 0;
}
@@ -3807,12 +4023,20 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
case SCTP_INITMSG:
retval = sctp_getsockopt_initmsg(sk, len, optval, optlen);
break;
- case SCTP_GET_PEER_ADDRS_NUM:
- retval = sctp_getsockopt_peer_addrs_num(sk, len, optval,
+ case SCTP_GET_PEER_ADDRS_NUM_OLD:
+ retval = sctp_getsockopt_peer_addrs_num_old(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_GET_LOCAL_ADDRS_NUM_OLD:
+ retval = sctp_getsockopt_local_addrs_num_old(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_GET_PEER_ADDRS_OLD:
+ retval = sctp_getsockopt_peer_addrs_old(sk, len, optval,
optlen);
break;
- case SCTP_GET_LOCAL_ADDRS_NUM:
- retval = sctp_getsockopt_local_addrs_num(sk, len, optval,
+ case SCTP_GET_LOCAL_ADDRS_OLD:
+ retval = sctp_getsockopt_local_addrs_old(sk, len, optval,
optlen);
break;
case SCTP_GET_PEER_ADDRS:
@@ -4194,8 +4418,7 @@ out:
sctp_release_sock(sk);
return err;
cleanup:
- if (tfm)
- sctp_crypto_free_tfm(tfm);
+ sctp_crypto_free_tfm(tfm);
goto out;
}
@@ -4439,8 +4662,8 @@ SCTP_STATIC int sctp_msghdr_parse(const struct msghdr *msg,
/* Minimally, validate the sinfo_flags. */
if (cmsgs->info->sinfo_flags &
- ~(MSG_UNORDERED | MSG_ADDR_OVER |
- MSG_ABORT | MSG_EOF))
+ ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
+ SCTP_ABORT | SCTP_EOF))
return -EINVAL;
break;
@@ -4892,7 +5115,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) {
event = sctp_skb2event(skb);
if (event->asoc == assoc) {
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &oldsk->sk_receive_queue);
__skb_queue_tail(&newsk->sk_receive_queue, skb);
}
}
@@ -4921,7 +5144,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) {
event = sctp_skb2event(skb);
if (event->asoc == assoc) {
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &oldsp->pd_lobby);
__skb_queue_tail(queue, skb);
}
}
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
index 25037daf3fa..cbe2513d282 100644
--- a/net/sctp/ssnmap.c
+++ b/net/sctp/ssnmap.c
@@ -58,7 +58,7 @@ static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
* Allocate room to store at least 'len' contiguous TSNs.
*/
struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
struct sctp_ssnmap *retval;
int size;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index dc4893474f1..75b28dd634f 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -42,6 +42,7 @@
*/
#include <net/sctp/structs.h>
+#include <net/sctp/sctp.h>
#include <linux/sysctl.h>
static ctl_handler sctp_sysctl_jiffies_ms;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index d2f04ebe508..6bc27200e6c 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -57,7 +57,7 @@
/* Initialize a new transport from provided memory. */
static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
const union sctp_addr *addr,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
/* Copy in the address. */
peer->ipaddr = *addr;
@@ -122,7 +122,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
/* Allocate and initialize a new transport. */
struct sctp_transport *sctp_transport_new(const union sctp_addr *addr,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
struct sctp_transport *transport;
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 0abd5101107..e049f41faa4 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -74,7 +74,7 @@ SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags)
/* Create a new sctp_ulpevent. */
SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
struct sctp_ulpevent *event;
struct sk_buff *skb;
@@ -136,7 +136,7 @@ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event)
struct sctp_ulpevent *sctp_ulpevent_make_assoc_change(
const struct sctp_association *asoc,
__u16 flags, __u16 state, __u16 error, __u16 outbound,
- __u16 inbound, unsigned int __nocast gfp)
+ __u16 inbound, gfp_t gfp)
{
struct sctp_ulpevent *event;
struct sctp_assoc_change *sac;
@@ -237,7 +237,7 @@ fail:
struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
const struct sctp_association *asoc,
const struct sockaddr_storage *aaddr,
- int flags, int state, int error, unsigned int __nocast gfp)
+ int flags, int state, int error, gfp_t gfp)
{
struct sctp_ulpevent *event;
struct sctp_paddr_change *spc;
@@ -350,7 +350,7 @@ fail:
*/
struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
const struct sctp_association *asoc, struct sctp_chunk *chunk,
- __u16 flags, unsigned int __nocast gfp)
+ __u16 flags, gfp_t gfp)
{
struct sctp_ulpevent *event;
struct sctp_remote_error *sre;
@@ -448,7 +448,7 @@ fail:
*/
struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
const struct sctp_association *asoc, struct sctp_chunk *chunk,
- __u16 flags, __u32 error, unsigned int __nocast gfp)
+ __u16 flags, __u32 error, gfp_t gfp)
{
struct sctp_ulpevent *event;
struct sctp_send_failed *ssf;
@@ -557,7 +557,7 @@ fail:
*/
struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
const struct sctp_association *asoc,
- __u16 flags, unsigned int __nocast gfp)
+ __u16 flags, gfp_t gfp)
{
struct sctp_ulpevent *event;
struct sctp_shutdown_event *sse;
@@ -620,7 +620,7 @@ fail:
* 5.3.1.6 SCTP_ADAPTION_INDICATION
*/
struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication(
- const struct sctp_association *asoc, unsigned int __nocast gfp)
+ const struct sctp_association *asoc, gfp_t gfp)
{
struct sctp_ulpevent *event;
struct sctp_adaption_event *sai;
@@ -657,7 +657,7 @@ fail:
*/
struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
struct sctp_chunk *chunk,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
struct sctp_ulpevent *event = NULL;
struct sk_buff *skb;
@@ -698,7 +698,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
event->ssn = ntohs(chunk->subh.data_hdr->ssn);
event->ppid = chunk->subh.data_hdr->ppid;
if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) {
- event->flags |= MSG_UNORDERED;
+ event->flags |= SCTP_UNORDERED;
event->cumtsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
}
event->tsn = ntohl(chunk->subh.data_hdr->tsn);
@@ -719,7 +719,7 @@ fail:
*/
struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
const struct sctp_association *asoc, __u32 indication,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
struct sctp_ulpevent *event;
struct sctp_pdapi_event *pd;
@@ -824,7 +824,7 @@ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event,
*
* recvmsg() flags:
*
- * MSG_UNORDERED - This flag is present when the message was sent
+ * SCTP_UNORDERED - This flag is present when the message was sent
* non-ordered.
*/
sinfo.sinfo_flags = event->flags;
@@ -839,7 +839,7 @@ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event,
* This field will hold the current cumulative TSN as
* known by the underlying SCTP layer. Note this field is
* ignored when sending and only valid for a receive
- * operation when sinfo_flags are set to MSG_UNORDERED.
+ * operation when sinfo_flags are set to SCTP_UNORDERED.
*/
sinfo.sinfo_cumtsn = event->cumtsn;
/* sinfo_assoc_id: sizeof (sctp_assoc_t)
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 8bbc279d6c9..2080b2d28c9 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -50,9 +50,9 @@
/* Forward declarations for internal helpers. */
static struct sctp_ulpevent * sctp_ulpq_reasm(struct sctp_ulpq *ulpq,
- struct sctp_ulpevent *);
+ struct sctp_ulpevent *);
static struct sctp_ulpevent * sctp_ulpq_order(struct sctp_ulpq *,
- struct sctp_ulpevent *);
+ struct sctp_ulpevent *);
/* 1st Level Abstractions */
@@ -100,7 +100,7 @@ void sctp_ulpq_free(struct sctp_ulpq *ulpq)
/* Process an incoming DATA chunk. */
int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
struct sk_buff_head temp;
sctp_data_chunk_t *hdr;
@@ -125,7 +125,9 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
event = sctp_ulpq_order(ulpq, event);
}
- /* Send event to the ULP. */
+ /* Send event to the ULP. 'event' is the sctp_ulpevent for
+ * very first SKB on the 'temp' list.
+ */
if (event)
sctp_ulpq_tail_event(ulpq, event);
@@ -158,14 +160,18 @@ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq)
return sctp_clear_pd(ulpq->asoc->base.sk);
}
-
-
+/* If the SKB of 'event' is on a list, it is the first such member
+ * of that list.
+ */
int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
{
struct sock *sk = ulpq->asoc->base.sk;
- struct sk_buff_head *queue;
+ struct sk_buff_head *queue, *skb_list;
+ struct sk_buff *skb = sctp_event2skb(event);
int clear_pd = 0;
+ skb_list = (struct sk_buff_head *) skb->prev;
+
/* If the socket is just going to throw this away, do not
* even try to deliver it.
*/
@@ -197,10 +203,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
/* If we are harvesting multiple skbs they will be
* collected on a list.
*/
- if (sctp_event2skb(event)->list)
- sctp_skb_list_tail(sctp_event2skb(event)->list, queue);
+ if (skb_list)
+ sctp_skb_list_tail(skb_list, queue);
else
- __skb_queue_tail(queue, sctp_event2skb(event));
+ __skb_queue_tail(queue, skb);
/* Did we just complete partial delivery and need to get
* rolling again? Move pending data to the receive
@@ -214,10 +220,11 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
return 1;
out_free:
- if (sctp_event2skb(event)->list)
- sctp_queue_purge_ulpevents(sctp_event2skb(event)->list);
+ if (skb_list)
+ sctp_queue_purge_ulpevents(skb_list);
else
sctp_ulpevent_free(event);
+
return 0;
}
@@ -269,7 +276,7 @@ static inline void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq,
* payload was fragmented on the way and ip had to reassemble them.
* We add the rest of skb's to the first skb's fraglist.
*/
-static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag, struct sk_buff *l_frag)
+static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *queue, struct sk_buff *f_frag, struct sk_buff *l_frag)
{
struct sk_buff *pos;
struct sctp_ulpevent *event;
@@ -294,7 +301,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag,
skb_shinfo(f_frag)->frag_list = pos;
/* Remove the first fragment from the reassembly queue. */
- __skb_unlink(f_frag, f_frag->list);
+ __skb_unlink(f_frag, queue);
while (pos) {
pnext = pos->next;
@@ -304,7 +311,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag,
f_frag->data_len += pos->len;
/* Remove the fragment from the reassembly queue. */
- __skb_unlink(pos, pos->list);
+ __skb_unlink(pos, queue);
/* Break if we have reached the last fragment. */
if (pos == l_frag)
@@ -375,7 +382,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_u
done:
return retval;
found:
- retval = sctp_make_reassembled_event(first_frag, pos);
+ retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, pos);
if (retval)
retval->msg_flags |= MSG_EOR;
goto done;
@@ -435,7 +442,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq
* further.
*/
done:
- retval = sctp_make_reassembled_event(first_frag, last_frag);
+ retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag);
if (retval && is_last)
retval->msg_flags |= MSG_EOR;
@@ -527,7 +534,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *u
* further.
*/
done:
- retval = sctp_make_reassembled_event(first_frag, last_frag);
+ retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag);
return retval;
}
@@ -537,6 +544,7 @@ done:
static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
struct sctp_ulpevent *event)
{
+ struct sk_buff_head *event_list;
struct sk_buff *pos, *tmp;
struct sctp_ulpevent *cevent;
struct sctp_stream *in;
@@ -547,6 +555,8 @@ static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
ssn = event->ssn;
in = &ulpq->asoc->ssnmap->in;
+ event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev;
+
/* We are holding the chunks by stream, by SSN. */
sctp_skb_for_each(pos, &ulpq->lobby, tmp) {
cevent = (struct sctp_ulpevent *) pos->cb;
@@ -567,10 +577,10 @@ static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
/* Found it, so mark in the ssnmap. */
sctp_ssn_next(in, sid);
- __skb_unlink(pos, pos->list);
+ __skb_unlink(pos, &ulpq->lobby);
/* Attach all gathered skbs to the event. */
- __skb_queue_tail(sctp_event2skb(event)->list, pos);
+ __skb_queue_tail(event_list, pos);
}
}
@@ -626,7 +636,7 @@ static inline void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq,
}
static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
- struct sctp_ulpevent *event)
+ struct sctp_ulpevent *event)
{
__u16 sid, ssn;
struct sctp_stream *in;
@@ -667,7 +677,7 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq)
{
struct sk_buff *pos, *tmp;
struct sctp_ulpevent *cevent;
- struct sctp_ulpevent *event = NULL;
+ struct sctp_ulpevent *event;
struct sctp_stream *in;
struct sk_buff_head temp;
__u16 csid, cssn;
@@ -675,6 +685,8 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq)
in = &ulpq->asoc->ssnmap->in;
/* We are holding the chunks by stream, by SSN. */
+ skb_queue_head_init(&temp);
+ event = NULL;
sctp_skb_for_each(pos, &ulpq->lobby, tmp) {
cevent = (struct sctp_ulpevent *) pos->cb;
csid = cevent->stream;
@@ -686,19 +698,20 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq)
/* Found it, so mark in the ssnmap. */
sctp_ssn_next(in, csid);
- __skb_unlink(pos, pos->list);
+ __skb_unlink(pos, &ulpq->lobby);
if (!event) {
/* Create a temporary list to collect chunks on. */
event = sctp_skb2event(pos);
- skb_queue_head_init(&temp);
__skb_queue_tail(&temp, sctp_event2skb(event));
} else {
/* Attach all gathered skbs to the event. */
- __skb_queue_tail(sctp_event2skb(event)->list, pos);
+ __skb_queue_tail(&temp, pos);
}
}
- /* Send event to the ULP. */
+ /* Send event to the ULP. 'event' is the sctp_ulpevent for
+ * very first SKB on the 'temp' list.
+ */
if (event)
sctp_ulpq_tail_event(ulpq, event);
}
@@ -779,7 +792,7 @@ static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed)
/* Partial deliver the first message as there is pressure on rwnd. */
void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
struct sctp_chunk *chunk,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
struct sctp_ulpevent *event;
struct sctp_association *asoc;
@@ -803,7 +816,7 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
/* Renege some packets to make room for an incoming chunk. */
void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
- unsigned int __nocast gfp)
+ gfp_t gfp)
{
struct sctp_association *asoc;
__u16 needed, freed;
@@ -842,7 +855,7 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
/* Notify the application if an association is aborted and in
* partial delivery mode. Send up any pending received messages.
*/
-void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, unsigned int __nocast gfp)
+void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
{
struct sctp_ulpevent *ev = NULL;
struct sock *sk;
diff --git a/net/socket.c b/net/socket.c
index 6f2a1788197..3145103cdf5 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -70,6 +70,8 @@
#include <linux/seq_file.h>
#include <linux/wanrouter.h>
#include <linux/if_bridge.h>
+#include <linux/if_frad.h>
+#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/cache.h>
@@ -272,7 +274,7 @@ int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ule
#define SOCKFS_MAGIC 0x534F434B
-static kmem_cache_t * sock_inode_cachep;
+static kmem_cache_t * sock_inode_cachep __read_mostly;
static struct inode *sock_alloc_inode(struct super_block *sb)
{
@@ -331,7 +333,7 @@ static struct super_block *sockfs_get_sb(struct file_system_type *fs_type,
return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC);
}
-static struct vfsmount *sock_mnt;
+static struct vfsmount *sock_mnt __read_mostly;
static struct file_system_type sock_fs_type = {
.name = "sockfs",
@@ -404,6 +406,7 @@ int sock_map_fd(struct socket *sock)
file->f_mode = FMODE_READ | FMODE_WRITE;
file->f_flags = O_RDWR;
file->f_pos = 0;
+ file->private_data = sock;
fd_install(fd, file);
}
@@ -436,6 +439,9 @@ struct socket *sockfd_lookup(int fd, int *err)
return NULL;
}
+ if (file->f_op == &socket_file_ops)
+ return file->private_data; /* set in sock_map_fd */
+
inode = file->f_dentry->d_inode;
if (!S_ISSOCK(inode->i_mode)) {
*err = -ENOTSOCK;
@@ -661,7 +667,7 @@ static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
}
iocb->private = x;
x->kiocb = iocb;
- sock = SOCKET_I(iocb->ki_filp->f_dentry->d_inode);
+ sock = iocb->ki_filp->private_data;
x->async_msg.msg_name = NULL;
x->async_msg.msg_namelen = 0;
@@ -703,7 +709,7 @@ static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
}
iocb->private = x;
x->kiocb = iocb;
- sock = SOCKET_I(iocb->ki_filp->f_dentry->d_inode);
+ sock = iocb->ki_filp->private_data;
x->async_msg.msg_name = NULL;
x->async_msg.msg_namelen = 0;
@@ -720,13 +726,13 @@ static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
return __sock_sendmsg(iocb, sock, &x->async_msg, size);
}
-ssize_t sock_sendpage(struct file *file, struct page *page,
- int offset, size_t size, loff_t *ppos, int more)
+static ssize_t sock_sendpage(struct file *file, struct page *page,
+ int offset, size_t size, loff_t *ppos, int more)
{
struct socket *sock;
int flags;
- sock = SOCKET_I(file->f_dentry->d_inode);
+ sock = file->private_data;
flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
if (more)
@@ -735,14 +741,14 @@ ssize_t sock_sendpage(struct file *file, struct page *page,
return sock->ops->sendpage(sock, page, offset, size, flags);
}
-static int sock_readv_writev(int type, struct inode * inode,
+static int sock_readv_writev(int type,
struct file * file, const struct iovec * iov,
long count, size_t size)
{
struct msghdr msg;
struct socket *sock;
- sock = SOCKET_I(inode);
+ sock = file->private_data;
msg.msg_name = NULL;
msg.msg_namelen = 0;
@@ -769,7 +775,7 @@ static ssize_t sock_readv(struct file *file, const struct iovec *vector,
int i;
for (i = 0 ; i < count ; i++)
tot_len += vector[i].iov_len;
- return sock_readv_writev(VERIFY_WRITE, file->f_dentry->d_inode,
+ return sock_readv_writev(VERIFY_WRITE,
file, vector, count, tot_len);
}
@@ -780,7 +786,7 @@ static ssize_t sock_writev(struct file *file, const struct iovec *vector,
int i;
for (i = 0 ; i < count ; i++)
tot_len += vector[i].iov_len;
- return sock_readv_writev(VERIFY_READ, file->f_dentry->d_inode,
+ return sock_readv_writev(VERIFY_READ,
file, vector, count, tot_len);
}
@@ -834,7 +840,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
void __user *argp = (void __user *)arg;
int pid, err;
- sock = SOCKET_I(file->f_dentry->d_inode);
+ sock = file->private_data;
if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
err = dev_ioctl(cmd, argp);
} else
@@ -933,18 +939,18 @@ static unsigned int sock_poll(struct file *file, poll_table * wait)
/*
* We can't return errors to poll, so it's either yes or no.
*/
- sock = SOCKET_I(file->f_dentry->d_inode);
+ sock = file->private_data;
return sock->ops->poll(file, sock, wait);
}
static int sock_mmap(struct file * file, struct vm_area_struct * vma)
{
- struct socket *sock = SOCKET_I(file->f_dentry->d_inode);
+ struct socket *sock = file->private_data;
return sock->ops->mmap(file, sock, vma);
}
-int sock_close(struct inode *inode, struct file *filp)
+static int sock_close(struct inode *inode, struct file *filp)
{
/*
* It was possible the inode is NULL we were
@@ -989,7 +995,7 @@ static int sock_fasync(int fd, struct file *filp, int on)
return -ENOMEM;
}
- sock = SOCKET_I(filp->f_dentry->d_inode);
+ sock = filp->private_data;
if ((sk=sock->sk) == NULL) {
kfree(fna);
@@ -1139,8 +1145,11 @@ static int __sock_create(int family, int type, int protocol, struct socket **res
if (!try_module_get(net_families[family]->owner))
goto out_release;
- if ((err = net_families[family]->create(sock, protocol)) < 0)
+ if ((err = net_families[family]->create(sock, protocol)) < 0) {
+ sock->ops = NULL;
goto out_module_put;
+ }
+
/*
* Now to bump the refcnt of the [loadable] module that owns this
* socket at sock_release time we decrement its refcnt.
@@ -1354,16 +1363,16 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr, int _
newsock->type = sock->type;
newsock->ops = sock->ops;
- err = security_socket_accept(sock, newsock);
- if (err)
- goto out_release;
-
/*
* We don't need try_module_get here, as the listening socket (sock)
* has the protocol module (sock->ops->owner) held.
*/
__module_get(newsock->ops->owner);
+ err = security_socket_accept(sock, newsock);
+ if (err)
+ goto out_release;
+
err = sock->ops->accept(sock, newsock, sock->file->f_flags);
if (err < 0)
goto out_release;
@@ -1694,7 +1703,9 @@ asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags)
struct socket *sock;
char address[MAX_SOCK_ADDR];
struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
- unsigned char ctl[sizeof(struct cmsghdr) + 20]; /* 20 is size of ipv6_pktinfo */
+ unsigned char ctl[sizeof(struct cmsghdr) + 20]
+ __attribute__ ((aligned (sizeof(__kernel_size_t))));
+ /* 20 is size of ipv6_pktinfo */
unsigned char *ctl_buf = ctl;
struct msghdr msg_sys;
int err, ctl_len, iov_size, total_len;
@@ -1739,10 +1750,11 @@ asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags)
goto out_freeiov;
ctl_len = msg_sys.msg_controllen;
if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
- err = cmsghdr_from_user_compat_to_kern(&msg_sys, ctl, sizeof(ctl));
+ err = cmsghdr_from_user_compat_to_kern(&msg_sys, sock->sk, ctl, sizeof(ctl));
if (err)
goto out_freeiov;
ctl_buf = msg_sys.msg_control;
+ ctl_len = msg_sys.msg_controllen;
} else if (ctl_len) {
if (ctl_len > sizeof(ctl))
{
@@ -1855,7 +1867,8 @@ asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, unsigned int flag
if (err < 0)
goto out_freeiov;
}
- err = __put_user(msg_sys.msg_flags, COMPAT_FLAGS(msg));
+ err = __put_user((msg_sys.msg_flags & ~MSG_CMSG_COMPAT),
+ COMPAT_FLAGS(msg));
if (err)
goto out_freeiov;
if (MSG_CMSG_COMPAT & flags)
@@ -2023,9 +2036,6 @@ int sock_unregister(int family)
return 0;
}
-
-extern void sk_init(void);
-
void __init sock_init(void)
{
/*
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 46a2ce00a29..cdcab9ca4c6 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -6,7 +6,7 @@
obj-$(CONFIG_SUNRPC) += sunrpc.o
obj-$(CONFIG_SUNRPC_GSS) += auth_gss/
-sunrpc-y := clnt.o xprt.o sched.o \
+sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
auth.o auth_null.o auth_unix.o \
svc.o svcsock.o svcauth.o svcauth_unix.o \
pmap_clnt.o timer.o xdr.o \
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 505e2d4b3d6..8c7756036e9 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -11,7 +11,6 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/errno.h>
-#include <linux/socket.h>
#include <linux/sunrpc/clnt.h>
#include <linux/spinlock.h>
@@ -300,11 +299,10 @@ put_rpccred(struct rpc_cred *cred)
void
rpcauth_unbindcred(struct rpc_task *task)
{
- struct rpc_auth *auth = task->tk_auth;
struct rpc_cred *cred = task->tk_msg.rpc_cred;
dprintk("RPC: %4d releasing %s cred %p\n",
- task->tk_pid, auth->au_ops->au_name, cred);
+ task->tk_pid, task->tk_auth->au_ops->au_name, cred);
put_rpccred(cred);
task->tk_msg.rpc_cred = NULL;
@@ -313,22 +311,22 @@ rpcauth_unbindcred(struct rpc_task *task)
u32 *
rpcauth_marshcred(struct rpc_task *task, u32 *p)
{
- struct rpc_auth *auth = task->tk_auth;
struct rpc_cred *cred = task->tk_msg.rpc_cred;
dprintk("RPC: %4d marshaling %s cred %p\n",
- task->tk_pid, auth->au_ops->au_name, cred);
+ task->tk_pid, task->tk_auth->au_ops->au_name, cred);
+
return cred->cr_ops->crmarshal(task, p);
}
u32 *
rpcauth_checkverf(struct rpc_task *task, u32 *p)
{
- struct rpc_auth *auth = task->tk_auth;
struct rpc_cred *cred = task->tk_msg.rpc_cred;
dprintk("RPC: %4d validating %s cred %p\n",
- task->tk_pid, auth->au_ops->au_name, cred);
+ task->tk_pid, task->tk_auth->au_ops->au_name, cred);
+
return cred->cr_ops->crvalidate(task, p);
}
@@ -364,12 +362,12 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp,
int
rpcauth_refreshcred(struct rpc_task *task)
{
- struct rpc_auth *auth = task->tk_auth;
struct rpc_cred *cred = task->tk_msg.rpc_cred;
int err;
dprintk("RPC: %4d refreshing %s cred %p\n",
- task->tk_pid, auth->au_ops->au_name, cred);
+ task->tk_pid, task->tk_auth->au_ops->au_name, cred);
+
err = cred->cr_ops->crrefresh(task);
if (err < 0)
task->tk_status = err;
diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile
index fe1b874084b..f3431a7e33d 100644
--- a/net/sunrpc/auth_gss/Makefile
+++ b/net/sunrpc/auth_gss/Makefile
@@ -10,7 +10,7 @@ auth_rpcgss-objs := auth_gss.o gss_generic_token.o \
obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o
rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \
- gss_krb5_seqnum.o
+ gss_krb5_seqnum.o gss_krb5_wrap.o
obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 2f7b867161d..f44f46f1d8e 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -42,9 +42,8 @@
#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
-#include <linux/socket.h>
-#include <linux/in.h>
#include <linux/sched.h>
+#include <linux/pagemap.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/auth.h>
#include <linux/sunrpc/auth_gss.h>
@@ -846,10 +845,8 @@ gss_marshal(struct rpc_task *task, u32 *p)
/* We compute the checksum for the verifier over the xdr-encoded bytes
* starting with the xid and ending at the end of the credential: */
- iov.iov_base = req->rq_snd_buf.head[0].iov_base;
- if (task->tk_client->cl_xprt->stream)
- /* See clnt.c:call_header() */
- iov.iov_base += 4;
+ iov.iov_base = xprt_skip_transport_header(task->tk_xprt,
+ req->rq_snd_buf.head[0].iov_base);
iov.iov_len = (u8 *)p - (u8 *)iov.iov_base;
xdr_buf_from_iov(&iov, &verf_buf);
@@ -857,9 +854,7 @@ gss_marshal(struct rpc_task *task, u32 *p)
*p++ = htonl(RPC_AUTH_GSS);
mic.data = (u8 *)(p + 1);
- maj_stat = gss_get_mic(ctx->gc_gss_ctx,
- GSS_C_QOP_DEFAULT,
- &verf_buf, &mic);
+ maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
if (maj_stat == GSS_S_CONTEXT_EXPIRED) {
cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
} else if (maj_stat != 0) {
@@ -890,10 +885,8 @@ static u32 *
gss_validate(struct rpc_task *task, u32 *p)
{
struct rpc_cred *cred = task->tk_msg.rpc_cred;
- struct gss_cred *gss_cred = container_of(cred, struct gss_cred,
- gc_base);
struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
- u32 seq, qop_state;
+ u32 seq;
struct kvec iov;
struct xdr_buf verf_buf;
struct xdr_netobj mic;
@@ -914,23 +907,14 @@ gss_validate(struct rpc_task *task, u32 *p)
mic.data = (u8 *)p;
mic.len = len;
- maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state);
+ maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
if (maj_stat)
goto out_bad;
- switch (gss_cred->gc_service) {
- case RPC_GSS_SVC_NONE:
- /* verifier data, flavor, length: */
- task->tk_auth->au_rslack = XDR_QUADLEN(len) + 2;
- break;
- case RPC_GSS_SVC_INTEGRITY:
- /* verifier data, flavor, length, length, sequence number: */
- task->tk_auth->au_rslack = XDR_QUADLEN(len) + 4;
- break;
- case RPC_GSS_SVC_PRIVACY:
- goto out_bad;
- }
+ /* We leave it to unwrap to calculate au_rslack. For now we just
+ * calculate the length of the verifier: */
+ task->tk_auth->au_verfsize = XDR_QUADLEN(len) + 2;
gss_put_ctx(ctx);
dprintk("RPC: %4u GSS gss_validate: gss_verify_mic succeeded.\n",
task->tk_pid);
@@ -975,8 +959,7 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
p = iov->iov_base + iov->iov_len;
mic.data = (u8 *)(p + 1);
- maj_stat = gss_get_mic(ctx->gc_gss_ctx,
- GSS_C_QOP_DEFAULT, &integ_buf, &mic);
+ maj_stat = gss_get_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
status = -EIO; /* XXX? */
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
@@ -990,6 +973,113 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
return 0;
}
+/*
+ * Send-buffer release hook installed by alloc_enc_pages(): give back every
+ * scratch page listed in rq_enc_pages, then the pointer array itself.
+ */
+static void
+priv_release_snd_buf(struct rpc_rqst *rqstp)
+{
+	int idx = 0;
+
+	while (idx < rqstp->rq_enc_pages_num) {
+		__free_page(rqstp->rq_enc_pages[idx]);
+		idx++;
+	}
+	kfree(rqstp->rq_enc_pages);
+}
+
+/*
+ * Allocate the scratch pages that will receive the encrypted copy of the
+ * page data in rqstp->rq_snd_buf, plus one extra page at the end of the
+ * array.
+ *
+ * On success rq_enc_pages / rq_enc_pages_num describe the new pages and
+ * rq_release_snd_buf is pointed at priv_release_snd_buf() so that they are
+ * released again.  If the send buffer carries no page data, nothing is
+ * allocated and rq_enc_pages_num is set to 0.
+ *
+ * Returns 0 on success, or -EAGAIN if an allocation failed; in the failure
+ * case everything allocated here has been freed again and the request is
+ * left with no scratch pages recorded.
+ */
+static int
+alloc_enc_pages(struct rpc_rqst *rqstp)
+{
+	struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
+	int first, last, i;
+
+	if (snd_buf->page_len == 0) {
+		rqstp->rq_enc_pages_num = 0;
+		return 0;
+	}
+
+	first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
+	last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT;
+	/* one page for each page the data touches, plus one spare: */
+	rqstp->rq_enc_pages_num = last - first + 1 + 1;
+	rqstp->rq_enc_pages
+		= kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *),
+				GFP_NOFS);
+	if (!rqstp->rq_enc_pages)
+		goto out;
+	for (i = 0; i < rqstp->rq_enc_pages_num; i++) {
+		rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS);
+		if (rqstp->rq_enc_pages[i] == NULL)
+			goto out_free;
+	}
+	rqstp->rq_release_snd_buf = priv_release_snd_buf;
+	return 0;
+out_free:
+	/* Give back the pages obtained before the failing allocation ... */
+	for (i--; i >= 0; i--)
+		__free_page(rqstp->rq_enc_pages[i]);
+	/* ... and the pointer array, which used to be leaked here: */
+	kfree(rqstp->rq_enc_pages);
+	rqstp->rq_enc_pages = NULL;
+out:
+	/* Don't leave a stale count behind for a later release call: */
+	rqstp->rq_enc_pages_num = 0;
+	return -EAGAIN;
+}
+
+/*
+ * Encode an RPC call body and protect it with the privacy service.
+ *
+ * Layout written starting at p: one word for the length of the wrapped
+ * data (filled in at the end), then the sequence number, then whatever
+ * encode() produces.  Everything from the sequence number onwards is
+ * handed to gss_wrap(), and the result is zero-padded to a multiple of
+ * four bytes.
+ *
+ * The pages of the send buffer are replaced by scratch pages obtained from
+ * alloc_enc_pages(); the original pages are passed to gss_wrap() as input.
+ *
+ * Returns 0 on success, the error from encode() or alloc_enc_pages(), or
+ * -EIO if gss_wrap() fails with anything other than
+ * GSS_S_CONTEXT_EXPIRED.
+ */
+static inline int
+gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ kxdrproc_t encode, struct rpc_rqst *rqstp, u32 *p, void *obj)
+{
+ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
+ u32 offset;
+ u32 maj_stat;
+ int status;
+ u32 *opaque_len;
+ struct page **inpages;
+ int first;
+ int pad;
+ struct kvec *iov;
+ char *tmp;
+
+ /* reserve the length word; offset marks where the wrapped data starts */
+ opaque_len = p++;
+ offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
+ *p++ = htonl(rqstp->rq_seqno);
+
+ status = encode(rqstp, p, obj);
+ if (status)
+ return status;
+
+ status = alloc_enc_pages(rqstp);
+ if (status)
+ return status;
+ /* remember the original pages as input and rebase the buffer so that
+ * its page data starts within the first scratch page: */
+ first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
+ inpages = snd_buf->pages + first;
+ snd_buf->pages = rqstp->rq_enc_pages;
+ snd_buf->page_base -= first << PAGE_CACHE_SHIFT;
+ /* Give the tail its own page, in case we need extra space in the
+ * head when wrapping: */
+ /* NOTE(review): alloc_enc_pages() allocates nothing when page_len is 0,
+ * so if only the tail were non-empty the index below would be -1.
+ * Presumably a tail never occurs without page data -- confirm. */
+ if (snd_buf->page_len || snd_buf->tail[0].iov_len) {
+ tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]);
+ memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len);
+ snd_buf->tail[0].iov_base = tmp;
+ }
+ maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages);
+ /* RPC_SLACK_SPACE should prevent this ever happening: */
+ BUG_ON(snd_buf->len > snd_buf->buflen);
+ status = -EIO;
+ /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was
+ * done anyway, so it's safe to put the request on the wire: */
+ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
+ cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
+ else if (maj_stat)
+ return status;
+
+ /* now that the wrapped size is known, fill in the length word */
+ *opaque_len = htonl(snd_buf->len - offset);
+ /* guess whether we're in the head or the tail: */
+ if (snd_buf->page_len || snd_buf->tail[0].iov_len)
+ iov = snd_buf->tail;
+ else
+ iov = snd_buf->head;
+ p = iov->iov_base + iov->iov_len;
+ /* zero bytes needed to round the wrapped data up to a multiple of 4 */
+ pad = 3 - ((snd_buf->len - offset - 1) & 3);
+ memset(p, 0, pad);
+ iov->iov_len += pad;
+ snd_buf->len += pad;
+
+ return 0;
+}
+
static int
gss_wrap_req(struct rpc_task *task,
kxdrproc_t encode, void *rqstp, u32 *p, void *obj)
@@ -1017,6 +1107,8 @@ gss_wrap_req(struct rpc_task *task,
rqstp, p, obj);
break;
case RPC_GSS_SVC_PRIVACY:
+ status = gss_wrap_req_priv(cred, ctx, encode,
+ rqstp, p, obj);
break;
}
out:
@@ -1054,8 +1146,7 @@ gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset))
return status;
- maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf,
- &mic, NULL);
+ maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
if (maj_stat != GSS_S_COMPLETE)
@@ -1063,6 +1154,35 @@ gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
return 0;
}
+/*
+ * Undo the privacy protection on an RPC reply.
+ *
+ * *p points at the length word that precedes the wrapped data in the head
+ * of rqstp->rq_rcv_buf.  The receive buffer is truncated to the end of the
+ * wrapped data, gss_unwrap() is applied to it, and the sequence number that
+ * follows is compared with rq_seqno.  On success *p is left pointing just
+ * past the sequence number.
+ *
+ * Returns 0 on success and -EIO if the length does not fit in the buffer,
+ * if gss_unwrap() fails, or if the sequence number does not match.  An
+ * expired context additionally clears RPCAUTH_CRED_UPTODATE on the cred.
+ */
+static inline int
+gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+		struct rpc_rqst *rqstp, u32 **p)
+{
+	struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
+	u32 offset;
+	u32 opaque_len;
+	u32 maj_stat;
+	int status = -EIO;
+
+	opaque_len = ntohl(*(*p)++);
+	offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base;
+	/* opaque_len was read from the reply, so test it against the space
+	 * remaining instead of computing offset + opaque_len, which could
+	 * wrap around and slip past the check: */
+	if (offset > rcv_buf->len || opaque_len > rcv_buf->len - offset)
+		return status;
+	/* remove padding: */
+	rcv_buf->len = offset + opaque_len;
+
+	maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, rcv_buf);
+	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
+		cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
+	if (maj_stat != GSS_S_COMPLETE)
+		return status;
+	/* the unwrapped data begins with the sequence number of the call: */
+	if (ntohl(*(*p)++) != rqstp->rq_seqno)
+		return status;
+
+	return 0;
+}
+
+
static int
gss_unwrap_resp(struct rpc_task *task,
kxdrproc_t decode, void *rqstp, u32 *p, void *obj)
@@ -1071,6 +1191,9 @@ gss_unwrap_resp(struct rpc_task *task,
struct gss_cred *gss_cred = container_of(cred, struct gss_cred,
gc_base);
struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
+ u32 *savedp = p;
+ struct kvec *head = ((struct rpc_rqst *)rqstp)->rq_rcv_buf.head;
+ int savedlen = head->iov_len;
int status = -EIO;
if (ctx->gc_proc != RPC_GSS_PROC_DATA)
@@ -1084,8 +1207,14 @@ gss_unwrap_resp(struct rpc_task *task,
goto out;
break;
case RPC_GSS_SVC_PRIVACY:
+ status = gss_unwrap_resp_priv(cred, ctx, rqstp, &p);
+ if (status)
+ goto out;
break;
}
+ /* take into account extra slack for integrity and privacy cases: */
+ task->tk_auth->au_rslack = task->tk_auth->au_verfsize + (p - savedp)
+ + (savedlen - head->iov_len);
out_decode:
status = decode(rqstp, p, obj);
out:
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 24c21f2a33a..97c981fa6b8 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -37,7 +37,7 @@
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
#include <linux/crypto.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
@@ -75,9 +75,7 @@ krb5_encrypt(
memcpy(local_iv, iv, crypto_tfm_alg_ivsize(tfm));
memcpy(out, in, length);
- sg[0].page = virt_to_page(out);
- sg[0].offset = offset_in_page(out);
- sg[0].length = length;
+ sg_set_buf(sg, out, length);
ret = crypto_cipher_encrypt_iv(tfm, sg, sg, length, local_iv);
@@ -117,9 +115,7 @@ krb5_decrypt(
memcpy(local_iv,iv, crypto_tfm_alg_ivsize(tfm));
memcpy(out, in, length);
- sg[0].page = virt_to_page(out);
- sg[0].offset = offset_in_page(out);
- sg[0].length = length;
+ sg_set_buf(sg, out, length);
ret = crypto_cipher_decrypt_iv(tfm, sg, sg, length, local_iv);
@@ -132,24 +128,91 @@ out:
EXPORT_SYMBOL(krb5_decrypt);
-static void
-buf_to_sg(struct scatterlist *sg, char *ptr, int len) {
- sg->page = virt_to_page(ptr);
- sg->offset = offset_in_page(ptr);
- sg->length = len;
+/*
+ * Walk len bytes of buf starting at byte offset, visiting the head, then
+ * the pages, then the tail, and call actor() once for each contiguous
+ * piece with a one-entry scatterlist describing it.
+ *
+ * Returns 0 when all len bytes were visited, the first non-zero value
+ * returned by actor(), or -EINVAL if bytes remain to be processed after
+ * the tail has been handled.
+ */
+static int
+process_xdr_buf(struct xdr_buf *buf, int offset, int len,
+ int (*actor)(struct scatterlist *, void *), void *data)
+{
+ int i, page_len, thislen, page_offset, ret = 0;
+ struct scatterlist sg[1];
+
+ /* head: either skip it entirely or process the part after offset */
+ if (offset >= buf->head[0].iov_len) {
+ offset -= buf->head[0].iov_len;
+ } else {
+ thislen = buf->head[0].iov_len - offset;
+ if (thislen > len)
+ thislen = len;
+ sg_set_buf(sg, buf->head[0].iov_base + offset, thislen);
+ ret = actor(sg, data);
+ if (ret)
+ goto out;
+ offset = 0;
+ len -= thislen;
+ }
+ if (len == 0)
+ goto out;
+
+ /* pages: offset is now relative to the start of the page data */
+ if (offset >= buf->page_len) {
+ offset -= buf->page_len;
+ } else {
+ page_len = buf->page_len - offset;
+ if (page_len > len)
+ page_len = len;
+ len -= page_len;
+ page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1);
+ i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT;
+ /* the first piece runs only to the end of its page; every later
+ * piece starts at the beginning of the next page */
+ thislen = PAGE_CACHE_SIZE - page_offset;
+ do {
+ if (thislen > page_len)
+ thislen = page_len;
+ sg->page = buf->pages[i];
+ sg->offset = page_offset;
+ sg->length = thislen;
+ ret = actor(sg, data);
+ if (ret)
+ goto out;
+ page_len -= thislen;
+ i++;
+ page_offset = 0;
+ thislen = PAGE_CACHE_SIZE;
+ } while (page_len != 0);
+ offset = 0;
+ }
+ if (len == 0)
+ goto out;
+
+ /* tail: offset is now relative to the start of the tail */
+ if (offset < buf->tail[0].iov_len) {
+ thislen = buf->tail[0].iov_len - offset;
+ if (thislen > len)
+ thislen = len;
+ sg_set_buf(sg, buf->tail[0].iov_base + offset, thislen);
+ ret = actor(sg, data);
+ len -= thislen;
+ }
+ /* the caller asked for more bytes than the buffer holds */
+ if (len != 0)
+ ret = -EINVAL;
+out:
+ return ret;
+}
+
+/*
+ * process_xdr_buf() actor used by make_checksum(): feed one scatterlist
+ * entry to the digest transform passed in data.  Always returns 0.
+ */
+static int
+checksummer(struct scatterlist *sg, void *data)
+{
+	crypto_digest_update((struct crypto_tfm *)data, sg, 1);
+	return 0;
}
/* checksum the plaintext data and hdrlen bytes of the token header */
s32
make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
- struct xdr_netobj *cksum)
+ int body_offset, struct xdr_netobj *cksum)
{
char *cksumname;
struct crypto_tfm *tfm = NULL; /* XXX add to ctx? */
struct scatterlist sg[1];
u32 code = GSS_S_FAILURE;
- int len, thislen, offset;
- int i;
switch (cksumtype) {
case CKSUMTYPE_RSA_MD5:
@@ -160,50 +223,173 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
" unsupported checksum %d", cksumtype);
goto out;
}
- if (!(tfm = crypto_alloc_tfm(cksumname, 0)))
+ if (!(tfm = crypto_alloc_tfm(cksumname, CRYPTO_TFM_REQ_MAY_SLEEP)))
goto out;
cksum->len = crypto_tfm_alg_digestsize(tfm);
if ((cksum->data = kmalloc(cksum->len, GFP_KERNEL)) == NULL)
goto out;
crypto_digest_init(tfm);
- buf_to_sg(sg, header, hdrlen);
+ sg_set_buf(sg, header, hdrlen);
crypto_digest_update(tfm, sg, 1);
- if (body->head[0].iov_len) {
- buf_to_sg(sg, body->head[0].iov_base, body->head[0].iov_len);
- crypto_digest_update(tfm, sg, 1);
- }
-
- len = body->page_len;
- if (len != 0) {
- offset = body->page_base & (PAGE_CACHE_SIZE - 1);
- i = body->page_base >> PAGE_CACHE_SHIFT;
- thislen = PAGE_CACHE_SIZE - offset;
- do {
- if (thislen > len)
- thislen = len;
- sg->page = body->pages[i];
- sg->offset = offset;
- sg->length = thislen;
- kmap(sg->page); /* XXX kmap_atomic? */
- crypto_digest_update(tfm, sg, 1);
- kunmap(sg->page);
- len -= thislen;
- i++;
- offset = 0;
- thislen = PAGE_CACHE_SIZE;
- } while(len != 0);
- }
- if (body->tail[0].iov_len) {
- buf_to_sg(sg, body->tail[0].iov_base, body->tail[0].iov_len);
- crypto_digest_update(tfm, sg, 1);
- }
+ process_xdr_buf(body, body_offset, body->len - body_offset,
+ checksummer, tfm);
crypto_digest_final(tfm, cksum->data);
code = 0;
out:
- if (tfm)
- crypto_free_tfm(tfm);
+ crypto_free_tfm(tfm);
return code;
}
EXPORT_SYMBOL(make_checksum);
+
+/* State carried between encryptor() calls while gss_encrypt_xdr_buf()
+ * walks an xdr_buf. */
+struct encryptor_desc {
+ u8 iv[8]; /* XXX hard-coded blocksize */
+ struct crypto_tfm *tfm; /* cipher passed to crypto_cipher_encrypt_iv() */
+ int pos; /* byte position in outbuf of the next fragment */
+ struct xdr_buf *outbuf; /* buffer being encrypted */
+ struct page **pages; /* pages to read from for data in the page section */
+ struct scatterlist infrags[4]; /* pending input fragments */
+ struct scatterlist outfrags[4]; /* pending output fragments */
+ int fragno; /* number of pending fragments */
+ int fraglen; /* total bytes in the pending fragments */
+};
+
+/*
+ * process_xdr_buf() actor for gss_encrypt_xdr_buf().
+ *
+ * Queues the fragment described by sg, then encrypts as many whole 8-byte
+ * blocks of the queued data as are available.  Any bytes left over (fewer
+ * than 8, taken from the end of sg) are kept as fragment 0 for the next
+ * call.  For fragments that lie in the page section of the output buffer
+ * the input is read from desc->pages instead of from the page in sg.
+ *
+ * Returns 0, or the error returned by crypto_cipher_encrypt_iv().
+ */
+static int
+encryptor(struct scatterlist *sg, void *data)
+{
+ struct encryptor_desc *desc = data;
+ struct xdr_buf *outbuf = desc->outbuf;
+ struct page *in_page;
+ int thislen = desc->fraglen + sg->length;
+ int fraglen, ret;
+ int page_pos;
+
+ /* Worst case is 4 fragments: head, end of page 1, start
+ * of page 2, tail. Anything more is a bug. */
+ BUG_ON(desc->fragno > 3);
+ desc->infrags[desc->fragno] = *sg;
+ desc->outfrags[desc->fragno] = *sg;
+
+ /* position of this fragment relative to the start of the page data */
+ page_pos = desc->pos - outbuf->head[0].iov_len;
+ if (page_pos >= 0 && page_pos < outbuf->page_len) {
+ /* pages are not in place: */
+ int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT;
+ in_page = desc->pages[i];
+ } else {
+ in_page = sg->page;
+ }
+ desc->infrags[desc->fragno].page = in_page;
+ desc->fragno++;
+ desc->fraglen += sg->length;
+ desc->pos += sg->length;
+
+ /* split the queued bytes into whole blocks and a remainder */
+ fraglen = thislen & 7; /* XXX hardcoded blocksize */
+ thislen -= fraglen;
+
+ /* not even one whole block queued yet; wait for more data */
+ if (thislen == 0)
+ return 0;
+
+ ret = crypto_cipher_encrypt_iv(desc->tfm, desc->outfrags, desc->infrags,
+ thislen, desc->iv);
+ if (ret)
+ return ret;
+ if (fraglen) {
+ /* carry the unencrypted remainder at the end of sg over to the
+ * next call as the first pending fragment */
+ desc->outfrags[0].page = sg->page;
+ desc->outfrags[0].offset = sg->offset + sg->length - fraglen;
+ desc->outfrags[0].length = fraglen;
+ desc->infrags[0] = desc->outfrags[0];
+ desc->infrags[0].page = in_page;
+ desc->fragno = 1;
+ desc->fraglen = fraglen;
+ } else {
+ desc->fragno = 0;
+ desc->fraglen = 0;
+ }
+ return 0;
+}
+
+/*
+ * Encrypt everything in buf from byte offset to the end, using tfm with an
+ * all-zero initial IV.  Data belonging to the page section is read from
+ * pages.  The length to encrypt must be a multiple of the cipher block
+ * size.  Returns whatever process_xdr_buf() returns.
+ */
+int
+gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset,
+		struct page **pages)
+{
+	struct encryptor_desc desc;
+
+	BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0);
+
+	desc.tfm = tfm;
+	desc.outbuf = buf;
+	desc.pages = pages;
+	desc.pos = offset;
+	desc.fraglen = 0;
+	desc.fragno = 0;
+	memset(desc.iv, 0, sizeof(desc.iv));
+
+	return process_xdr_buf(buf, offset, buf->len - offset, encryptor, &desc);
+}
+
+EXPORT_SYMBOL(gss_encrypt_xdr_buf);
+
+/* State carried between decryptor() calls while gss_decrypt_xdr_buf()
+ * walks an xdr_buf. */
+struct decryptor_desc {
+ u8 iv[8]; /* XXX hard-coded blocksize */
+ struct crypto_tfm *tfm; /* cipher passed to crypto_cipher_decrypt_iv() */
+ struct scatterlist frags[4]; /* pending fragments, decrypted in place */
+ int fragno; /* number of pending fragments */
+ int fraglen; /* total bytes in the pending fragments */
+};
+
+/*
+ * process_xdr_buf() actor for gss_decrypt_xdr_buf().
+ *
+ * Queues the fragment described by sg, then decrypts in place as many
+ * whole 8-byte blocks of the queued data as are available.  Any bytes left
+ * over (fewer than 8, taken from the end of sg) are kept as fragment 0 for
+ * the next call.
+ *
+ * Returns 0, or the error returned by crypto_cipher_decrypt_iv().
+ */
+static int
+decryptor(struct scatterlist *sg, void *data)
+{
+ struct decryptor_desc *desc = data;
+ int thislen = desc->fraglen + sg->length;
+ int fraglen, ret;
+
+ /* Worst case is 4 fragments: head, end of page 1, start
+ * of page 2, tail. Anything more is a bug. */
+ BUG_ON(desc->fragno > 3);
+ desc->frags[desc->fragno] = *sg;
+ desc->fragno++;
+ desc->fraglen += sg->length;
+
+ /* split the queued bytes into whole blocks and a remainder */
+ fraglen = thislen & 7; /* XXX hardcoded blocksize */
+ thislen -= fraglen;
+
+ /* not even one whole block queued yet; wait for more data */
+ if (thislen == 0)
+ return 0;
+
+ ret = crypto_cipher_decrypt_iv(desc->tfm, desc->frags, desc->frags,
+ thislen, desc->iv);
+ if (ret)
+ return ret;
+ if (fraglen) {
+ /* carry the undecrypted remainder at the end of sg over to the
+ * next call as the first pending fragment */
+ desc->frags[0].page = sg->page;
+ desc->frags[0].offset = sg->offset + sg->length - fraglen;
+ desc->frags[0].length = fraglen;
+ desc->fragno = 1;
+ desc->fraglen = fraglen;
+ } else {
+ desc->fragno = 0;
+ desc->fraglen = 0;
+ }
+ return 0;
+}
+
+/*
+ * Decrypt in place everything in buf from byte offset to the end, using
+ * tfm with an all-zero initial IV.  The length to decrypt must be a
+ * multiple of the cipher block size.  Returns whatever process_xdr_buf()
+ * returns.
+ */
+int
+gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset)
+{
+	struct decryptor_desc desc;
+	int ret;
+
+	/* XXXJBF: */
+	BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0);
+
+	desc.tfm = tfm;
+	desc.fraglen = 0;
+	desc.fragno = 0;
+	memset(desc.iv, 0, sizeof(desc.iv));
+
+	ret = process_xdr_buf(buf, offset, buf->len - offset, decryptor, &desc);
+	return ret;
+}
+
+EXPORT_SYMBOL(gss_decrypt_xdr_buf);
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index cf726510df8..5f1f806a0b1 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -39,7 +39,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/sunrpc/auth.h>
-#include <linux/in.h>
#include <linux/sunrpc/gss_krb5.h>
#include <linux/sunrpc/xdr.h>
#include <linux/crypto.h>
@@ -185,52 +184,18 @@ static void
gss_delete_sec_context_kerberos(void *internal_ctx) {
struct krb5_ctx *kctx = internal_ctx;
- if (kctx->seq)
- crypto_free_tfm(kctx->seq);
- if (kctx->enc)
- crypto_free_tfm(kctx->enc);
- if (kctx->mech_used.data)
- kfree(kctx->mech_used.data);
+ crypto_free_tfm(kctx->seq);
+ crypto_free_tfm(kctx->enc);
+ kfree(kctx->mech_used.data);
kfree(kctx);
}
-static u32
-gss_verify_mic_kerberos(struct gss_ctx *ctx,
- struct xdr_buf *message,
- struct xdr_netobj *mic_token,
- u32 *qstate) {
- u32 maj_stat = 0;
- int qop_state;
- struct krb5_ctx *kctx = ctx->internal_ctx_id;
-
- maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state,
- KG_TOK_MIC_MSG);
- if (!maj_stat && qop_state)
- *qstate = qop_state;
-
- dprintk("RPC: gss_verify_mic_kerberos returning %d\n", maj_stat);
- return maj_stat;
-}
-
-static u32
-gss_get_mic_kerberos(struct gss_ctx *ctx,
- u32 qop,
- struct xdr_buf *message,
- struct xdr_netobj *mic_token) {
- u32 err = 0;
- struct krb5_ctx *kctx = ctx->internal_ctx_id;
-
- err = krb5_make_token(kctx, qop, message, mic_token, KG_TOK_MIC_MSG);
-
- dprintk("RPC: gss_get_mic_kerberos returning %d\n",err);
-
- return err;
-}
-
static struct gss_api_ops gss_kerberos_ops = {
.gss_import_sec_context = gss_import_sec_context_kerberos,
.gss_get_mic = gss_get_mic_kerberos,
.gss_verify_mic = gss_verify_mic_kerberos,
+ .gss_wrap = gss_wrap_kerberos,
+ .gss_unwrap = gss_unwrap_kerberos,
.gss_delete_sec_context = gss_delete_sec_context_kerberos,
};
@@ -245,6 +210,11 @@ static struct pf_desc gss_kerberos_pfs[] = {
.service = RPC_GSS_SVC_INTEGRITY,
.name = "krb5i",
},
+ [2] = {
+ .pseudoflavor = RPC_AUTH_GSS_KRB5P,
+ .service = RPC_GSS_SVC_PRIVACY,
+ .name = "krb5p",
+ },
};
static struct gss_api_mech gss_kerberos_mech = {
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index afeeb8715a7..13f8ae97945 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -70,22 +70,13 @@
# define RPCDBG_FACILITY RPCDBG_AUTH
#endif
-static inline int
-gss_krb5_padding(int blocksize, int length) {
- /* Most of the code is block-size independent but in practice we
- * use only 8: */
- BUG_ON(blocksize != 8);
- return 8 - (length & 7);
-}
-
u32
-krb5_make_token(struct krb5_ctx *ctx, int qop_req,
- struct xdr_buf *text, struct xdr_netobj *token,
- int toktype)
+gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
+ struct xdr_netobj *token)
{
+ struct krb5_ctx *ctx = gss_ctx->internal_ctx_id;
s32 checksum_type;
struct xdr_netobj md5cksum = {.len = 0, .data = NULL};
- int blocksize = 0, tmsglen;
unsigned char *ptr, *krb5_hdr, *msg_start;
s32 now;
@@ -93,9 +84,6 @@ krb5_make_token(struct krb5_ctx *ctx, int qop_req,
now = get_seconds();
- if (qop_req != 0)
- goto out_err;
-
switch (ctx->signalg) {
case SGN_ALG_DES_MAC_MD5:
checksum_type = CKSUMTYPE_RSA_MD5;
@@ -111,21 +99,13 @@ krb5_make_token(struct krb5_ctx *ctx, int qop_req,
goto out_err;
}
- if (toktype == KG_TOK_WRAP_MSG) {
- blocksize = crypto_tfm_alg_blocksize(ctx->enc);
- tmsglen = blocksize + text->len
- + gss_krb5_padding(blocksize, blocksize + text->len);
- } else {
- tmsglen = 0;
- }
-
- token->len = g_token_size(&ctx->mech_used, 22 + tmsglen);
+ token->len = g_token_size(&ctx->mech_used, 22);
ptr = token->data;
- g_make_token_header(&ctx->mech_used, 22 + tmsglen, &ptr);
+ g_make_token_header(&ctx->mech_used, 22, &ptr);
- *ptr++ = (unsigned char) ((toktype>>8)&0xff);
- *ptr++ = (unsigned char) (toktype&0xff);
+ *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff);
+ *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff);
/* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */
krb5_hdr = ptr - 2;
@@ -133,17 +113,9 @@ krb5_make_token(struct krb5_ctx *ctx, int qop_req,
*(u16 *)(krb5_hdr + 2) = htons(ctx->signalg);
memset(krb5_hdr + 4, 0xff, 4);
- if (toktype == KG_TOK_WRAP_MSG)
- *(u16 *)(krb5_hdr + 4) = htons(ctx->sealalg);
- if (toktype == KG_TOK_WRAP_MSG) {
- /* XXX removing support for now */
- goto out_err;
- } else { /* Sign only. */
- if (make_checksum(checksum_type, krb5_hdr, 8, text,
- &md5cksum))
+ if (make_checksum(checksum_type, krb5_hdr, 8, text, 0, &md5cksum))
goto out_err;
- }
switch (ctx->signalg) {
case SGN_ALG_DES_MAC_MD5:
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
index 8767fc53183..2030475d98e 100644
--- a/net/sunrpc/auth_gss/gss_krb5_unseal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -68,21 +68,14 @@
#endif
-/* message_buffer is an input if toktype is MIC and an output if it is WRAP:
- * If toktype is MIC: read_token is a mic token, and message_buffer is the
- * data that the mic was supposedly taken over.
- * If toktype is WRAP: read_token is a wrap token, and message_buffer is used
- * to return the decrypted data.
- */
+/* read_token is a mic token, and message_buffer is the data that the mic was
+ * supposedly taken over. */
-/* XXX will need to change prototype and/or just split into a separate function
- * when we add privacy (because read_token will be in pages too). */
u32
-krb5_read_token(struct krb5_ctx *ctx,
- struct xdr_netobj *read_token,
- struct xdr_buf *message_buffer,
- int *qop_state, int toktype)
+gss_verify_mic_kerberos(struct gss_ctx *gss_ctx,
+ struct xdr_buf *message_buffer, struct xdr_netobj *read_token)
{
+ struct krb5_ctx *ctx = gss_ctx->internal_ctx_id;
int signalg;
int sealalg;
s32 checksum_type;
@@ -100,16 +93,12 @@ krb5_read_token(struct krb5_ctx *ctx,
read_token->len))
goto out;
- if ((*ptr++ != ((toktype>>8)&0xff)) || (*ptr++ != (toktype&0xff)))
+ if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) ||
+ (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) )
goto out;
/* XXX sanity-check bodysize?? */
- if (toktype == KG_TOK_WRAP_MSG) {
- /* XXX gone */
- goto out;
- }
-
/* get the sign and seal algorithms */
signalg = ptr[0] + (ptr[1] << 8);
@@ -120,14 +109,7 @@ krb5_read_token(struct krb5_ctx *ctx,
if ((ptr[4] != 0xff) || (ptr[5] != 0xff))
goto out;
- if (((toktype != KG_TOK_WRAP_MSG) && (sealalg != 0xffff)) ||
- ((toktype == KG_TOK_WRAP_MSG) && (sealalg == 0xffff)))
- goto out;
-
- /* in the current spec, there is only one valid seal algorithm per
- key type, so a simple comparison is ok */
-
- if ((toktype == KG_TOK_WRAP_MSG) && !(sealalg == ctx->sealalg))
+ if (sealalg != 0xffff)
goto out;
/* there are several mappings of seal algorithms to sign algorithms,
@@ -154,7 +136,7 @@ krb5_read_token(struct krb5_ctx *ctx,
switch (signalg) {
case SGN_ALG_DES_MAC_MD5:
ret = make_checksum(checksum_type, ptr - 2, 8,
- message_buffer, &md5cksum);
+ message_buffer, 0, &md5cksum);
if (ret)
goto out;
@@ -175,9 +157,6 @@ krb5_read_token(struct krb5_ctx *ctx,
/* it got through unscathed. Make sure the context is unexpired */
- if (qop_state)
- *qop_state = GSS_C_QOP_DEFAULT;
-
now = get_seconds();
ret = GSS_S_CONTEXT_EXPIRED;
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
new file mode 100644
index 00000000000..af777cf9f25
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -0,0 +1,363 @@
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/sunrpc/gss_krb5.h>
+#include <linux/random.h>
+#include <linux/pagemap.h>
+#include <asm/scatterlist.h>
+#include <linux/crypto.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+/*
+ * Number of pad bytes to append to length bytes of data: always between 1
+ * and 8, so data that is already a multiple of 8 gets a full extra block.
+ * Only a block size of 8 is handled; anything else is a BUG.
+ */
+static inline int
+gss_krb5_padding(int blocksize, int length)
+{
+	int partial;
+
+	BUG_ON(blocksize != 8);
+	partial = length & 7;
+	return 8 - partial;
+}
+
+/*
+ * Append padding to buf so that the data from offset onwards becomes a
+ * multiple of blocksize.  Each pad byte holds the pad count.  The bytes go
+ * at the end of the tail when the buffer has page or tail data, otherwise
+ * at the end of the head; the chosen segment and buf->len both grow.
+ */
+static inline void
+gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize)
+{
+	int padding = gss_krb5_padding(blocksize, buf->len - offset);
+	struct kvec *last;
+	char *end;
+
+	if (!buf->page_len && !buf->tail[0].iov_len)
+		last = &buf->head[0];
+	else
+		last = &buf->tail[0];
+
+	end = last->iov_base + last->iov_len;
+	memset(end, padding, padding);
+	last->iov_len += padding;
+	buf->len += padding;
+}
+
+/*
+ * Strip the trailing padding from buf.
+ *
+ * The last byte of the buffer (at buf->len - 1, wherever that falls: head,
+ * pages or tail) is read as the pad count.  buf->len is reduced by that
+ * count; when the last byte lies in the head, the head length is reduced
+ * as well.
+ *
+ * Returns 0 on success, or -EINVAL if the pad count is larger than
+ * blocksize, is not smaller than buf->len, or (head case) is larger than
+ * the head.
+ */
+static inline int
+gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize)
+{
+ u8 *ptr;
+ u8 pad;
+ int len = buf->len;
+
+ /* the data ends in the head */
+ if (len <= buf->head[0].iov_len) {
+ pad = *(u8 *)(buf->head[0].iov_base + len - 1);
+ if (pad > buf->head[0].iov_len)
+ return -EINVAL;
+ buf->head[0].iov_len -= pad;
+ goto out;
+ } else
+ len -= buf->head[0].iov_len;
+ /* the data ends in the pages: map the last page to read the byte */
+ if (len <= buf->page_len) {
+ int last = (buf->page_base + len - 1)
+ >>PAGE_CACHE_SHIFT;
+ int offset = (buf->page_base + len - 1)
+ & (PAGE_CACHE_SIZE - 1);
+ ptr = kmap_atomic(buf->pages[last], KM_SKB_SUNRPC_DATA);
+ pad = *(ptr + offset);
+ kunmap_atomic(ptr, KM_SKB_SUNRPC_DATA);
+ goto out;
+ } else
+ len -= buf->page_len;
+ /* otherwise the data ends in the tail */
+ BUG_ON(len > buf->tail[0].iov_len);
+ pad = *(u8 *)(buf->tail[0].iov_base + len - 1);
+out:
+ /* XXX: NOTE: we do not adjust the page lengths--they represent
+ * a range of data in the real filesystem page cache, and we need
+ * to know that range so the xdr code can properly place read data.
+ * However adjusting the head length, as we do above, is harmless.
+ * In the case of a request that fits into a single page, the server
+ * also uses length and head length together to determine the original
+ * start of the request to copy the request for deferral; so it's
+ * easier on the server if we adjust head and tail length in tandem.
+ * It's not really a problem that we don't fool with the page and
+ * tail lengths, though--at worst badly formed xdr might lead the
+ * server to attempt to parse the padding.
+ * XXX: Document all these weird requirements for gss mechanism
+ * wrap/unwrap functions. */
+ if (pad > blocksize)
+ return -EINVAL;
+ if (buf->len > pad)
+ buf->len -= pad;
+ else
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * Write an 8-byte confounder at p.  Only a block size of 8 is handled.
+ */
+static inline void
+make_confounder(char *p, int blocksize)
+{
+	static u64 counter = 0;
+	u64 *dst = (u64 *)p;
+
+	/* rfc1964 asks for a "random" confounder, but uniqueness is all
+	 * that is really needed.  Even that is unnecessary here: this
+	 * "gssapi" implementation only serves rpcsec_gss, so every buffer
+	 * we encrypt already starts with a unique sequence number.  A plain
+	 * counter is a half-hearted hedge; guaranteeing uniqueness would
+	 * mean handling atomicity and rollover, which is not worth it. */
+
+	BUG_ON(blocksize != 8);
+	*dst = counter;
+	counter++;
+}
+
+/* Assumptions: the head and tail of inbuf are ours to play with.
+ * The pages, however, may be real pages in the page cache and we replace
+ * them with scratch pages from **pages before writing to them. */
+/* XXX: obviously the above should be documentation of wrap interface,
+ * and shouldn't be in this kerberos-specific file. */
+
+/* XXX factor out common code with seal/unseal. */
+
+/*
+ * Wrap (checksum and encrypt) the data in buf from byte offset onwards.
+ *
+ * The data is padded to the cipher block size, the part that lives in the
+ * head is shifted up to make room for the token header and confounder,
+ * the header is built in the gap, and everything from the confounder to
+ * the end of buf is encrypted.  While checksumming, and as the source for
+ * encryption, the page data is read from pages rather than buf->pages.
+ *
+ * Returns GSS_S_COMPLETE on success, GSS_S_CONTEXT_EXPIRED if the data was
+ * wrapped but the context's end time has passed, and GSS_S_FAILURE on any
+ * error.
+ */
+u32
+gss_wrap_kerberos(struct gss_ctx *ctx, int offset,
+		struct xdr_buf *buf, struct page **pages)
+{
+	struct krb5_ctx		*kctx = ctx->internal_ctx_id;
+	s32			checksum_type;
+	struct xdr_netobj	md5cksum = {.len = 0, .data = NULL};
+	int			blocksize = 0, plainlen;
+	unsigned char		*ptr, *krb5_hdr, *msg_start;
+	s32			now;
+	int			headlen;
+	struct page		**tmp_pages;
+	s32			cksum_err;
+
+	dprintk("RPC: gss_wrap_kerberos\n");
+
+	now = get_seconds();
+
+	switch (kctx->signalg) {
+		case SGN_ALG_DES_MAC_MD5:
+			checksum_type = CKSUMTYPE_RSA_MD5;
+			break;
+		default:
+			dprintk("RPC: gss_krb5_seal: kctx->signalg %d not"
+				" supported\n", kctx->signalg);
+			goto out_err;
+	}
+	if (kctx->sealalg != SEAL_ALG_NONE && kctx->sealalg != SEAL_ALG_DES) {
+		dprintk("RPC: gss_krb5_seal: kctx->sealalg %d not supported\n",
+			kctx->sealalg);
+		goto out_err;
+	}
+
+	blocksize = crypto_tfm_alg_blocksize(kctx->enc);
+	gss_krb5_add_padding(buf, offset, blocksize);
+	BUG_ON((buf->len - offset) % blocksize);
+	/* what gets encrypted: the confounder plus the padded data */
+	plainlen = blocksize + buf->len - offset;
+
+	/* bytes to insert in front of the data: token header + confounder */
+	headlen = g_token_size(&kctx->mech_used, 22 + plainlen) -
+						(buf->len - offset);
+
+	ptr = buf->head[0].iov_base + offset;
+	/* shift data to make room for header. */
+	/* XXX Would be cleverer to encrypt while copying. */
+	/* XXX bounds checking, slack, etc. */
+	memmove(ptr + headlen, ptr, buf->head[0].iov_len - offset);
+	buf->head[0].iov_len += headlen;
+	buf->len += headlen;
+	BUG_ON((buf->len - offset - headlen) % blocksize);
+
+	g_make_token_header(&kctx->mech_used, 22 + plainlen, &ptr);
+
+
+	*ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff);
+	*ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff);
+
+	/* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */
+	krb5_hdr = ptr - 2;
+	msg_start = krb5_hdr + 24;
+	/* XXXJBF: */ BUG_ON(buf->head[0].iov_base + offset + headlen != msg_start + blocksize);
+
+	*(u16 *)(krb5_hdr + 2) = htons(kctx->signalg);
+	memset(krb5_hdr + 4, 0xff, 4);
+	*(u16 *)(krb5_hdr + 4) = htons(kctx->sealalg);
+
+	make_confounder(msg_start, blocksize);
+
+	/* XXXJBF: UGH!: */
+	/* Checksum the plaintext pages.  Put buf->pages back before looking
+	 * at the result, so that a failure does not leave the caller's
+	 * buffer pointing at the input pages: */
+	tmp_pages = buf->pages;
+	buf->pages = pages;
+	cksum_err = make_checksum(checksum_type, krb5_hdr, 8, buf,
+				offset + headlen - blocksize, &md5cksum);
+	buf->pages = tmp_pages;
+	if (cksum_err)
+		goto out_err;
+
+	switch (kctx->signalg) {
+	case SGN_ALG_DES_MAC_MD5:
+		if (krb5_encrypt(kctx->seq, NULL, md5cksum.data,
+				  md5cksum.data, md5cksum.len))
+			goto out_err;
+		memcpy(krb5_hdr + 16,
+		       md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH,
+		       KRB5_CKSUM_LENGTH);
+
+		dprintk("RPC: make_seal_token: cksum data: \n");
+		print_hexl((u32 *) (krb5_hdr + 16), KRB5_CKSUM_LENGTH, 0);
+		break;
+	default:
+		BUG();
+	}
+
+	/* The checksum now lives in the header.  Clear the pointer after
+	 * freeing so that the error path below cannot free it again: */
+	kfree(md5cksum.data);
+	md5cksum.data = NULL;
+
+	/* XXX would probably be more efficient to compute checksum
+	 * and encrypt at the same time: */
+	if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff,
+			       kctx->seq_send, krb5_hdr + 16, krb5_hdr + 8)))
+		goto out_err;
+
+	if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize,
+									pages))
+		goto out_err;
+
+	kctx->seq_send++;
+
+	return ((kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE);
+out_err:
+	kfree(md5cksum.data);
+	return GSS_S_FAILURE;
+}
+
+/*
+ * Unwrap (decrypt and verify) the wrap token found in buf at byte offset.
+ *
+ * The token header is validated, everything after the header is decrypted
+ * in place, the checksum and the direction byte of the sequence number are
+ * verified, and finally the part of the plaintext that lies in the head is
+ * moved back down to offset and the trailing padding is removed.
+ *
+ * Returns GSS_S_COMPLETE on success.  Otherwise GSS_S_DEFECTIVE_TOKEN for
+ * a malformed or undecryptable token, GSS_S_BAD_SIG for a checksum,
+ * sequence-number or direction failure, GSS_S_CONTEXT_EXPIRED if the
+ * context's end time has passed, or the non-zero value returned by
+ * make_checksum() / krb5_encrypt().
+ */
+u32
+gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf)
+{
+	struct krb5_ctx		*kctx = ctx->internal_ctx_id;
+	int			signalg;
+	int			sealalg;
+	s32			checksum_type;
+	struct xdr_netobj	md5cksum = {.len = 0, .data = NULL};
+	s32			now;
+	int			direction;
+	s32			seqnum;
+	unsigned char		*ptr;
+	int			bodysize;
+	u32			ret = GSS_S_DEFECTIVE_TOKEN;
+	void			*data_start, *orig_start;
+	int			data_len;
+	int			blocksize;
+
+	dprintk("RPC: gss_unwrap_kerberos\n");
+
+	ptr = (u8 *)buf->head[0].iov_base + offset;
+	if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr,
+					buf->len - offset))
+		goto out;
+
+	if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) ||
+	    (*ptr++ !=  (KG_TOK_WRAP_MSG    &0xff))   )
+		goto out;
+
+	/* XXX sanity-check bodysize?? */
+
+	/* get the sign and seal algorithms */
+
+	signalg = ptr[0] + (ptr[1] << 8);
+	sealalg = ptr[2] + (ptr[3] << 8);
+
+	/* Sanity checks */
+
+	if ((ptr[4] != 0xff) || (ptr[5] != 0xff))
+		goto out;
+
+	if (sealalg == 0xffff)
+		goto out;
+
+	/* in the current spec, there is only one valid seal algorithm per
+	   key type, so a simple comparison is ok */
+
+	if (sealalg != kctx->sealalg)
+		goto out;
+
+	/* there are several mappings of seal algorithms to sign algorithms,
+	   but few enough that we can try them all. */
+
+	if ((kctx->sealalg == SEAL_ALG_NONE && signalg > 1) ||
+	    (kctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) ||
+	    (kctx->sealalg == SEAL_ALG_DES3KD &&
+	     signalg != SGN_ALG_HMAC_SHA1_DES3_KD))
+		goto out;
+
+	/* decrypt everything after the header (ptr + 22), in place */
+	if (gss_decrypt_xdr_buf(kctx->enc, buf,
+			ptr + 22 - (unsigned char *)buf->head[0].iov_base))
+		goto out;
+
+	/* compute the checksum of the message */
+
+	/* initialize the cksum */
+	switch (signalg) {
+	case SGN_ALG_DES_MAC_MD5:
+		checksum_type = CKSUMTYPE_RSA_MD5;
+		break;
+	default:
+		ret = GSS_S_DEFECTIVE_TOKEN;
+		goto out;
+	}
+
+	switch (signalg) {
+	case SGN_ALG_DES_MAC_MD5:
+		ret = make_checksum(checksum_type, ptr - 2, 8, buf,
+			 ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum);
+		if (ret)
+			goto out;
+
+		ret = krb5_encrypt(kctx->seq, NULL, md5cksum.data,
+				   md5cksum.data, md5cksum.len);
+		if (ret)
+			goto out;
+
+		if (memcmp(md5cksum.data + 8, ptr + 14, 8)) {
+			ret = GSS_S_BAD_SIG;
+			goto out;
+		}
+		break;
+	default:
+		ret = GSS_S_DEFECTIVE_TOKEN;
+		goto out;
+	}
+
+	/* it got through unscathed.  Make sure the context is unexpired */
+
+	now = get_seconds();
+
+	ret = GSS_S_CONTEXT_EXPIRED;
+	if (now > kctx->endtime)
+		goto out;
+
+	/* do sequencing checks */
+
+	/* Do not store the result of krb5_get_seq_num() in ret: a zero there
+	 * would turn the direction failure below into GSS_S_COMPLETE. */
+	ret = GSS_S_BAD_SIG;
+	if (krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction,
+				    &seqnum))
+		goto out;
+
+	if ((kctx->initiate && direction != 0xff) ||
+	    (!kctx->initiate && direction != 0))
+		goto out;
+
+	/* Copy the data back to the right position.  XXX: Would probably be
+	 * better to copy and encrypt at the same time. */
+
+	blocksize = crypto_tfm_alg_blocksize(kctx->enc);
+	/* the plaintext starts after the header and the confounder */
+	data_start = ptr + 22 + blocksize;
+	orig_start = buf->head[0].iov_base + offset;
+	data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start;
+	memmove(orig_start, data_start, data_len);
+	buf->head[0].iov_len -= (data_start - orig_start);
+	buf->len -= (data_start - orig_start);
+
+	ret = GSS_S_DEFECTIVE_TOKEN;
+	if (gss_krb5_remove_padding(buf, blocksize))
+		goto out;
+
+	ret = GSS_S_COMPLETE;
+out:
+	kfree(md5cksum.data);
+	return ret;
+}
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 9dfb68377d6..b048bf672da 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -35,7 +35,6 @@
#include <linux/types.h>
#include <linux/slab.h>
-#include <linux/socket.h>
#include <linux/module.h>
#include <linux/sunrpc/msg_prot.h>
#include <linux/sunrpc/gss_asn1.h>
@@ -251,13 +250,11 @@ gss_import_sec_context(const void *input_token, size_t bufsize,
u32
gss_get_mic(struct gss_ctx *context_handle,
- u32 qop,
struct xdr_buf *message,
struct xdr_netobj *mic_token)
{
return context_handle->mech_type->gm_ops
->gss_get_mic(context_handle,
- qop,
message,
mic_token);
}
@@ -267,16 +264,34 @@ gss_get_mic(struct gss_ctx *context_handle,
u32
gss_verify_mic(struct gss_ctx *context_handle,
struct xdr_buf *message,
- struct xdr_netobj *mic_token,
- u32 *qstate)
+ struct xdr_netobj *mic_token)
{
return context_handle->mech_type->gm_ops
->gss_verify_mic(context_handle,
message,
- mic_token,
- qstate);
+ mic_token);
}
+u32
+gss_wrap(struct gss_ctx *ctx_id,
+ int offset,
+ struct xdr_buf *buf,
+ struct page **inpages)
+{
+ return ctx_id->mech_type->gm_ops
+ ->gss_wrap(ctx_id, offset, buf, inpages);
+}
+
+u32
+gss_unwrap(struct gss_ctx *ctx_id,
+ int offset,
+ struct xdr_buf *buf)
+{
+ return ctx_id->mech_type->gm_ops
+ ->gss_unwrap(ctx_id, offset, buf);
+}
+
+
/* gss_delete_sec_context: free all resources associated with context_handle.
* Note this differs from the RFC 2744-specified prototype in that we don't
* bother returning an output token, since it would never be used anyway. */
diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c
index dad05994c3e..39b3edc1469 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_mech.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_mech.c
@@ -214,32 +214,23 @@ static void
gss_delete_sec_context_spkm3(void *internal_ctx) {
struct spkm3_ctx *sctx = internal_ctx;
- if(sctx->derived_integ_key)
- crypto_free_tfm(sctx->derived_integ_key);
- if(sctx->derived_conf_key)
- crypto_free_tfm(sctx->derived_conf_key);
- if(sctx->share_key.data)
- kfree(sctx->share_key.data);
- if(sctx->mech_used.data)
- kfree(sctx->mech_used.data);
+ crypto_free_tfm(sctx->derived_integ_key);
+ crypto_free_tfm(sctx->derived_conf_key);
+ kfree(sctx->share_key.data);
+ kfree(sctx->mech_used.data);
kfree(sctx);
}
static u32
gss_verify_mic_spkm3(struct gss_ctx *ctx,
struct xdr_buf *signbuf,
- struct xdr_netobj *checksum,
- u32 *qstate) {
+ struct xdr_netobj *checksum)
+{
u32 maj_stat = 0;
- int qop_state = 0;
struct spkm3_ctx *sctx = ctx->internal_ctx_id;
dprintk("RPC: gss_verify_mic_spkm3 calling spkm3_read_token\n");
- maj_stat = spkm3_read_token(sctx, checksum, signbuf, &qop_state,
- SPKM_MIC_TOK);
-
- if (!maj_stat && qop_state)
- *qstate = qop_state;
+ maj_stat = spkm3_read_token(sctx, checksum, signbuf, SPKM_MIC_TOK);
dprintk("RPC: gss_verify_mic_spkm3 returning %d\n", maj_stat);
return maj_stat;
@@ -247,15 +238,15 @@ gss_verify_mic_spkm3(struct gss_ctx *ctx,
static u32
gss_get_mic_spkm3(struct gss_ctx *ctx,
- u32 qop,
struct xdr_buf *message_buffer,
- struct xdr_netobj *message_token) {
+ struct xdr_netobj *message_token)
+{
u32 err = 0;
struct spkm3_ctx *sctx = ctx->internal_ctx_id;
dprintk("RPC: gss_get_mic_spkm3\n");
- err = spkm3_make_token(sctx, qop, message_buffer,
+ err = spkm3_make_token(sctx, message_buffer,
message_token, SPKM_MIC_TOK);
return err;
}
@@ -268,8 +259,8 @@ static struct gss_api_ops gss_spkm3_ops = {
};
static struct pf_desc gss_spkm3_pfs[] = {
- {RPC_AUTH_GSS_SPKM, 0, RPC_GSS_SVC_NONE, "spkm3"},
- {RPC_AUTH_GSS_SPKMI, 0, RPC_GSS_SVC_INTEGRITY, "spkm3i"},
+ {RPC_AUTH_GSS_SPKM, RPC_GSS_SVC_NONE, "spkm3"},
+ {RPC_AUTH_GSS_SPKMI, RPC_GSS_SVC_INTEGRITY, "spkm3i"},
};
static struct gss_api_mech gss_spkm3_mech = {
diff --git a/net/sunrpc/auth_gss/gss_spkm3_seal.c b/net/sunrpc/auth_gss/gss_spkm3_seal.c
index 25339868d46..148201e929d 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_seal.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_seal.c
@@ -51,7 +51,7 @@
*/
u32
-spkm3_make_token(struct spkm3_ctx *ctx, int qop_req,
+spkm3_make_token(struct spkm3_ctx *ctx,
struct xdr_buf * text, struct xdr_netobj * token,
int toktype)
{
@@ -68,8 +68,6 @@ spkm3_make_token(struct spkm3_ctx *ctx, int qop_req,
dprintk("RPC: spkm3_make_token\n");
now = jiffies;
- if (qop_req != 0)
- goto out_err;
if (ctx->ctx_id.len != 16) {
dprintk("RPC: spkm3_make_token BAD ctx_id.len %d\n",
diff --git a/net/sunrpc/auth_gss/gss_spkm3_unseal.c b/net/sunrpc/auth_gss/gss_spkm3_unseal.c
index 65ce81bf0bc..c3c0d958610 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_unseal.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_unseal.c
@@ -52,7 +52,7 @@ u32
spkm3_read_token(struct spkm3_ctx *ctx,
struct xdr_netobj *read_token, /* checksum */
struct xdr_buf *message_buffer, /* signbuf */
- int *qop_state, int toktype)
+ int toktype)
{
s32 code;
struct xdr_netobj wire_cksum = {.len =0, .data = NULL};
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 5c8fe3bfc49..e4ada15ed85 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -250,6 +250,7 @@ out:
}
static struct cache_detail rsi_cache = {
+ .owner = THIS_MODULE,
.hash_size = RSI_HASHMAX,
.hash_table = rsi_table,
.name = "auth.rpcsec.init",
@@ -436,6 +437,7 @@ out:
}
static struct cache_detail rsc_cache = {
+ .owner = THIS_MODULE,
.hash_size = RSC_HASHMAX,
.hash_table = rsc_table,
.name = "auth.rpcsec.context",
@@ -564,8 +566,7 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci,
if (rqstp->rq_deferred) /* skip verification of revisited request */
return SVC_OK;
- if (gss_verify_mic(ctx_id, &rpchdr, &checksum, NULL)
- != GSS_S_COMPLETE) {
+ if (gss_verify_mic(ctx_id, &rpchdr, &checksum) != GSS_S_COMPLETE) {
*authp = rpcsec_gsserr_credproblem;
return SVC_DENIED;
}
@@ -602,7 +603,7 @@ gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq)
xdr_buf_from_iov(&iov, &verf_data);
p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len;
mic.data = (u8 *)(p + 1);
- maj_stat = gss_get_mic(ctx_id, 0, &verf_data, &mic);
+ maj_stat = gss_get_mic(ctx_id, &verf_data, &mic);
if (maj_stat != GSS_S_COMPLETE)
return -1;
*p++ = htonl(mic.len);
@@ -708,7 +709,7 @@ unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
goto out;
if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len))
goto out;
- maj_stat = gss_verify_mic(ctx, &integ_buf, &mic, NULL);
+ maj_stat = gss_verify_mic(ctx, &integ_buf, &mic);
if (maj_stat != GSS_S_COMPLETE)
goto out;
if (ntohl(svc_getu32(&buf->head[0])) != seq)
@@ -1010,7 +1011,7 @@ svcauth_gss_release(struct svc_rqst *rqstp)
resv = &resbuf->tail[0];
}
mic.data = (u8 *)resv->iov_base + resv->iov_len + 4;
- if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic))
+ if (gss_get_mic(gsd->rsci->mechctx, &integ_buf, &mic))
goto out_err;
svc_putu32(resv, htonl(mic.len));
memset(mic.data + mic.len, 0,
@@ -1074,7 +1075,9 @@ gss_svc_init(void)
void
gss_svc_shutdown(void)
{
- cache_unregister(&rsc_cache);
- cache_unregister(&rsi_cache);
+ if (cache_unregister(&rsc_cache))
+ printk(KERN_ERR "auth_rpcgss: failed to unregister rsc cache\n");
+ if (cache_unregister(&rsi_cache))
+ printk(KERN_ERR "auth_rpcgss: failed to unregister rsi cache\n");
svc_auth_unregister(RPC_AUTH_GSS);
}
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 9b72d3abf82..f56767aaa92 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -7,9 +7,7 @@
*/
#include <linux/types.h>
-#include <linux/socket.h>
#include <linux/module.h>
-#include <linux/in.h>
#include <linux/utsname.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sched.h>
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 4ff297a9b15..890fb5ea0dc 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -9,8 +9,6 @@
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/module.h>
-#include <linux/socket.h>
-#include <linux/in.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/auth.h>
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 900f5bc7e33..f509e999276 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -177,7 +177,7 @@ void cache_register(struct cache_detail *cd)
cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc);
if (cd->proc_ent) {
struct proc_dir_entry *p;
- cd->proc_ent->owner = THIS_MODULE;
+ cd->proc_ent->owner = cd->owner;
cd->channel_ent = cd->content_ent = NULL;
p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR,
@@ -185,7 +185,7 @@ void cache_register(struct cache_detail *cd)
cd->flush_ent = p;
if (p) {
p->proc_fops = &cache_flush_operations;
- p->owner = THIS_MODULE;
+ p->owner = cd->owner;
p->data = cd;
}
@@ -195,7 +195,7 @@ void cache_register(struct cache_detail *cd)
cd->channel_ent = p;
if (p) {
p->proc_fops = &cache_file_operations;
- p->owner = THIS_MODULE;
+ p->owner = cd->owner;
p->data = cd;
}
}
@@ -205,7 +205,7 @@ void cache_register(struct cache_detail *cd)
cd->content_ent = p;
if (p) {
p->proc_fops = &content_file_operations;
- p->owner = THIS_MODULE;
+ p->owner = cd->owner;
p->data = cd;
}
}
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f17e6153b68..702ede309b0 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1,5 +1,5 @@
/*
- * linux/net/sunrpc/rpcclnt.c
+ * linux/net/sunrpc/clnt.c
*
* This file contains the high-level RPC interface.
* It is modeled as a finite state machine to support both synchronous
@@ -27,7 +27,6 @@
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/in.h>
#include <linux/utsname.h>
#include <linux/sunrpc/clnt.h>
@@ -53,6 +52,7 @@ static void call_allocate(struct rpc_task *task);
static void call_encode(struct rpc_task *task);
static void call_decode(struct rpc_task *task);
static void call_bind(struct rpc_task *task);
+static void call_bind_status(struct rpc_task *task);
static void call_transmit(struct rpc_task *task);
static void call_status(struct rpc_task *task);
static void call_refresh(struct rpc_task *task);
@@ -517,15 +517,8 @@ void
rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize)
{
struct rpc_xprt *xprt = clnt->cl_xprt;
-
- xprt->sndsize = 0;
- if (sndsize)
- xprt->sndsize = sndsize + RPC_SLACK_SPACE;
- xprt->rcvsize = 0;
- if (rcvsize)
- xprt->rcvsize = rcvsize + RPC_SLACK_SPACE;
- if (xprt_connected(xprt))
- xprt_sock_setbufsize(xprt);
+ if (xprt->ops->set_buffer_size)
+ xprt->ops->set_buffer_size(xprt, sndsize, rcvsize);
}
/*
@@ -685,13 +678,11 @@ call_allocate(struct rpc_task *task)
static void
call_encode(struct rpc_task *task)
{
- struct rpc_clnt *clnt = task->tk_client;
struct rpc_rqst *req = task->tk_rqstp;
struct xdr_buf *sndbuf = &req->rq_snd_buf;
struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
unsigned int bufsiz;
kxdrproc_t encode;
- int status;
u32 *p;
dprintk("RPC: %4d call_encode (status %d)\n",
@@ -719,11 +710,15 @@ call_encode(struct rpc_task *task)
rpc_exit(task, -EIO);
return;
}
- if (encode && (status = rpcauth_wrap_req(task, encode, req, p,
- task->tk_msg.rpc_argp)) < 0) {
- printk(KERN_WARNING "%s: can't encode arguments: %d\n",
- clnt->cl_protname, -status);
- rpc_exit(task, status);
+ if (encode == NULL)
+ return;
+
+ task->tk_status = rpcauth_wrap_req(task, encode, req, p,
+ task->tk_msg.rpc_argp);
+ if (task->tk_status == -ENOMEM) {
+ /* XXX: Is this sane? */
+ rpc_delay(task, 3*HZ);
+ task->tk_status = -EAGAIN;
}
}
@@ -734,43 +729,95 @@ static void
call_bind(struct rpc_task *task)
{
struct rpc_clnt *clnt = task->tk_client;
- struct rpc_xprt *xprt = clnt->cl_xprt;
-
- dprintk("RPC: %4d call_bind xprt %p %s connected\n", task->tk_pid,
- xprt, (xprt_connected(xprt) ? "is" : "is not"));
- task->tk_action = (xprt_connected(xprt)) ? call_transmit : call_connect;
+ dprintk("RPC: %4d call_bind (status %d)\n",
+ task->tk_pid, task->tk_status);
+ task->tk_action = call_connect;
if (!clnt->cl_port) {
- task->tk_action = call_connect;
- task->tk_timeout = RPC_CONNECT_TIMEOUT;
+ task->tk_action = call_bind_status;
+ task->tk_timeout = task->tk_xprt->bind_timeout;
rpc_getport(task, clnt);
}
}
/*
- * 4a. Connect to the RPC server (TCP case)
+ * 4a. Sort out bind result
+ */
+static void
+call_bind_status(struct rpc_task *task)
+{
+ int status = -EACCES;
+
+ if (task->tk_status >= 0) {
+ dprintk("RPC: %4d call_bind_status (status %d)\n",
+ task->tk_pid, task->tk_status);
+ task->tk_status = 0;
+ task->tk_action = call_connect;
+ return;
+ }
+
+ switch (task->tk_status) {
+ case -EACCES:
+ dprintk("RPC: %4d remote rpcbind: RPC program/version unavailable\n",
+ task->tk_pid);
+ rpc_delay(task, 3*HZ);
+ goto retry_bind;
+ case -ETIMEDOUT:
+ dprintk("RPC: %4d rpcbind request timed out\n",
+ task->tk_pid);
+ if (RPC_IS_SOFT(task)) {
+ status = -EIO;
+ break;
+ }
+ goto retry_bind;
+ case -EPFNOSUPPORT:
+ dprintk("RPC: %4d remote rpcbind service unavailable\n",
+ task->tk_pid);
+ break;
+ case -EPROTONOSUPPORT:
+ dprintk("RPC: %4d remote rpcbind version 2 unavailable\n",
+ task->tk_pid);
+ break;
+ default:
+ dprintk("RPC: %4d unrecognized rpcbind error (%d)\n",
+ task->tk_pid, -task->tk_status);
+ status = -EIO;
+ break;
+ }
+
+ rpc_exit(task, status);
+ return;
+
+retry_bind:
+ task->tk_status = 0;
+ task->tk_action = call_bind;
+ return;
+}
+
+/*
+ * 4b. Connect to the RPC server
*/
static void
call_connect(struct rpc_task *task)
{
- struct rpc_clnt *clnt = task->tk_client;
+ struct rpc_xprt *xprt = task->tk_xprt;
- dprintk("RPC: %4d call_connect status %d\n",
- task->tk_pid, task->tk_status);
+ dprintk("RPC: %4d call_connect xprt %p %s connected\n",
+ task->tk_pid, xprt,
+ (xprt_connected(xprt) ? "is" : "is not"));
- if (xprt_connected(clnt->cl_xprt)) {
- task->tk_action = call_transmit;
- return;
+ task->tk_action = call_transmit;
+ if (!xprt_connected(xprt)) {
+ task->tk_action = call_connect_status;
+ if (task->tk_status < 0)
+ return;
+ xprt_connect(task);
}
- task->tk_action = call_connect_status;
- if (task->tk_status < 0)
- return;
- xprt_connect(task);
}
/*
- * 4b. Sort out connect result
+ * 4c. Sort out connect result
*/
static void
call_connect_status(struct rpc_task *task)
@@ -778,6 +825,9 @@ call_connect_status(struct rpc_task *task)
struct rpc_clnt *clnt = task->tk_client;
int status = task->tk_status;
+ dprintk("RPC: %5u call_connect_status (status %d)\n",
+ task->tk_pid, task->tk_status);
+
task->tk_status = 0;
if (status >= 0) {
clnt->cl_stats->netreconn++;
@@ -785,17 +835,19 @@ call_connect_status(struct rpc_task *task)
return;
}
- /* Something failed: we may have to rebind */
+ /* Something failed: remote service port may have changed */
if (clnt->cl_autobind)
clnt->cl_port = 0;
+
switch (status) {
case -ENOTCONN:
case -ETIMEDOUT:
case -EAGAIN:
- task->tk_action = (clnt->cl_port == 0) ? call_bind : call_connect;
+ task->tk_action = call_bind;
break;
default:
rpc_exit(task, -EIO);
+ break;
}
}
@@ -815,10 +867,12 @@ call_transmit(struct rpc_task *task)
if (task->tk_status != 0)
return;
/* Encode here so that rpcsec_gss can use correct sequence number. */
- if (!task->tk_rqstp->rq_bytes_sent)
+ if (task->tk_rqstp->rq_bytes_sent == 0) {
call_encode(task);
- if (task->tk_status < 0)
- return;
+ /* Did the encode result in an error condition? */
+ if (task->tk_status != 0)
+ goto out_nosend;
+ }
xprt_transmit(task);
if (task->tk_status < 0)
return;
@@ -826,6 +880,10 @@ call_transmit(struct rpc_task *task)
task->tk_action = NULL;
rpc_wake_up_task(task);
}
+ return;
+out_nosend:
+ /* release socket write lock before attempting to handle error */
+ xprt_abort_transmit(task);
}
/*
@@ -1020,13 +1078,12 @@ static u32 *
call_header(struct rpc_task *task)
{
struct rpc_clnt *clnt = task->tk_client;
- struct rpc_xprt *xprt = clnt->cl_xprt;
struct rpc_rqst *req = task->tk_rqstp;
u32 *p = req->rq_svec[0].iov_base;
/* FIXME: check buffer size? */
- if (xprt->stream)
- *p++ = 0; /* fill in later */
+
+ p = xprt_skip_transport_header(task->tk_xprt, p);
*p++ = req->rq_xid; /* XID */
*p++ = htonl(RPC_CALL); /* CALL */
*p++ = htonl(RPC_VERSION); /* RPC version */
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index 4e81f276692..a398575f94b 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -26,7 +26,7 @@
#define PMAP_GETPORT 3
static struct rpc_procinfo pmap_procedures[];
-static struct rpc_clnt * pmap_create(char *, struct sockaddr_in *, int);
+static struct rpc_clnt * pmap_create(char *, struct sockaddr_in *, int, int);
static void pmap_getport_done(struct rpc_task *);
static struct rpc_program pmap_program;
static DEFINE_SPINLOCK(pmap_lock);
@@ -65,7 +65,7 @@ rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt)
map->pm_binding = 1;
spin_unlock(&pmap_lock);
- pmap_clnt = pmap_create(clnt->cl_server, sap, map->pm_prot);
+ pmap_clnt = pmap_create(clnt->cl_server, sap, map->pm_prot, 0);
if (IS_ERR(pmap_clnt)) {
task->tk_status = PTR_ERR(pmap_clnt);
goto bailout;
@@ -112,7 +112,7 @@ rpc_getport_external(struct sockaddr_in *sin, __u32 prog, __u32 vers, int prot)
NIPQUAD(sin->sin_addr.s_addr), prog, vers, prot);
sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(sin->sin_addr.s_addr));
- pmap_clnt = pmap_create(hostname, sin, prot);
+ pmap_clnt = pmap_create(hostname, sin, prot, 0);
if (IS_ERR(pmap_clnt))
return PTR_ERR(pmap_clnt);
@@ -171,7 +171,7 @@ rpc_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
- pmap_clnt = pmap_create("localhost", &sin, IPPROTO_UDP);
+ pmap_clnt = pmap_create("localhost", &sin, IPPROTO_UDP, 1);
if (IS_ERR(pmap_clnt)) {
error = PTR_ERR(pmap_clnt);
dprintk("RPC: couldn't create pmap client. Error = %d\n", error);
@@ -198,7 +198,7 @@ rpc_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
}
static struct rpc_clnt *
-pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto)
+pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto, int privileged)
{
struct rpc_xprt *xprt;
struct rpc_clnt *clnt;
@@ -208,6 +208,8 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto)
if (IS_ERR(xprt))
return (struct rpc_clnt *)xprt;
xprt->addr.sin_port = htons(RPC_PMAP_PORT);
+ if (!privileged)
+ xprt->resvport = 0;
/* printk("pmap: create clnt\n"); */
clnt = rpc_new_client(xprt, hostname,
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 554f224c044..4f188d0a5d1 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -3,7 +3,7 @@
*
* Userland/kernel interface for rpcauth_gss.
* Code shamelessly plagiarized from fs/nfsd/nfsctl.c
- * and fs/driverfs/inode.c
+ * and fs/sysfs/inode.c
*
* Copyright (c) 2002, Trond Myklebust <trond.myklebust@fys.uio.no>
*
@@ -28,13 +28,13 @@
#include <linux/workqueue.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
-static struct vfsmount *rpc_mount;
+static struct vfsmount *rpc_mount __read_mostly;
static int rpc_mount_count;
static struct file_system_type rpc_pipe_fs_type;
-static kmem_cache_t *rpc_inode_cachep;
+static kmem_cache_t *rpc_inode_cachep __read_mostly;
#define RPC_UPCALL_TIMEOUT (30*HZ)
@@ -76,25 +76,35 @@ int
rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg)
{
struct rpc_inode *rpci = RPC_I(inode);
- int res = 0;
+ int res = -EPIPE;
down(&inode->i_sem);
+ if (rpci->ops == NULL)
+ goto out;
if (rpci->nreaders) {
list_add_tail(&msg->list, &rpci->pipe);
rpci->pipelen += msg->len;
+ res = 0;
} else if (rpci->flags & RPC_PIPE_WAIT_FOR_OPEN) {
if (list_empty(&rpci->pipe))
schedule_delayed_work(&rpci->queue_timeout,
RPC_UPCALL_TIMEOUT);
list_add_tail(&msg->list, &rpci->pipe);
rpci->pipelen += msg->len;
- } else
- res = -EPIPE;
+ res = 0;
+ }
+out:
up(&inode->i_sem);
wake_up(&rpci->waitq);
return res;
}
+static inline void
+rpc_inode_setowner(struct inode *inode, void *private)
+{
+ RPC_I(inode)->private = private;
+}
+
static void
rpc_close_pipes(struct inode *inode)
{
@@ -111,15 +121,10 @@ rpc_close_pipes(struct inode *inode)
rpci->ops->release_pipe(inode);
rpci->ops = NULL;
}
+ rpc_inode_setowner(inode, NULL);
up(&inode->i_sem);
}
-static inline void
-rpc_inode_setowner(struct inode *inode, void *private)
-{
- RPC_I(inode)->private = private;
-}
-
static struct inode *
rpc_alloc_inode(struct super_block *sb)
{
@@ -501,7 +506,6 @@ repeat:
dentry = dvec[--n];
if (dentry->d_inode) {
rpc_close_pipes(dentry->d_inode);
- rpc_inode_setowner(dentry->d_inode, NULL);
simple_unlink(dir, dentry);
}
dput(dentry);
@@ -576,10 +580,8 @@ __rpc_rmdir(struct inode *dir, struct dentry *dentry)
int error;
shrink_dcache_parent(dentry);
- if (dentry->d_inode) {
+ if (dentry->d_inode)
rpc_close_pipes(dentry->d_inode);
- rpc_inode_setowner(dentry->d_inode, NULL);
- }
if ((error = simple_rmdir(dir, dentry)) != 0)
return error;
if (!error) {
@@ -732,7 +734,6 @@ rpc_unlink(char *path)
d_drop(dentry);
if (dentry->d_inode) {
rpc_close_pipes(dentry->d_inode);
- rpc_inode_setowner(dentry->d_inode, NULL);
error = simple_unlink(dir, dentry);
}
dput(dentry);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 2d9eb7fbd52..54e60a65750 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -34,10 +34,10 @@ static int rpc_task_id;
#define RPC_BUFFER_MAXSIZE (2048)
#define RPC_BUFFER_POOLSIZE (8)
#define RPC_TASK_POOLSIZE (8)
-static kmem_cache_t *rpc_task_slabp;
-static kmem_cache_t *rpc_buffer_slabp;
-static mempool_t *rpc_task_mempool;
-static mempool_t *rpc_buffer_mempool;
+static kmem_cache_t *rpc_task_slabp __read_mostly;
+static kmem_cache_t *rpc_buffer_slabp __read_mostly;
+static mempool_t *rpc_task_mempool __read_mostly;
+static mempool_t *rpc_buffer_mempool __read_mostly;
static void __rpc_default_timer(struct rpc_task *task);
static void rpciod_killall(void);
@@ -719,7 +719,7 @@ static void rpc_async_schedule(void *arg)
void *
rpc_malloc(struct rpc_task *task, size_t size)
{
- int gfp;
+ gfp_t gfp;
if (task->tk_flags & RPC_TASK_SWAPPER)
gfp = GFP_ATOMIC;
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
new file mode 100644
index 00000000000..8f97e90f36c
--- /dev/null
+++ b/net/sunrpc/socklib.c
@@ -0,0 +1,175 @@
+/*
+ * linux/net/sunrpc/socklib.c
+ *
+ * Common socket helper routines for RPC client and server
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/pagemap.h>
+#include <linux/udp.h>
+#include <linux/sunrpc/xdr.h>
+
+
+/**
+ * skb_read_bits - copy some data bits from skb to internal buffer
+ * @desc: sk_buff copy helper
+ * @to: copy destination
+ * @len: number of bytes to copy
+ *
+ * Possibly called several times to iterate over an sk_buff and copy
+ * data out of it.
+ */
+static size_t skb_read_bits(skb_reader_t *desc, void *to, size_t len)
+{
+ if (len > desc->count)
+ len = desc->count;
+ if (skb_copy_bits(desc->skb, desc->offset, to, len))
+ return 0;
+ desc->count -= len;
+ desc->offset += len;
+ return len;
+}
+
+/**
+ * skb_read_and_csum_bits - copy and checksum from skb to buffer
+ * @desc: sk_buff copy helper
+ * @to: copy destination
+ * @len: number of bytes to copy
+ *
+ * Same as skb_read_bits, but calculate a checksum at the same time.
+ */
+static size_t skb_read_and_csum_bits(skb_reader_t *desc, void *to, size_t len)
+{
+ unsigned int csum2, pos;
+
+ if (len > desc->count)
+ len = desc->count;
+ pos = desc->offset;
+ csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0);
+ desc->csum = csum_block_add(desc->csum, csum2, pos);
+ desc->count -= len;
+ desc->offset += len;
+ return len;
+}
+
+/**
+ * xdr_partial_copy_from_skb - copy data out of an skb
+ * @xdr: target XDR buffer
+ * @base: starting offset
+ * @desc: sk_buff copy helper
+ * @copy_actor: virtual method for copying data
+ *
+ */
+ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, skb_reader_t *desc, skb_read_actor_t copy_actor)
+{
+ struct page **ppage = xdr->pages;
+ unsigned int len, pglen = xdr->page_len;
+ ssize_t copied = 0;
+ int ret;
+
+ len = xdr->head[0].iov_len;
+ if (base < len) {
+ len -= base;
+ ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len);
+ copied += ret;
+ if (ret != len || !desc->count)
+ goto out;
+ base = 0;
+ } else
+ base -= len;
+
+ if (unlikely(pglen == 0))
+ goto copy_tail;
+ if (unlikely(base >= pglen)) {
+ base -= pglen;
+ goto copy_tail;
+ }
+ if (base || xdr->page_base) {
+ pglen -= base;
+ base += xdr->page_base;
+ ppage += base >> PAGE_CACHE_SHIFT;
+ base &= ~PAGE_CACHE_MASK;
+ }
+ do {
+ char *kaddr;
+
+ /* ACL likes to be lazy in allocating pages - ACLs
+ * are small by default but can get huge. */
+ if (unlikely(*ppage == NULL)) {
+ *ppage = alloc_page(GFP_ATOMIC);
+ if (unlikely(*ppage == NULL)) {
+ if (copied == 0)
+ copied = -ENOMEM;
+ goto out;
+ }
+ }
+
+ len = PAGE_CACHE_SIZE;
+ kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA);
+ if (base) {
+ len -= base;
+ if (pglen < len)
+ len = pglen;
+ ret = copy_actor(desc, kaddr + base, len);
+ base = 0;
+ } else {
+ if (pglen < len)
+ len = pglen;
+ ret = copy_actor(desc, kaddr, len);
+ }
+ flush_dcache_page(*ppage);
+ kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA);
+ copied += ret;
+ if (ret != len || !desc->count)
+ goto out;
+ ppage++;
+ } while ((pglen -= len) != 0);
+copy_tail:
+ len = xdr->tail[0].iov_len;
+ if (base < len)
+ copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
+out:
+ return copied;
+}
+
+/**
+ * csum_partial_copy_to_xdr - checksum and copy data
+ * @xdr: target XDR buffer
+ * @skb: source skb
+ *
+ * We have set things up such that we perform the checksum of the UDP
+ * packet in parallel with the copies into the RPC client iovec. -DaveM
+ */
+int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
+{
+ skb_reader_t desc;
+
+ desc.skb = skb;
+ desc.offset = sizeof(struct udphdr);
+ desc.count = skb->len - desc.offset;
+
+ if (skb->ip_summed == CHECKSUM_UNNECESSARY)
+ goto no_checksum;
+
+ desc.csum = csum_partial(skb->data, desc.offset, skb->csum);
+ if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits) < 0)
+ return -1;
+ if (desc.offset != skb->len) {
+ unsigned int csum2;
+ csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0);
+ desc.csum = csum_block_add(desc.csum, csum2, desc.offset);
+ }
+ if (desc.count)
+ return -1;
+ if ((unsigned short)csum_fold(desc.csum))
+ return -1;
+ return 0;
+no_checksum:
+ if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0)
+ return -1;
+ if (desc.count)
+ return -1;
+ return 0;
+}
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 9b67dc19944..4979f226e28 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -35,13 +35,13 @@ static int rpc_proc_show(struct seq_file *seq, void *v) {
int i, j;
seq_printf(seq,
- "net %d %d %d %d\n",
+ "net %u %u %u %u\n",
statp->netcnt,
statp->netudpcnt,
statp->nettcpcnt,
statp->nettcpconn);
seq_printf(seq,
- "rpc %d %d %d\n",
+ "rpc %u %u %u\n",
statp->rpccnt,
statp->rpcretrans,
statp->rpcauthrefresh);
@@ -50,10 +50,10 @@ static int rpc_proc_show(struct seq_file *seq, void *v) {
const struct rpc_version *vers = prog->version[i];
if (!vers)
continue;
- seq_printf(seq, "proc%d %d",
+ seq_printf(seq, "proc%u %u",
vers->number, vers->nrprocs);
for (j = 0; j < vers->nrprocs; j++)
- seq_printf(seq, " %d",
+ seq_printf(seq, " %u",
vers->procs[j].p_count);
seq_putc(seq, '\n');
}
@@ -83,13 +83,13 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
int i, j;
seq_printf(seq,
- "net %d %d %d %d\n",
+ "net %u %u %u %u\n",
statp->netcnt,
statp->netudpcnt,
statp->nettcpcnt,
statp->nettcpconn);
seq_printf(seq,
- "rpc %d %d %d %d %d\n",
+ "rpc %u %u %u %u %u\n",
statp->rpccnt,
statp->rpcbadfmt+statp->rpcbadauth+statp->rpcbadclnt,
statp->rpcbadfmt,
@@ -99,9 +99,9 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
for (i = 0; i < prog->pg_nvers; i++) {
if (!(vers = prog->pg_vers[i]) || !(proc = vers->vs_proc))
continue;
- seq_printf(seq, "proc%d %d", i, vers->vs_nproc);
+ seq_printf(seq, "proc%d %u", i, vers->vs_nproc);
for (j = 0; j < vers->vs_nproc; j++, proc++)
- seq_printf(seq, " %d", proc->pc_count);
+ seq_printf(seq, " %u", proc->pc_count);
seq_putc(seq, '\n');
}
}
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 62a07349527..a03d4b600c9 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -10,7 +10,6 @@
#include <linux/module.h>
#include <linux/types.h>
-#include <linux/socket.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/unistd.h>
@@ -64,8 +63,6 @@ EXPORT_SYMBOL(rpc_mkpipe);
/* Client transport */
EXPORT_SYMBOL(xprt_create_proto);
EXPORT_SYMBOL(xprt_set_timeout);
-EXPORT_SYMBOL(xprt_udp_slot_table_entries);
-EXPORT_SYMBOL(xprt_tcp_slot_table_entries);
/* Client credential cache */
EXPORT_SYMBOL(rpcauth_register);
@@ -176,8 +173,10 @@ cleanup_sunrpc(void)
{
unregister_rpc_pipefs();
rpc_destroy_mempool();
- cache_unregister(&auth_domain_cache);
- cache_unregister(&ip_map_cache);
+ if (cache_unregister(&auth_domain_cache))
+ printk(KERN_ERR "sunrpc: failed to unregister auth_domain cache\n");
+ if (cache_unregister(&ip_map_cache))
+ printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n");
#ifdef RPC_DEBUG
rpc_unregister_sysctl();
#endif
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index bde8147ef2d..dda4f0c6351 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -143,6 +143,7 @@ static void auth_domain_drop(struct cache_head *item, struct cache_detail *cd)
struct cache_detail auth_domain_cache = {
+ .owner = THIS_MODULE,
.hash_size = DN_HASHMAX,
.hash_table = auth_domain_table,
.name = "auth.domain",
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index d6baf6fdf8a..cac2e774dd8 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -242,6 +242,7 @@ static int ip_map_show(struct seq_file *m,
struct cache_detail ip_map_cache = {
+ .owner = THIS_MODULE,
.hash_size = IP_HASHMAX,
.hash_table = ip_table,
.name = "auth.unix.ip",
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 56db8f13e6c..f16e7cdd615 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -34,7 +34,7 @@
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
@@ -512,15 +512,14 @@ svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv)
static void
svc_udp_data_ready(struct sock *sk, int count)
{
- struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
+ struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- if (!svsk)
- goto out;
- dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
- svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags));
- set_bit(SK_DATA, &svsk->sk_flags);
- svc_sock_enqueue(svsk);
- out:
+ if (svsk) {
+ dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
+ svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags));
+ set_bit(SK_DATA, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+ }
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
wake_up_interruptible(sk->sk_sleep);
}
@@ -540,7 +539,7 @@ svc_write_space(struct sock *sk)
}
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) {
- printk(KERN_WARNING "RPC svc_write_space: some sleeping on %p\n",
+ dprintk("RPC svc_write_space: someone sleeping on %p\n",
svsk);
wake_up_interruptible(sk->sk_sleep);
}
@@ -549,9 +548,6 @@ svc_write_space(struct sock *sk)
/*
* Receive a datagram from a UDP socket.
*/
-extern int
-csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb);
-
static int
svc_udp_recvfrom(struct svc_rqst *rqstp)
{
@@ -584,13 +580,16 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
/* possibly an icmp error */
dprintk("svc: recvfrom returned error %d\n", -err);
}
- if (skb->stamp.tv_sec == 0) {
- skb->stamp.tv_sec = xtime.tv_sec;
- skb->stamp.tv_usec = xtime.tv_nsec * 1000;
+ if (skb->tstamp.off_sec == 0) {
+ struct timeval tv;
+
+ tv.tv_sec = xtime.tv_sec;
+ tv.tv_usec = xtime.tv_nsec / NSEC_PER_USEC;
+ skb_set_timestamp(skb, &tv);
/* Don't enable netstamp, sunrpc doesn't
need that much accuracy */
}
- svsk->sk_sk->sk_stamp = skb->stamp;
+ skb_get_timestamp(skb, &svsk->sk_sk->sk_stamp);
set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */
/*
@@ -689,31 +688,29 @@ svc_udp_init(struct svc_sock *svsk)
static void
svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
{
- struct svc_sock *svsk;
+ struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
dprintk("svc: socket %p TCP (listen) state change %d\n",
- sk, sk->sk_state);
+ sk, sk->sk_state);
- if (sk->sk_state != TCP_LISTEN) {
- /*
- * This callback may called twice when a new connection
- * is established as a child socket inherits everything
- * from a parent LISTEN socket.
- * 1) data_ready method of the parent socket will be called
- * when one of child sockets become ESTABLISHED.
- * 2) data_ready method of the child socket may be called
- * when it receives data before the socket is accepted.
- * In case of 2, we should ignore it silently.
- */
- goto out;
- }
- if (!(svsk = (struct svc_sock *) sk->sk_user_data)) {
- printk("svc: socket %p: no user data\n", sk);
- goto out;
+ /*
+ * This callback may be called twice when a new connection
+ * is established as a child socket inherits everything
+ * from a parent LISTEN socket.
+ * 1) data_ready method of the parent socket will be called
+ * when one of the child sockets becomes ESTABLISHED.
+ * 2) data_ready method of the child socket may be called
+ * when it receives data before the socket is accepted.
+ * In case of 2, we should ignore it silently.
+ */
+ if (sk->sk_state == TCP_LISTEN) {
+ if (svsk) {
+ set_bit(SK_CONN, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+ } else
+ printk("svc: socket %p: no user data\n", sk);
}
- set_bit(SK_CONN, &svsk->sk_flags);
- svc_sock_enqueue(svsk);
- out:
+
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
wake_up_interruptible_all(sk->sk_sleep);
}
@@ -724,18 +721,17 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
static void
svc_tcp_state_change(struct sock *sk)
{
- struct svc_sock *svsk;
+ struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
- sk, sk->sk_state, sk->sk_user_data);
+ sk, sk->sk_state, sk->sk_user_data);
- if (!(svsk = (struct svc_sock *) sk->sk_user_data)) {
+ if (!svsk)
printk("svc: socket %p: no user data\n", sk);
- goto out;
+ else {
+ set_bit(SK_CLOSE, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
}
- set_bit(SK_CLOSE, &svsk->sk_flags);
- svc_sock_enqueue(svsk);
- out:
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
wake_up_interruptible_all(sk->sk_sleep);
}
@@ -743,15 +739,14 @@ svc_tcp_state_change(struct sock *sk)
static void
svc_tcp_data_ready(struct sock *sk, int count)
{
- struct svc_sock * svsk;
+ struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
dprintk("svc: socket %p TCP data ready (svsk %p)\n",
- sk, sk->sk_user_data);
- if (!(svsk = (struct svc_sock *)(sk->sk_user_data)))
- goto out;
- set_bit(SK_DATA, &svsk->sk_flags);
- svc_sock_enqueue(svsk);
- out:
+ sk, sk->sk_user_data);
+ if (svsk) {
+ set_bit(SK_DATA, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+ }
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
wake_up_interruptible(sk->sk_sleep);
}
@@ -1167,8 +1162,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
while (rqstp->rq_arghi < pages) {
struct page *p = alloc_page(GFP_KERNEL);
if (!p) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(HZ/2);
+ schedule_timeout_uninterruptible(msecs_to_jiffies(500));
continue;
}
rqstp->rq_argpages[rqstp->rq_arghi++] = p;
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index 1b9616a12e2..1065904841f 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -119,8 +119,11 @@ done:
return 0;
}
+
static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE;
static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE;
+static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT;
+static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT;
static ctl_table debug_table[] = {
{
@@ -177,6 +180,28 @@ static ctl_table debug_table[] = {
.extra1 = &min_slot_table_size,
.extra2 = &max_slot_table_size
},
+ {
+ .ctl_name = CTL_MIN_RESVPORT,
+ .procname = "min_resvport",
+ .data = &xprt_min_resvport,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &xprt_min_resvport_limit,
+ .extra2 = &xprt_max_resvport_limit
+ },
+ {
+ .ctl_name = CTL_MAX_RESVPORT,
+ .procname = "max_resvport",
+ .data = &xprt_max_resvport,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &xprt_min_resvport_limit,
+ .extra2 = &xprt_max_resvport_limit
+ },
{ .ctl_name = 0 }
};
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 8a4d9c106af..32df43372ee 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -6,15 +6,12 @@
* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
*/
+#include <linux/module.h>
#include <linux/types.h>
-#include <linux/socket.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/pagemap.h>
#include <linux/errno.h>
-#include <linux/in.h>
-#include <linux/net.h>
-#include <net/sock.h>
#include <linux/sunrpc/xdr.h>
#include <linux/sunrpc/msg_prot.h>
@@ -176,178 +173,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
xdr->buflen += len;
}
-ssize_t
-xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
- skb_reader_t *desc,
- skb_read_actor_t copy_actor)
-{
- struct page **ppage = xdr->pages;
- unsigned int len, pglen = xdr->page_len;
- ssize_t copied = 0;
- int ret;
-
- len = xdr->head[0].iov_len;
- if (base < len) {
- len -= base;
- ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len);
- copied += ret;
- if (ret != len || !desc->count)
- goto out;
- base = 0;
- } else
- base -= len;
-
- if (pglen == 0)
- goto copy_tail;
- if (base >= pglen) {
- base -= pglen;
- goto copy_tail;
- }
- if (base || xdr->page_base) {
- pglen -= base;
- base += xdr->page_base;
- ppage += base >> PAGE_CACHE_SHIFT;
- base &= ~PAGE_CACHE_MASK;
- }
- do {
- char *kaddr;
-
- /* ACL likes to be lazy in allocating pages - ACLs
- * are small by default but can get huge. */
- if (unlikely(*ppage == NULL)) {
- *ppage = alloc_page(GFP_ATOMIC);
- if (unlikely(*ppage == NULL)) {
- if (copied == 0)
- copied = -ENOMEM;
- goto out;
- }
- }
-
- len = PAGE_CACHE_SIZE;
- kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA);
- if (base) {
- len -= base;
- if (pglen < len)
- len = pglen;
- ret = copy_actor(desc, kaddr + base, len);
- base = 0;
- } else {
- if (pglen < len)
- len = pglen;
- ret = copy_actor(desc, kaddr, len);
- }
- flush_dcache_page(*ppage);
- kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA);
- copied += ret;
- if (ret != len || !desc->count)
- goto out;
- ppage++;
- } while ((pglen -= len) != 0);
-copy_tail:
- len = xdr->tail[0].iov_len;
- if (base < len)
- copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
-out:
- return copied;
-}
-
-
-int
-xdr_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
- struct xdr_buf *xdr, unsigned int base, int msgflags)
-{
- struct page **ppage = xdr->pages;
- unsigned int len, pglen = xdr->page_len;
- int err, ret = 0;
- ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int);
-
- len = xdr->head[0].iov_len;
- if (base < len || (addr != NULL && base == 0)) {
- struct kvec iov = {
- .iov_base = xdr->head[0].iov_base + base,
- .iov_len = len - base,
- };
- struct msghdr msg = {
- .msg_name = addr,
- .msg_namelen = addrlen,
- .msg_flags = msgflags,
- };
- if (xdr->len > len)
- msg.msg_flags |= MSG_MORE;
-
- if (iov.iov_len != 0)
- err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len);
- else
- err = kernel_sendmsg(sock, &msg, NULL, 0, 0);
- if (ret == 0)
- ret = err;
- else if (err > 0)
- ret += err;
- if (err != iov.iov_len)
- goto out;
- base = 0;
- } else
- base -= len;
-
- if (pglen == 0)
- goto copy_tail;
- if (base >= pglen) {
- base -= pglen;
- goto copy_tail;
- }
- if (base || xdr->page_base) {
- pglen -= base;
- base += xdr->page_base;
- ppage += base >> PAGE_CACHE_SHIFT;
- base &= ~PAGE_CACHE_MASK;
- }
-
- sendpage = sock->ops->sendpage ? : sock_no_sendpage;
- do {
- int flags = msgflags;
-
- len = PAGE_CACHE_SIZE;
- if (base)
- len -= base;
- if (pglen < len)
- len = pglen;
-
- if (pglen != len || xdr->tail[0].iov_len != 0)
- flags |= MSG_MORE;
-
- /* Hmm... We might be dealing with highmem pages */
- if (PageHighMem(*ppage))
- sendpage = sock_no_sendpage;
- err = sendpage(sock, *ppage, base, len, flags);
- if (ret == 0)
- ret = err;
- else if (err > 0)
- ret += err;
- if (err != len)
- goto out;
- base = 0;
- ppage++;
- } while ((pglen -= len) != 0);
-copy_tail:
- len = xdr->tail[0].iov_len;
- if (base < len) {
- struct kvec iov = {
- .iov_base = xdr->tail[0].iov_base + base,
- .iov_len = len - base,
- };
- struct msghdr msg = {
- .msg_flags = msgflags,
- };
- err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len);
- if (ret == 0)
- ret = err;
- else if (err > 0)
- ret += err;
- }
-out:
- return ret;
-}
-
/*
* Helper routines for doing 'memmove' like operations on a struct xdr_buf
@@ -993,6 +818,7 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
return -EINVAL;
} else {
if (xdr_decode_word(buf, base, &desc->array_len) != 0 ||
+ desc->array_len > desc->array_maxlen ||
(unsigned long) base + 4 + desc->array_len *
desc->elem_size > buf->len)
return -EINVAL;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 3c654e06b08..6dda3860351 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -10,12 +10,12 @@
* one is available. Otherwise, it sleeps on the backlog queue
* (xprt_reserve).
* - Next, the caller puts together the RPC message, stuffs it into
- * the request struct, and calls xprt_call().
- * - xprt_call transmits the message and installs the caller on the
- * socket's wait list. At the same time, it installs a timer that
+ * the request struct, and calls xprt_transmit().
+ * - xprt_transmit sends the message and installs the caller on the
+ * transport's wait list. At the same time, it installs a timer that
* is run after the packet's timeout has expired.
* - When a packet arrives, the data_ready handler walks the list of
- * pending requests for that socket. If a matching XID is found, the
+ * pending requests for that transport. If a matching XID is found, the
* caller is woken up, and the timer removed.
* - When no reply arrives within the timeout interval, the timer is
* fired by the kernel and runs xprt_timer(). It either adjusts the
@@ -33,36 +33,17 @@
*
* Copyright (C) 1995-1997, Olaf Kirch <okir@monad.swb.de>
*
- * TCP callback races fixes (C) 1998 Red Hat Software <alan@redhat.com>
- * TCP send fixes (C) 1998 Red Hat Software <alan@redhat.com>
- * TCP NFS related read + write fixes
- * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie>
- *
- * Rewrite of larges part of the code in order to stabilize TCP stuff.
- * Fix behaviour when socket buffer is full.
- * (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no>
+ * Transport switch API copyright (C) 2005, Chuck Lever <cel@netapp.com>
*/
+#include <linux/module.h>
+
#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/capability.h>
-#include <linux/sched.h>
-#include <linux/errno.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/net.h>
-#include <linux/mm.h>
-#include <linux/udp.h>
-#include <linux/tcp.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/file.h>
+#include <linux/interrupt.h>
#include <linux/workqueue.h>
#include <linux/random.h>
-#include <net/sock.h>
-#include <net/checksum.h>
-#include <net/udp.h>
-#include <net/tcp.h>
+#include <linux/sunrpc/clnt.h>
/*
* Local variables
@@ -73,81 +54,90 @@
# define RPCDBG_FACILITY RPCDBG_XPRT
#endif
-#define XPRT_MAX_BACKOFF (8)
-#define XPRT_IDLE_TIMEOUT (5*60*HZ)
-#define XPRT_MAX_RESVPORT (800)
-
/*
* Local functions
*/
static void xprt_request_init(struct rpc_task *, struct rpc_xprt *);
static inline void do_xprt_reserve(struct rpc_task *);
-static void xprt_disconnect(struct rpc_xprt *);
static void xprt_connect_status(struct rpc_task *task);
-static struct rpc_xprt * xprt_setup(int proto, struct sockaddr_in *ap,
- struct rpc_timeout *to);
-static struct socket *xprt_create_socket(struct rpc_xprt *, int, int);
-static void xprt_bind_socket(struct rpc_xprt *, struct socket *);
static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
-static int xprt_clear_backlog(struct rpc_xprt *xprt);
-
-#ifdef RPC_DEBUG_DATA
/*
- * Print the buffer contents (first 128 bytes only--just enough for
- * diropres return).
+ * The transport code maintains an estimate on the maximum number of out-
+ * standing RPC requests, using a smoothed version of the congestion
+ * avoidance implemented in 44BSD. This is basically the Van Jacobson
+ * congestion algorithm: If a retransmit occurs, the congestion window is
+ * halved; otherwise, it is incremented by 1/cwnd when
+ *
+ * - a reply is received and
+ * - a full number of requests are outstanding and
+ * - the congestion window hasn't been updated recently.
*/
-static void
-xprt_pktdump(char *msg, u32 *packet, unsigned int count)
-{
- u8 *buf = (u8 *) packet;
- int j;
-
- dprintk("RPC: %s\n", msg);
- for (j = 0; j < count && j < 128; j += 4) {
- if (!(j & 31)) {
- if (j)
- dprintk("\n");
- dprintk("0x%04x ", j);
- }
- dprintk("%02x%02x%02x%02x ",
- buf[j], buf[j+1], buf[j+2], buf[j+3]);
- }
- dprintk("\n");
-}
-#else
-static inline void
-xprt_pktdump(char *msg, u32 *packet, unsigned int count)
-{
- /* NOP */
-}
-#endif
+#define RPC_CWNDSHIFT (8U)
+#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT)
+#define RPC_INITCWND RPC_CWNDSCALE
+#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT)
-/*
- * Look up RPC transport given an INET socket
+#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd)
+
+/**
+ * xprt_reserve_xprt - serialize write access to transports
+ * @task: task that is requesting access to the transport
+ *
+ * This prevents mixing the payload of separate requests, and prevents
+ * transport connects from colliding with writes. No congestion control
+ * is provided.
*/
-static inline struct rpc_xprt *
-xprt_from_sock(struct sock *sk)
+int xprt_reserve_xprt(struct rpc_task *task)
{
- return (struct rpc_xprt *) sk->sk_user_data;
+ struct rpc_xprt *xprt = task->tk_xprt;
+ struct rpc_rqst *req = task->tk_rqstp;
+
+ if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
+ if (task == xprt->snd_task)
+ return 1;
+ if (task == NULL)
+ return 0;
+ goto out_sleep;
+ }
+ xprt->snd_task = task;
+ if (req) {
+ req->rq_bytes_sent = 0;
+ req->rq_ntrans++;
+ }
+ return 1;
+
+out_sleep:
+ dprintk("RPC: %4d failed to lock transport %p\n",
+ task->tk_pid, xprt);
+ task->tk_timeout = 0;
+ task->tk_status = -EAGAIN;
+ if (req && req->rq_ntrans)
+ rpc_sleep_on(&xprt->resend, task, NULL, NULL);
+ else
+ rpc_sleep_on(&xprt->sending, task, NULL, NULL);
+ return 0;
}
/*
- * Serialize write access to sockets, in order to prevent different
- * requests from interfering with each other.
- * Also prevents TCP socket connects from colliding with writes.
+ * xprt_reserve_xprt_cong - serialize write access to transports
+ * @task: task that is requesting access to the transport
+ *
+ * Same as xprt_reserve_xprt, but Van Jacobson congestion control is
+ * integrated into the decision of whether a request is allowed to be
+ * woken up and given access to the transport.
*/
-static int
-__xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
+int xprt_reserve_xprt_cong(struct rpc_task *task)
{
+ struct rpc_xprt *xprt = task->tk_xprt;
struct rpc_rqst *req = task->tk_rqstp;
- if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) {
+ if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
if (task == xprt->snd_task)
return 1;
goto out_sleep;
}
- if (xprt->nocong || __xprt_get_cong(xprt, task)) {
+ if (__xprt_get_cong(xprt, task)) {
xprt->snd_task = task;
if (req) {
req->rq_bytes_sent = 0;
@@ -156,10 +146,10 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
return 1;
}
smp_mb__before_clear_bit();
- clear_bit(XPRT_LOCKED, &xprt->sockstate);
+ clear_bit(XPRT_LOCKED, &xprt->state);
smp_mb__after_clear_bit();
out_sleep:
- dprintk("RPC: %4d failed to lock socket %p\n", task->tk_pid, xprt);
+ dprintk("RPC: %4d failed to lock transport %p\n", task->tk_pid, xprt);
task->tk_timeout = 0;
task->tk_status = -EAGAIN;
if (req && req->rq_ntrans)
@@ -169,26 +159,52 @@ out_sleep:
return 0;
}
-static inline int
-xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
+static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
{
int retval;
- spin_lock_bh(&xprt->sock_lock);
- retval = __xprt_lock_write(xprt, task);
- spin_unlock_bh(&xprt->sock_lock);
+ spin_lock_bh(&xprt->transport_lock);
+ retval = xprt->ops->reserve_xprt(task);
+ spin_unlock_bh(&xprt->transport_lock);
return retval;
}
+static void __xprt_lock_write_next(struct rpc_xprt *xprt)
+{
+ struct rpc_task *task;
+ struct rpc_rqst *req;
-static void
-__xprt_lock_write_next(struct rpc_xprt *xprt)
+ if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
+ return;
+
+ task = rpc_wake_up_next(&xprt->resend);
+ if (!task) {
+ task = rpc_wake_up_next(&xprt->sending);
+ if (!task)
+ goto out_unlock;
+ }
+
+ req = task->tk_rqstp;
+ xprt->snd_task = task;
+ if (req) {
+ req->rq_bytes_sent = 0;
+ req->rq_ntrans++;
+ }
+ return;
+
+out_unlock:
+ smp_mb__before_clear_bit();
+ clear_bit(XPRT_LOCKED, &xprt->state);
+ smp_mb__after_clear_bit();
+}
+
+static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt)
{
struct rpc_task *task;
- if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate))
+ if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
return;
- if (!xprt->nocong && RPCXPRT_CONGESTED(xprt))
+ if (RPCXPRT_CONGESTED(xprt))
goto out_unlock;
task = rpc_wake_up_next(&xprt->resend);
if (!task) {
@@ -196,7 +212,7 @@ __xprt_lock_write_next(struct rpc_xprt *xprt)
if (!task)
goto out_unlock;
}
- if (xprt->nocong || __xprt_get_cong(xprt, task)) {
+ if (__xprt_get_cong(xprt, task)) {
struct rpc_rqst *req = task->tk_rqstp;
xprt->snd_task = task;
if (req) {
@@ -207,87 +223,52 @@ __xprt_lock_write_next(struct rpc_xprt *xprt)
}
out_unlock:
smp_mb__before_clear_bit();
- clear_bit(XPRT_LOCKED, &xprt->sockstate);
+ clear_bit(XPRT_LOCKED, &xprt->state);
smp_mb__after_clear_bit();
}
-/*
- * Releases the socket for use by other requests.
+/**
+ * xprt_release_xprt - allow other requests to use a transport
+ * @xprt: transport with other tasks potentially waiting
+ * @task: task that is releasing access to the transport
+ *
+ * Note that "task" can be NULL. No congestion control is provided.
*/
-static void
-__xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
+void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
{
if (xprt->snd_task == task) {
xprt->snd_task = NULL;
smp_mb__before_clear_bit();
- clear_bit(XPRT_LOCKED, &xprt->sockstate);
+ clear_bit(XPRT_LOCKED, &xprt->state);
smp_mb__after_clear_bit();
__xprt_lock_write_next(xprt);
}
}
-static inline void
-xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
-{
- spin_lock_bh(&xprt->sock_lock);
- __xprt_release_write(xprt, task);
- spin_unlock_bh(&xprt->sock_lock);
-}
-
-/*
- * Write data to socket.
+/**
+ * xprt_release_xprt_cong - allow other requests to use a transport
+ * @xprt: transport with other tasks potentially waiting
+ * @task: task that is releasing access to the transport
+ *
+ * Note that "task" can be NULL. Another task is awoken to use the
+ * transport if the transport's congestion window allows it.
*/
-static inline int
-xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req)
+void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
{
- struct socket *sock = xprt->sock;
- struct xdr_buf *xdr = &req->rq_snd_buf;
- struct sockaddr *addr = NULL;
- int addrlen = 0;
- unsigned int skip;
- int result;
-
- if (!sock)
- return -ENOTCONN;
-
- xprt_pktdump("packet data:",
- req->rq_svec->iov_base,
- req->rq_svec->iov_len);
-
- /* For UDP, we need to provide an address */
- if (!xprt->stream) {
- addr = (struct sockaddr *) &xprt->addr;
- addrlen = sizeof(xprt->addr);
+ if (xprt->snd_task == task) {
+ xprt->snd_task = NULL;
+ smp_mb__before_clear_bit();
+ clear_bit(XPRT_LOCKED, &xprt->state);
+ smp_mb__after_clear_bit();
+ __xprt_lock_write_next_cong(xprt);
}
- /* Dont repeat bytes */
- skip = req->rq_bytes_sent;
-
- clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
- result = xdr_sendpages(sock, addr, addrlen, xdr, skip, MSG_DONTWAIT);
-
- dprintk("RPC: xprt_sendmsg(%d) = %d\n", xdr->len - skip, result);
-
- if (result >= 0)
- return result;
+}
- switch (result) {
- case -ECONNREFUSED:
- /* When the server has died, an ICMP port unreachable message
- * prompts ECONNREFUSED.
- */
- case -EAGAIN:
- break;
- case -ECONNRESET:
- case -ENOTCONN:
- case -EPIPE:
- /* connection broken */
- if (xprt->stream)
- result = -ENOTCONN;
- break;
- default:
- printk(KERN_NOTICE "RPC: sendmsg returned error %d\n", -result);
- }
- return result;
+static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ spin_lock_bh(&xprt->transport_lock);
+ xprt->ops->release_xprt(xprt, task);
+ spin_unlock_bh(&xprt->transport_lock);
}
/*
@@ -321,26 +302,40 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
return;
req->rq_cong = 0;
xprt->cong -= RPC_CWNDSCALE;
- __xprt_lock_write_next(xprt);
+ __xprt_lock_write_next_cong(xprt);
}
-/*
- * Adjust RPC congestion window
+/**
+ * xprt_release_rqst_cong - housekeeping when request is complete
+ * @task: RPC request that recently completed
+ *
+ * Useful for transports that require congestion control.
+ */
+void xprt_release_rqst_cong(struct rpc_task *task)
+{
+ __xprt_put_cong(task->tk_xprt, task->tk_rqstp);
+}
+
+/**
+ * xprt_adjust_cwnd - adjust transport congestion window
+ * @task: recently completed RPC request used to adjust window
+ * @result: result code of completed RPC request
+ *
* We use a time-smoothed congestion estimator to avoid heavy oscillation.
*/
-static void
-xprt_adjust_cwnd(struct rpc_xprt *xprt, int result)
+void xprt_adjust_cwnd(struct rpc_task *task, int result)
{
- unsigned long cwnd;
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = task->tk_xprt;
+ unsigned long cwnd = xprt->cwnd;
- cwnd = xprt->cwnd;
if (result >= 0 && cwnd <= xprt->cong) {
/* The (cwnd >> 1) term makes sure
* the result gets rounded properly. */
cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd;
if (cwnd > RPC_MAXCWND(xprt))
cwnd = RPC_MAXCWND(xprt);
- __xprt_lock_write_next(xprt);
+ __xprt_lock_write_next_cong(xprt);
} else if (result == -ETIMEDOUT) {
cwnd >>= 1;
if (cwnd < RPC_CWNDSCALE)
@@ -349,11 +344,89 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt, int result)
dprintk("RPC: cong %ld, cwnd was %ld, now %ld\n",
xprt->cong, xprt->cwnd, cwnd);
xprt->cwnd = cwnd;
+ __xprt_put_cong(xprt, req);
+}
+
+/**
+ * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue
+ * @xprt: transport with waiting tasks
+ * @status: result code to plant in each task before waking it
+ *
+ */
+void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status)
+{
+ if (status < 0)
+ rpc_wake_up_status(&xprt->pending, status);
+ else
+ rpc_wake_up(&xprt->pending);
+}
+
+/**
+ * xprt_wait_for_buffer_space - wait for transport output buffer to clear
+ * @task: task to be put to sleep
+ *
+ */
+void xprt_wait_for_buffer_space(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ task->tk_timeout = req->rq_timeout;
+ rpc_sleep_on(&xprt->pending, task, NULL, NULL);
+}
+
+/**
+ * xprt_write_space - wake the task waiting for transport output buffer space
+ * @xprt: transport with waiting tasks
+ *
+ * Can be called in a soft IRQ context, so xprt_write_space never sleeps.
+ */
+void xprt_write_space(struct rpc_xprt *xprt)
+{
+ if (unlikely(xprt->shutdown))
+ return;
+
+ spin_lock_bh(&xprt->transport_lock);
+ if (xprt->snd_task) {
+ dprintk("RPC: write space: waking waiting task on xprt %p\n",
+ xprt);
+ rpc_wake_up_task(xprt->snd_task);
+ }
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
+/**
+ * xprt_set_retrans_timeout_def - set a request's retransmit timeout
+ * @task: task whose timeout is to be set
+ *
+ * Set a request's retransmit timeout based on the transport's
+ * default timeout parameters. Used by transports that don't adjust
+ * the retransmit timeout based on round-trip time estimation.
+ */
+void xprt_set_retrans_timeout_def(struct rpc_task *task)
+{
+ task->tk_timeout = task->tk_rqstp->rq_timeout;
}
/*
- * Reset the major timeout value
+ * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout
+ * @task: task whose timeout is to be set
+ *
+ * Set a request's retransmit timeout using the RTT estimator.
*/
+void xprt_set_retrans_timeout_rtt(struct rpc_task *task)
+{
+ int timer = task->tk_msg.rpc_proc->p_timer;
+ struct rpc_rtt *rtt = task->tk_client->cl_rtt;
+ struct rpc_rqst *req = task->tk_rqstp;
+ unsigned long max_timeout = req->rq_xprt->timeout.to_maxval;
+
+ task->tk_timeout = rpc_calc_rto(rtt, timer);
+ task->tk_timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries;
+ if (task->tk_timeout > max_timeout || task->tk_timeout == 0)
+ task->tk_timeout = max_timeout;
+}
+
static void xprt_reset_majortimeo(struct rpc_rqst *req)
{
struct rpc_timeout *to = &req->rq_xprt->timeout;
@@ -368,8 +441,10 @@ static void xprt_reset_majortimeo(struct rpc_rqst *req)
req->rq_majortimeo += jiffies;
}
-/*
- * Adjust timeout values etc for next retransmit
+/**
+ * xprt_adjust_timeout - adjust timeout values for next retransmit
+ * @req: RPC request containing parameters to use for the adjustment
+ *
*/
int xprt_adjust_timeout(struct rpc_rqst *req)
{
@@ -391,9 +466,9 @@ int xprt_adjust_timeout(struct rpc_rqst *req)
req->rq_retries = 0;
xprt_reset_majortimeo(req);
/* Reset the RTT counters == "slow start" */
- spin_lock_bh(&xprt->sock_lock);
+ spin_lock_bh(&xprt->transport_lock);
rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval);
- spin_unlock_bh(&xprt->sock_lock);
+ spin_unlock_bh(&xprt->transport_lock);
pprintk("RPC: %lu timeout\n", jiffies);
status = -ETIMEDOUT;
}
@@ -405,133 +480,52 @@ int xprt_adjust_timeout(struct rpc_rqst *req)
return status;
}
-/*
- * Close down a transport socket
- */
-static void
-xprt_close(struct rpc_xprt *xprt)
-{
- struct socket *sock = xprt->sock;
- struct sock *sk = xprt->inet;
-
- if (!sk)
- return;
-
- write_lock_bh(&sk->sk_callback_lock);
- xprt->inet = NULL;
- xprt->sock = NULL;
-
- sk->sk_user_data = NULL;
- sk->sk_data_ready = xprt->old_data_ready;
- sk->sk_state_change = xprt->old_state_change;
- sk->sk_write_space = xprt->old_write_space;
- write_unlock_bh(&sk->sk_callback_lock);
-
- sk->sk_no_check = 0;
-
- sock_release(sock);
-}
-
-static void
-xprt_socket_autoclose(void *args)
+static void xprt_autoclose(void *args)
{
struct rpc_xprt *xprt = (struct rpc_xprt *)args;
xprt_disconnect(xprt);
- xprt_close(xprt);
+ xprt->ops->close(xprt);
xprt_release_write(xprt, NULL);
}
-/*
- * Mark a transport as disconnected
+/**
+ * xprt_disconnect - mark a transport as disconnected
+ * @xprt: transport to flag for disconnect
+ *
*/
-static void
-xprt_disconnect(struct rpc_xprt *xprt)
+void xprt_disconnect(struct rpc_xprt *xprt)
{
dprintk("RPC: disconnected transport %p\n", xprt);
- spin_lock_bh(&xprt->sock_lock);
+ spin_lock_bh(&xprt->transport_lock);
xprt_clear_connected(xprt);
- rpc_wake_up_status(&xprt->pending, -ENOTCONN);
- spin_unlock_bh(&xprt->sock_lock);
+ xprt_wake_pending_tasks(xprt, -ENOTCONN);
+ spin_unlock_bh(&xprt->transport_lock);
}
-/*
- * Used to allow disconnection when we've been idle
- */
static void
xprt_init_autodisconnect(unsigned long data)
{
struct rpc_xprt *xprt = (struct rpc_xprt *)data;
- spin_lock(&xprt->sock_lock);
+ spin_lock(&xprt->transport_lock);
if (!list_empty(&xprt->recv) || xprt->shutdown)
goto out_abort;
- if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate))
+ if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
goto out_abort;
- spin_unlock(&xprt->sock_lock);
- /* Let keventd close the socket */
- if (test_bit(XPRT_CONNECTING, &xprt->sockstate) != 0)
+ spin_unlock(&xprt->transport_lock);
+ if (xprt_connecting(xprt))
xprt_release_write(xprt, NULL);
else
schedule_work(&xprt->task_cleanup);
return;
out_abort:
- spin_unlock(&xprt->sock_lock);
-}
-
-static void xprt_socket_connect(void *args)
-{
- struct rpc_xprt *xprt = (struct rpc_xprt *)args;
- struct socket *sock = xprt->sock;
- int status = -EIO;
-
- if (xprt->shutdown || xprt->addr.sin_port == 0)
- goto out;
-
- /*
- * Start by resetting any existing state
- */
- xprt_close(xprt);
- sock = xprt_create_socket(xprt, xprt->prot, xprt->resvport);
- if (sock == NULL) {
- /* couldn't create socket or bind to reserved port;
- * this is likely a permanent error, so cause an abort */
- goto out;
- }
- xprt_bind_socket(xprt, sock);
- xprt_sock_setbufsize(xprt);
-
- status = 0;
- if (!xprt->stream)
- goto out;
-
- /*
- * Tell the socket layer to start connecting...
- */
- status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr,
- sizeof(xprt->addr), O_NONBLOCK);
- dprintk("RPC: %p connect status %d connected %d sock state %d\n",
- xprt, -status, xprt_connected(xprt), sock->sk->sk_state);
- if (status < 0) {
- switch (status) {
- case -EINPROGRESS:
- case -EALREADY:
- goto out_clear;
- }
- }
-out:
- if (status < 0)
- rpc_wake_up_status(&xprt->pending, status);
- else
- rpc_wake_up(&xprt->pending);
-out_clear:
- smp_mb__before_clear_bit();
- clear_bit(XPRT_CONNECTING, &xprt->sockstate);
- smp_mb__after_clear_bit();
+ spin_unlock(&xprt->transport_lock);
}
-/*
- * Attempt to connect a TCP socket.
+/**
+ * xprt_connect - schedule a transport connect operation
+ * @task: RPC task that is requesting the connect
*
*/
void xprt_connect(struct rpc_task *task)
@@ -552,37 +546,19 @@ void xprt_connect(struct rpc_task *task)
if (!xprt_lock_write(xprt, task))
return;
if (xprt_connected(xprt))
- goto out_write;
+ xprt_release_write(xprt, task);
+ else {
+ if (task->tk_rqstp)
+ task->tk_rqstp->rq_bytes_sent = 0;
- if (task->tk_rqstp)
- task->tk_rqstp->rq_bytes_sent = 0;
-
- task->tk_timeout = RPC_CONNECT_TIMEOUT;
- rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL);
- if (!test_and_set_bit(XPRT_CONNECTING, &xprt->sockstate)) {
- /* Note: if we are here due to a dropped connection
- * we delay reconnecting by RPC_REESTABLISH_TIMEOUT/HZ
- * seconds
- */
- if (xprt->sock != NULL)
- schedule_delayed_work(&xprt->sock_connect,
- RPC_REESTABLISH_TIMEOUT);
- else {
- schedule_work(&xprt->sock_connect);
- if (!RPC_IS_ASYNC(task))
- flush_scheduled_work();
- }
+ task->tk_timeout = xprt->connect_timeout;
+ rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL);
+ xprt->ops->connect(task);
}
return;
- out_write:
- xprt_release_write(xprt, task);
}
-/*
- * We arrive here when awoken from waiting on connection establishment.
- */
-static void
-xprt_connect_status(struct rpc_task *task)
+static void xprt_connect_status(struct rpc_task *task)
{
struct rpc_xprt *xprt = task->tk_xprt;
@@ -592,31 +568,42 @@ xprt_connect_status(struct rpc_task *task)
return;
}
- /* if soft mounted, just cause this RPC to fail */
- if (RPC_IS_SOFT(task))
- task->tk_status = -EIO;
-
switch (task->tk_status) {
case -ECONNREFUSED:
case -ECONNRESET:
+ dprintk("RPC: %4d xprt_connect_status: server %s refused connection\n",
+ task->tk_pid, task->tk_client->cl_server);
+ break;
case -ENOTCONN:
- return;
+ dprintk("RPC: %4d xprt_connect_status: connection broken\n",
+ task->tk_pid);
+ break;
case -ETIMEDOUT:
- dprintk("RPC: %4d xprt_connect_status: timed out\n",
+ dprintk("RPC: %4d xprt_connect_status: connect attempt timed out\n",
task->tk_pid);
break;
default:
- printk(KERN_ERR "RPC: error %d connecting to server %s\n",
- -task->tk_status, task->tk_client->cl_server);
+ dprintk("RPC: %4d xprt_connect_status: error %d connecting to server %s\n",
+ task->tk_pid, -task->tk_status, task->tk_client->cl_server);
+ xprt_release_write(xprt, task);
+ task->tk_status = -EIO;
+ return;
+ }
+
+ /* if soft mounted, just cause this RPC to fail */
+ if (RPC_IS_SOFT(task)) {
+ xprt_release_write(xprt, task);
+ task->tk_status = -EIO;
}
- xprt_release_write(xprt, task);
}
-/*
- * Look up the RPC request corresponding to a reply, and then lock it.
+/**
+ * xprt_lookup_rqst - find an RPC request corresponding to an XID
+ * @xprt: transport on which the original request was transmitted
+ * @xid: RPC XID of incoming reply
+ *
*/
-static inline struct rpc_rqst *
-xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid)
+struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid)
{
struct list_head *pos;
struct rpc_rqst *req = NULL;
@@ -631,556 +618,68 @@ xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid)
return req;
}
-/*
- * Complete reply received.
- * The TCP code relies on us to remove the request from xprt->pending.
- */
-static void
-xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied)
-{
- struct rpc_task *task = req->rq_task;
- struct rpc_clnt *clnt = task->tk_client;
-
- /* Adjust congestion window */
- if (!xprt->nocong) {
- unsigned timer = task->tk_msg.rpc_proc->p_timer;
- xprt_adjust_cwnd(xprt, copied);
- __xprt_put_cong(xprt, req);
- if (timer) {
- if (req->rq_ntrans == 1)
- rpc_update_rtt(clnt->cl_rtt, timer,
- (long)jiffies - req->rq_xtime);
- rpc_set_timeo(clnt->cl_rtt, timer, req->rq_ntrans - 1);
- }
- }
-
-#ifdef RPC_PROFILE
- /* Profile only reads for now */
- if (copied > 1024) {
- static unsigned long nextstat;
- static unsigned long pkt_rtt, pkt_len, pkt_cnt;
-
- pkt_cnt++;
- pkt_len += req->rq_slen + copied;
- pkt_rtt += jiffies - req->rq_xtime;
- if (time_before(nextstat, jiffies)) {
- printk("RPC: %lu %ld cwnd\n", jiffies, xprt->cwnd);
- printk("RPC: %ld %ld %ld %ld stat\n",
- jiffies, pkt_cnt, pkt_len, pkt_rtt);
- pkt_rtt = pkt_len = pkt_cnt = 0;
- nextstat = jiffies + 5 * HZ;
- }
- }
-#endif
-
- dprintk("RPC: %4d has input (%d bytes)\n", task->tk_pid, copied);
- list_del_init(&req->rq_list);
- req->rq_received = req->rq_private_buf.len = copied;
-
- /* ... and wake up the process. */
- rpc_wake_up_task(task);
- return;
-}
-
-static size_t
-skb_read_bits(skb_reader_t *desc, void *to, size_t len)
-{
- if (len > desc->count)
- len = desc->count;
- if (skb_copy_bits(desc->skb, desc->offset, to, len))
- return 0;
- desc->count -= len;
- desc->offset += len;
- return len;
-}
-
-static size_t
-skb_read_and_csum_bits(skb_reader_t *desc, void *to, size_t len)
-{
- unsigned int csum2, pos;
-
- if (len > desc->count)
- len = desc->count;
- pos = desc->offset;
- csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0);
- desc->csum = csum_block_add(desc->csum, csum2, pos);
- desc->count -= len;
- desc->offset += len;
- return len;
-}
-
-/*
- * We have set things up such that we perform the checksum of the UDP
- * packet in parallel with the copies into the RPC client iovec. -DaveM
- */
-int
-csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
-{
- skb_reader_t desc;
-
- desc.skb = skb;
- desc.offset = sizeof(struct udphdr);
- desc.count = skb->len - desc.offset;
-
- if (skb->ip_summed == CHECKSUM_UNNECESSARY)
- goto no_checksum;
-
- desc.csum = csum_partial(skb->data, desc.offset, skb->csum);
- if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits) < 0)
- return -1;
- if (desc.offset != skb->len) {
- unsigned int csum2;
- csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0);
- desc.csum = csum_block_add(desc.csum, csum2, desc.offset);
- }
- if (desc.count)
- return -1;
- if ((unsigned short)csum_fold(desc.csum))
- return -1;
- return 0;
-no_checksum:
- if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0)
- return -1;
- if (desc.count)
- return -1;
- return 0;
-}
-
-/*
- * Input handler for RPC replies. Called from a bottom half and hence
- * atomic.
- */
-static void
-udp_data_ready(struct sock *sk, int len)
-{
- struct rpc_task *task;
- struct rpc_xprt *xprt;
- struct rpc_rqst *rovr;
- struct sk_buff *skb;
- int err, repsize, copied;
- u32 _xid, *xp;
-
- read_lock(&sk->sk_callback_lock);
- dprintk("RPC: udp_data_ready...\n");
- if (!(xprt = xprt_from_sock(sk))) {
- printk("RPC: udp_data_ready request not found!\n");
- goto out;
- }
-
- dprintk("RPC: udp_data_ready client %p\n", xprt);
-
- if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL)
- goto out;
-
- if (xprt->shutdown)
- goto dropit;
-
- repsize = skb->len - sizeof(struct udphdr);
- if (repsize < 4) {
- printk("RPC: impossible RPC reply size %d!\n", repsize);
- goto dropit;
- }
-
- /* Copy the XID from the skb... */
- xp = skb_header_pointer(skb, sizeof(struct udphdr),
- sizeof(_xid), &_xid);
- if (xp == NULL)
- goto dropit;
-
- /* Look up and lock the request corresponding to the given XID */
- spin_lock(&xprt->sock_lock);
- rovr = xprt_lookup_rqst(xprt, *xp);
- if (!rovr)
- goto out_unlock;
- task = rovr->rq_task;
-
- dprintk("RPC: %4d received reply\n", task->tk_pid);
-
- if ((copied = rovr->rq_private_buf.buflen) > repsize)
- copied = repsize;
-
- /* Suck it into the iovec, verify checksum if not done by hw. */
- if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb))
- goto out_unlock;
-
- /* Something worked... */
- dst_confirm(skb->dst);
-
- xprt_complete_rqst(xprt, rovr, copied);
-
- out_unlock:
- spin_unlock(&xprt->sock_lock);
- dropit:
- skb_free_datagram(sk, skb);
- out:
- read_unlock(&sk->sk_callback_lock);
-}
-
-/*
- * Copy from an skb into memory and shrink the skb.
- */
-static inline size_t
-tcp_copy_data(skb_reader_t *desc, void *p, size_t len)
-{
- if (len > desc->count)
- len = desc->count;
- if (skb_copy_bits(desc->skb, desc->offset, p, len)) {
- dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n",
- len, desc->count);
- return 0;
- }
- desc->offset += len;
- desc->count -= len;
- dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n",
- len, desc->count);
- return len;
-}
-
-/*
- * TCP read fragment marker
- */
-static inline void
-tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc)
-{
- size_t len, used;
- char *p;
-
- p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset;
- len = sizeof(xprt->tcp_recm) - xprt->tcp_offset;
- used = tcp_copy_data(desc, p, len);
- xprt->tcp_offset += used;
- if (used != len)
- return;
- xprt->tcp_reclen = ntohl(xprt->tcp_recm);
- if (xprt->tcp_reclen & 0x80000000)
- xprt->tcp_flags |= XPRT_LAST_FRAG;
- else
- xprt->tcp_flags &= ~XPRT_LAST_FRAG;
- xprt->tcp_reclen &= 0x7fffffff;
- xprt->tcp_flags &= ~XPRT_COPY_RECM;
- xprt->tcp_offset = 0;
- /* Sanity check of the record length */
- if (xprt->tcp_reclen < 4) {
- printk(KERN_ERR "RPC: Invalid TCP record fragment length\n");
- xprt_disconnect(xprt);
- }
- dprintk("RPC: reading TCP record fragment of length %d\n",
- xprt->tcp_reclen);
-}
-
-static void
-tcp_check_recm(struct rpc_xprt *xprt)
-{
- dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n",
- xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags);
- if (xprt->tcp_offset == xprt->tcp_reclen) {
- xprt->tcp_flags |= XPRT_COPY_RECM;
- xprt->tcp_offset = 0;
- if (xprt->tcp_flags & XPRT_LAST_FRAG) {
- xprt->tcp_flags &= ~XPRT_COPY_DATA;
- xprt->tcp_flags |= XPRT_COPY_XID;
- xprt->tcp_copied = 0;
- }
- }
-}
-
-/*
- * TCP read xid
- */
-static inline void
-tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc)
-{
- size_t len, used;
- char *p;
-
- len = sizeof(xprt->tcp_xid) - xprt->tcp_offset;
- dprintk("RPC: reading XID (%Zu bytes)\n", len);
- p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset;
- used = tcp_copy_data(desc, p, len);
- xprt->tcp_offset += used;
- if (used != len)
- return;
- xprt->tcp_flags &= ~XPRT_COPY_XID;
- xprt->tcp_flags |= XPRT_COPY_DATA;
- xprt->tcp_copied = 4;
- dprintk("RPC: reading reply for XID %08x\n",
- ntohl(xprt->tcp_xid));
- tcp_check_recm(xprt);
-}
-
-/*
- * TCP read and complete request
- */
-static inline void
-tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
-{
- struct rpc_rqst *req;
- struct xdr_buf *rcvbuf;
- size_t len;
- ssize_t r;
-
- /* Find and lock the request corresponding to this xid */
- spin_lock(&xprt->sock_lock);
- req = xprt_lookup_rqst(xprt, xprt->tcp_xid);
- if (!req) {
- xprt->tcp_flags &= ~XPRT_COPY_DATA;
- dprintk("RPC: XID %08x request not found!\n",
- ntohl(xprt->tcp_xid));
- spin_unlock(&xprt->sock_lock);
- return;
- }
-
- rcvbuf = &req->rq_private_buf;
- len = desc->count;
- if (len > xprt->tcp_reclen - xprt->tcp_offset) {
- skb_reader_t my_desc;
-
- len = xprt->tcp_reclen - xprt->tcp_offset;
- memcpy(&my_desc, desc, sizeof(my_desc));
- my_desc.count = len;
- r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
- &my_desc, tcp_copy_data);
- desc->count -= r;
- desc->offset += r;
- } else
- r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
- desc, tcp_copy_data);
-
- if (r > 0) {
- xprt->tcp_copied += r;
- xprt->tcp_offset += r;
- }
- if (r != len) {
- /* Error when copying to the receive buffer,
- * usually because we weren't able to allocate
- * additional buffer pages. All we can do now
- * is turn off XPRT_COPY_DATA, so the request
- * will not receive any additional updates,
- * and time out.
- * Any remaining data from this record will
- * be discarded.
- */
- xprt->tcp_flags &= ~XPRT_COPY_DATA;
- dprintk("RPC: XID %08x truncated request\n",
- ntohl(xprt->tcp_xid));
- dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
- xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
- goto out;
- }
-
- dprintk("RPC: XID %08x read %Zd bytes\n",
- ntohl(xprt->tcp_xid), r);
- dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
- xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
-
- if (xprt->tcp_copied == req->rq_private_buf.buflen)
- xprt->tcp_flags &= ~XPRT_COPY_DATA;
- else if (xprt->tcp_offset == xprt->tcp_reclen) {
- if (xprt->tcp_flags & XPRT_LAST_FRAG)
- xprt->tcp_flags &= ~XPRT_COPY_DATA;
- }
-
-out:
- if (!(xprt->tcp_flags & XPRT_COPY_DATA)) {
- dprintk("RPC: %4d received reply complete\n",
- req->rq_task->tk_pid);
- xprt_complete_rqst(xprt, req, xprt->tcp_copied);
- }
- spin_unlock(&xprt->sock_lock);
- tcp_check_recm(xprt);
-}
-
-/*
- * TCP discard extra bytes from a short read
- */
-static inline void
-tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
-{
- size_t len;
-
- len = xprt->tcp_reclen - xprt->tcp_offset;
- if (len > desc->count)
- len = desc->count;
- desc->count -= len;
- desc->offset += len;
- xprt->tcp_offset += len;
- dprintk("RPC: discarded %Zu bytes\n", len);
- tcp_check_recm(xprt);
-}
-
-/*
- * TCP record receive routine
- * We first have to grab the record marker, then the XID, then the data.
+/**
+ * xprt_update_rtt - update an RPC client's RTT state after receiving a reply
+ * @task: RPC request that recently completed
+ *
*/
-static int
-tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
- unsigned int offset, size_t len)
-{
- struct rpc_xprt *xprt = rd_desc->arg.data;
- skb_reader_t desc = {
- .skb = skb,
- .offset = offset,
- .count = len,
- .csum = 0
- };
-
- dprintk("RPC: tcp_data_recv\n");
- do {
- /* Read in a new fragment marker if necessary */
- /* Can we ever really expect to get completely empty fragments? */
- if (xprt->tcp_flags & XPRT_COPY_RECM) {
- tcp_read_fraghdr(xprt, &desc);
- continue;
- }
- /* Read in the xid if necessary */
- if (xprt->tcp_flags & XPRT_COPY_XID) {
- tcp_read_xid(xprt, &desc);
- continue;
- }
- /* Read in the request data */
- if (xprt->tcp_flags & XPRT_COPY_DATA) {
- tcp_read_request(xprt, &desc);
- continue;
- }
- /* Skip over any trailing bytes on short reads */
- tcp_read_discard(xprt, &desc);
- } while (desc.count);
- dprintk("RPC: tcp_data_recv done\n");
- return len - desc.count;
-}
-
-static void tcp_data_ready(struct sock *sk, int bytes)
+void xprt_update_rtt(struct rpc_task *task)
{
- struct rpc_xprt *xprt;
- read_descriptor_t rd_desc;
-
- read_lock(&sk->sk_callback_lock);
- dprintk("RPC: tcp_data_ready...\n");
- if (!(xprt = xprt_from_sock(sk))) {
- printk("RPC: tcp_data_ready socket info not found!\n");
- goto out;
- }
- if (xprt->shutdown)
- goto out;
-
- /* We use rd_desc to pass struct xprt to tcp_data_recv */
- rd_desc.arg.data = xprt;
- rd_desc.count = 65536;
- tcp_read_sock(sk, &rd_desc, tcp_data_recv);
-out:
- read_unlock(&sk->sk_callback_lock);
-}
-
-static void
-tcp_state_change(struct sock *sk)
-{
- struct rpc_xprt *xprt;
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_rtt *rtt = task->tk_client->cl_rtt;
+ unsigned timer = task->tk_msg.rpc_proc->p_timer;
- read_lock(&sk->sk_callback_lock);
- if (!(xprt = xprt_from_sock(sk)))
- goto out;
- dprintk("RPC: tcp_state_change client %p...\n", xprt);
- dprintk("RPC: state %x conn %d dead %d zapped %d\n",
- sk->sk_state, xprt_connected(xprt),
- sock_flag(sk, SOCK_DEAD),
- sock_flag(sk, SOCK_ZAPPED));
-
- switch (sk->sk_state) {
- case TCP_ESTABLISHED:
- spin_lock_bh(&xprt->sock_lock);
- if (!xprt_test_and_set_connected(xprt)) {
- /* Reset TCP record info */
- xprt->tcp_offset = 0;
- xprt->tcp_reclen = 0;
- xprt->tcp_copied = 0;
- xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID;
- rpc_wake_up(&xprt->pending);
- }
- spin_unlock_bh(&xprt->sock_lock);
- break;
- case TCP_SYN_SENT:
- case TCP_SYN_RECV:
- break;
- default:
- xprt_disconnect(xprt);
- break;
+ if (timer) {
+ if (req->rq_ntrans == 1)
+ rpc_update_rtt(rtt, timer,
+ (long)jiffies - req->rq_xtime);
+ rpc_set_timeo(rtt, timer, req->rq_ntrans - 1);
}
- out:
- read_unlock(&sk->sk_callback_lock);
}
-/*
- * Called when more output buffer space is available for this socket.
- * We try not to wake our writers until they can make "significant"
- * progress, otherwise we'll waste resources thrashing sock_sendmsg
- * with a bunch of small requests.
+/**
+ * xprt_complete_rqst - called when reply processing is complete
+ * @task: RPC request that recently completed
+ * @copied: actual number of bytes received from the transport
+ *
+ * Caller holds transport lock.
*/
-static void
-xprt_write_space(struct sock *sk)
+void xprt_complete_rqst(struct rpc_task *task, int copied)
{
- struct rpc_xprt *xprt;
- struct socket *sock;
-
- read_lock(&sk->sk_callback_lock);
- if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->sk_socket))
- goto out;
- if (xprt->shutdown)
- goto out;
-
- /* Wait until we have enough socket memory */
- if (xprt->stream) {
- /* from net/core/stream.c:sk_stream_write_space */
- if (sk_stream_wspace(sk) < sk_stream_min_wspace(sk))
- goto out;
- } else {
- /* from net/core/sock.c:sock_def_write_space */
- if (!sock_writeable(sk))
- goto out;
- }
+ struct rpc_rqst *req = task->tk_rqstp;
- if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags))
- goto out;
+ dprintk("RPC: %5u xid %08x complete (%d bytes received)\n",
+ task->tk_pid, ntohl(req->rq_xid), copied);
- spin_lock_bh(&xprt->sock_lock);
- if (xprt->snd_task)
- rpc_wake_up_task(xprt->snd_task);
- spin_unlock_bh(&xprt->sock_lock);
-out:
- read_unlock(&sk->sk_callback_lock);
+ list_del_init(&req->rq_list);
+ req->rq_received = req->rq_private_buf.len = copied;
+ rpc_wake_up_task(task);
}
-/*
- * RPC receive timeout handler.
- */
-static void
-xprt_timer(struct rpc_task *task)
+static void xprt_timer(struct rpc_task *task)
{
- struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
- spin_lock(&xprt->sock_lock);
- if (req->rq_received)
- goto out;
-
- xprt_adjust_cwnd(req->rq_xprt, -ETIMEDOUT);
- __xprt_put_cong(xprt, req);
+ dprintk("RPC: %4d xprt_timer\n", task->tk_pid);
- dprintk("RPC: %4d xprt_timer (%s request)\n",
- task->tk_pid, req ? "pending" : "backlogged");
-
- task->tk_status = -ETIMEDOUT;
-out:
+ spin_lock(&xprt->transport_lock);
+ if (!req->rq_received) {
+ if (xprt->ops->timer)
+ xprt->ops->timer(task);
+ task->tk_status = -ETIMEDOUT;
+ }
task->tk_timeout = 0;
rpc_wake_up_task(task);
- spin_unlock(&xprt->sock_lock);
+ spin_unlock(&xprt->transport_lock);
}
-/*
- * Place the actual RPC call.
- * We have to copy the iovec because sendmsg fiddles with its contents.
+/**
+ * xprt_prepare_transmit - reserve the transport before sending a request
+ * @task: RPC task about to send a request
+ *
*/
-int
-xprt_prepare_transmit(struct rpc_task *task)
+int xprt_prepare_transmit(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
@@ -1191,12 +690,12 @@ xprt_prepare_transmit(struct rpc_task *task)
if (xprt->shutdown)
return -EIO;
- spin_lock_bh(&xprt->sock_lock);
+ spin_lock_bh(&xprt->transport_lock);
if (req->rq_received && !req->rq_bytes_sent) {
err = req->rq_received;
goto out_unlock;
}
- if (!__xprt_lock_write(xprt, task)) {
+ if (!xprt->ops->reserve_xprt(task)) {
err = -EAGAIN;
goto out_unlock;
}
@@ -1206,39 +705,42 @@ xprt_prepare_transmit(struct rpc_task *task)
goto out_unlock;
}
out_unlock:
- spin_unlock_bh(&xprt->sock_lock);
+ spin_unlock_bh(&xprt->transport_lock);
return err;
}
void
-xprt_transmit(struct rpc_task *task)
+xprt_abort_transmit(struct rpc_task *task)
+{
+ struct rpc_xprt *xprt = task->tk_xprt;
+
+ xprt_release_write(xprt, task);
+}
+
+/**
+ * xprt_transmit - send an RPC request on a transport
+ * @task: controlling RPC task
+ *
+ * We have to copy the iovec because sendmsg fiddles with its contents.
+ */
+void xprt_transmit(struct rpc_task *task)
{
- struct rpc_clnt *clnt = task->tk_client;
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
- int status, retry = 0;
-
+ int status;
dprintk("RPC: %4d xprt_transmit(%u)\n", task->tk_pid, req->rq_slen);
- /* set up everything as needed. */
- /* Write the record marker */
- if (xprt->stream) {
- u32 *marker = req->rq_svec[0].iov_base;
-
- *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker)));
- }
-
smp_rmb();
if (!req->rq_received) {
if (list_empty(&req->rq_list)) {
- spin_lock_bh(&xprt->sock_lock);
+ spin_lock_bh(&xprt->transport_lock);
/* Update the softirq receive buffer */
memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
sizeof(req->rq_private_buf));
/* Add request to the receive list */
list_add_tail(&req->rq_list, &xprt->recv);
- spin_unlock_bh(&xprt->sock_lock);
+ spin_unlock_bh(&xprt->transport_lock);
xprt_reset_majortimeo(req);
/* Turn off autodisconnect */
del_singleshot_timer_sync(&xprt->timer);
@@ -1246,40 +748,19 @@ xprt_transmit(struct rpc_task *task)
} else if (!req->rq_bytes_sent)
return;
- /* Continue transmitting the packet/record. We must be careful
- * to cope with writespace callbacks arriving _after_ we have
- * called xprt_sendmsg().
- */
- while (1) {
- req->rq_xtime = jiffies;
- status = xprt_sendmsg(xprt, req);
-
- if (status < 0)
- break;
-
- if (xprt->stream) {
- req->rq_bytes_sent += status;
-
- /* If we've sent the entire packet, immediately
- * reset the count of bytes sent. */
- if (req->rq_bytes_sent >= req->rq_slen) {
- req->rq_bytes_sent = 0;
- goto out_receive;
- }
- } else {
- if (status >= req->rq_slen)
- goto out_receive;
- status = -EAGAIN;
- break;
- }
-
- dprintk("RPC: %4d xmit incomplete (%d left of %d)\n",
- task->tk_pid, req->rq_slen - req->rq_bytes_sent,
- req->rq_slen);
-
- status = -EAGAIN;
- if (retry++ > 50)
- break;
+ status = xprt->ops->send_request(task);
+ if (status == 0) {
+ dprintk("RPC: %4d xmit complete\n", task->tk_pid);
+ spin_lock_bh(&xprt->transport_lock);
+ xprt->ops->set_retrans_timeout(task);
+ /* Don't race with disconnect */
+ if (!xprt_connected(xprt))
+ task->tk_status = -ENOTCONN;
+ else if (!req->rq_received)
+ rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer);
+ xprt->ops->release_xprt(xprt, task);
+ spin_unlock_bh(&xprt->transport_lock);
+ return;
}
/* Note: at this point, task->tk_sleeping has not yet been set,
@@ -1289,60 +770,19 @@ xprt_transmit(struct rpc_task *task)
task->tk_status = status;
switch (status) {
- case -EAGAIN:
- if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) {
- /* Protect against races with xprt_write_space */
- spin_lock_bh(&xprt->sock_lock);
- /* Don't race with disconnect */
- if (!xprt_connected(xprt))
- task->tk_status = -ENOTCONN;
- else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) {
- task->tk_timeout = req->rq_timeout;
- rpc_sleep_on(&xprt->pending, task, NULL, NULL);
- }
- spin_unlock_bh(&xprt->sock_lock);
- return;
- }
- /* Keep holding the socket if it is blocked */
- rpc_delay(task, HZ>>4);
- return;
case -ECONNREFUSED:
- task->tk_timeout = RPC_REESTABLISH_TIMEOUT;
rpc_sleep_on(&xprt->sending, task, NULL, NULL);
+ case -EAGAIN:
case -ENOTCONN:
return;
default:
- if (xprt->stream)
- xprt_disconnect(xprt);
+ break;
}
xprt_release_write(xprt, task);
return;
- out_receive:
- dprintk("RPC: %4d xmit complete\n", task->tk_pid);
- /* Set the task's receive timeout value */
- spin_lock_bh(&xprt->sock_lock);
- if (!xprt->nocong) {
- int timer = task->tk_msg.rpc_proc->p_timer;
- task->tk_timeout = rpc_calc_rto(clnt->cl_rtt, timer);
- task->tk_timeout <<= rpc_ntimeo(clnt->cl_rtt, timer) + req->rq_retries;
- if (task->tk_timeout > xprt->timeout.to_maxval || task->tk_timeout == 0)
- task->tk_timeout = xprt->timeout.to_maxval;
- } else
- task->tk_timeout = req->rq_timeout;
- /* Don't race with disconnect */
- if (!xprt_connected(xprt))
- task->tk_status = -ENOTCONN;
- else if (!req->rq_received)
- rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer);
- __xprt_release_write(xprt, task);
- spin_unlock_bh(&xprt->sock_lock);
}
-/*
- * Reserve an RPC call slot.
- */
-static inline void
-do_xprt_reserve(struct rpc_task *task)
+static inline void do_xprt_reserve(struct rpc_task *task)
{
struct rpc_xprt *xprt = task->tk_xprt;
@@ -1362,22 +802,25 @@ do_xprt_reserve(struct rpc_task *task)
rpc_sleep_on(&xprt->backlog, task, NULL, NULL);
}
-void
-xprt_reserve(struct rpc_task *task)
+/**
+ * xprt_reserve - allocate an RPC request slot
+ * @task: RPC task requesting a slot allocation
+ *
+ * If no more slots are available, place the task on the transport's
+ * backlog queue.
+ */
+void xprt_reserve(struct rpc_task *task)
{
struct rpc_xprt *xprt = task->tk_xprt;
task->tk_status = -EIO;
if (!xprt->shutdown) {
- spin_lock(&xprt->xprt_lock);
+ spin_lock(&xprt->reserve_lock);
do_xprt_reserve(task);
- spin_unlock(&xprt->xprt_lock);
+ spin_unlock(&xprt->reserve_lock);
}
}
-/*
- * Allocate a 'unique' XID
- */
static inline u32 xprt_alloc_xid(struct rpc_xprt *xprt)
{
return xprt->xid++;
@@ -1388,11 +831,7 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt)
get_random_bytes(&xprt->xid, sizeof(xprt->xid));
}
-/*
- * Initialize RPC request
- */
-static void
-xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
+static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
{
struct rpc_rqst *req = task->tk_rqstp;
@@ -1400,128 +839,104 @@ xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
req->rq_task = task;
req->rq_xprt = xprt;
req->rq_xid = xprt_alloc_xid(xprt);
+ req->rq_release_snd_buf = NULL;
dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid,
req, ntohl(req->rq_xid));
}
-/*
- * Release an RPC call slot
+/**
+ * xprt_release - release an RPC request slot
+ * @task: task which is finished with the slot
+ *
*/
-void
-xprt_release(struct rpc_task *task)
+void xprt_release(struct rpc_task *task)
{
struct rpc_xprt *xprt = task->tk_xprt;
struct rpc_rqst *req;
if (!(req = task->tk_rqstp))
return;
- spin_lock_bh(&xprt->sock_lock);
- __xprt_release_write(xprt, task);
- __xprt_put_cong(xprt, req);
+ spin_lock_bh(&xprt->transport_lock);
+ xprt->ops->release_xprt(xprt, task);
+ if (xprt->ops->release_request)
+ xprt->ops->release_request(task);
if (!list_empty(&req->rq_list))
list_del(&req->rq_list);
xprt->last_used = jiffies;
if (list_empty(&xprt->recv) && !xprt->shutdown)
- mod_timer(&xprt->timer, xprt->last_used + XPRT_IDLE_TIMEOUT);
- spin_unlock_bh(&xprt->sock_lock);
+ mod_timer(&xprt->timer,
+ xprt->last_used + xprt->idle_timeout);
+ spin_unlock_bh(&xprt->transport_lock);
task->tk_rqstp = NULL;
+ if (req->rq_release_snd_buf)
+ req->rq_release_snd_buf(req);
memset(req, 0, sizeof(*req)); /* mark unused */
dprintk("RPC: %4d release request %p\n", task->tk_pid, req);
- spin_lock(&xprt->xprt_lock);
+ spin_lock(&xprt->reserve_lock);
list_add(&req->rq_list, &xprt->free);
- xprt_clear_backlog(xprt);
- spin_unlock(&xprt->xprt_lock);
-}
-
-/*
- * Set default timeout parameters
- */
-static void
-xprt_default_timeout(struct rpc_timeout *to, int proto)
-{
- if (proto == IPPROTO_UDP)
- xprt_set_timeout(to, 5, 5 * HZ);
- else
- xprt_set_timeout(to, 5, 60 * HZ);
+ rpc_wake_up_next(&xprt->backlog);
+ spin_unlock(&xprt->reserve_lock);
}
-/*
- * Set constant timeout
+/**
+ * xprt_set_timeout - set constant RPC timeout
+ * @to: RPC timeout parameters to set up
+ * @retr: number of retries
+ * @incr: amount of increase after each retry
+ *
*/
-void
-xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr)
+void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr)
{
to->to_initval =
to->to_increment = incr;
- to->to_maxval = incr * retr;
+ to->to_maxval = to->to_initval + (incr * retr);
to->to_retries = retr;
to->to_exponential = 0;
}
-unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE;
-unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE;
-
-/*
- * Initialize an RPC client
- */
-static struct rpc_xprt *
-xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to)
+static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to)
{
+ int result;
struct rpc_xprt *xprt;
- unsigned int entries;
- size_t slot_table_size;
struct rpc_rqst *req;
- dprintk("RPC: setting up %s transport...\n",
- proto == IPPROTO_UDP? "UDP" : "TCP");
-
- entries = (proto == IPPROTO_TCP)?
- xprt_tcp_slot_table_entries : xprt_udp_slot_table_entries;
-
if ((xprt = kmalloc(sizeof(struct rpc_xprt), GFP_KERNEL)) == NULL)
return ERR_PTR(-ENOMEM);
memset(xprt, 0, sizeof(*xprt)); /* Nnnngh! */
- xprt->max_reqs = entries;
- slot_table_size = entries * sizeof(xprt->slot[0]);
- xprt->slot = kmalloc(slot_table_size, GFP_KERNEL);
- if (xprt->slot == NULL) {
- kfree(xprt);
- return ERR_PTR(-ENOMEM);
- }
- memset(xprt->slot, 0, slot_table_size);
xprt->addr = *ap;
- xprt->prot = proto;
- xprt->stream = (proto == IPPROTO_TCP)? 1 : 0;
- if (xprt->stream) {
- xprt->cwnd = RPC_MAXCWND(xprt);
- xprt->nocong = 1;
- xprt->max_payload = (1U << 31) - 1;
- } else {
- xprt->cwnd = RPC_INITCWND;
- xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
+
+ switch (proto) {
+ case IPPROTO_UDP:
+ result = xs_setup_udp(xprt, to);
+ break;
+ case IPPROTO_TCP:
+ result = xs_setup_tcp(xprt, to);
+ break;
+ default:
+ printk(KERN_ERR "RPC: unrecognized transport protocol: %d\n",
+ proto);
+ result = -EIO;
+ break;
+ }
+ if (result) {
+ kfree(xprt);
+ return ERR_PTR(result);
}
- spin_lock_init(&xprt->sock_lock);
- spin_lock_init(&xprt->xprt_lock);
- init_waitqueue_head(&xprt->cong_wait);
+
+ spin_lock_init(&xprt->transport_lock);
+ spin_lock_init(&xprt->reserve_lock);
INIT_LIST_HEAD(&xprt->free);
INIT_LIST_HEAD(&xprt->recv);
- INIT_WORK(&xprt->sock_connect, xprt_socket_connect, xprt);
- INIT_WORK(&xprt->task_cleanup, xprt_socket_autoclose, xprt);
+ INIT_WORK(&xprt->task_cleanup, xprt_autoclose, xprt);
init_timer(&xprt->timer);
xprt->timer.function = xprt_init_autodisconnect;
xprt->timer.data = (unsigned long) xprt;
xprt->last_used = jiffies;
- xprt->port = XPRT_MAX_RESVPORT;
-
- /* Set timeout parameters */
- if (to) {
- xprt->timeout = *to;
- } else
- xprt_default_timeout(&xprt->timeout, xprt->prot);
+ xprt->cwnd = RPC_INITCWND;
rpc_init_wait_queue(&xprt->pending, "xprt_pending");
rpc_init_wait_queue(&xprt->sending, "xprt_sending");
@@ -1529,139 +944,25 @@ xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to)
rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog");
/* initialize free list */
- for (req = &xprt->slot[entries-1]; req >= &xprt->slot[0]; req--)
+ for (req = &xprt->slot[xprt->max_reqs-1]; req >= &xprt->slot[0]; req--)
list_add(&req->rq_list, &xprt->free);
xprt_init_xid(xprt);
- /* Check whether we want to use a reserved port */
- xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0;
-
dprintk("RPC: created transport %p with %u slots\n", xprt,
xprt->max_reqs);
return xprt;
}
-/*
- * Bind to a reserved port
- */
-static inline int xprt_bindresvport(struct rpc_xprt *xprt, struct socket *sock)
-{
- struct sockaddr_in myaddr = {
- .sin_family = AF_INET,
- };
- int err, port;
-
- /* Were we already bound to a given port? Try to reuse it */
- port = xprt->port;
- do {
- myaddr.sin_port = htons(port);
- err = sock->ops->bind(sock, (struct sockaddr *) &myaddr,
- sizeof(myaddr));
- if (err == 0) {
- xprt->port = port;
- return 0;
- }
- if (--port == 0)
- port = XPRT_MAX_RESVPORT;
- } while (err == -EADDRINUSE && port != xprt->port);
-
- printk("RPC: Can't bind to reserved port (%d).\n", -err);
- return err;
-}
-
-static void
-xprt_bind_socket(struct rpc_xprt *xprt, struct socket *sock)
-{
- struct sock *sk = sock->sk;
-
- if (xprt->inet)
- return;
-
- write_lock_bh(&sk->sk_callback_lock);
- sk->sk_user_data = xprt;
- xprt->old_data_ready = sk->sk_data_ready;
- xprt->old_state_change = sk->sk_state_change;
- xprt->old_write_space = sk->sk_write_space;
- if (xprt->prot == IPPROTO_UDP) {
- sk->sk_data_ready = udp_data_ready;
- sk->sk_no_check = UDP_CSUM_NORCV;
- xprt_set_connected(xprt);
- } else {
- tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */
- sk->sk_data_ready = tcp_data_ready;
- sk->sk_state_change = tcp_state_change;
- xprt_clear_connected(xprt);
- }
- sk->sk_write_space = xprt_write_space;
-
- /* Reset to new socket */
- xprt->sock = sock;
- xprt->inet = sk;
- write_unlock_bh(&sk->sk_callback_lock);
-
- return;
-}
-
-/*
- * Set socket buffer length
- */
-void
-xprt_sock_setbufsize(struct rpc_xprt *xprt)
-{
- struct sock *sk = xprt->inet;
-
- if (xprt->stream)
- return;
- if (xprt->rcvsize) {
- sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2;
- }
- if (xprt->sndsize) {
- sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- sk->sk_sndbuf = xprt->sndsize * xprt->max_reqs * 2;
- sk->sk_write_space(sk);
- }
-}
-
-/*
- * Datastream sockets are created here, but xprt_connect will create
- * and connect stream sockets.
- */
-static struct socket * xprt_create_socket(struct rpc_xprt *xprt, int proto, int resvport)
-{
- struct socket *sock;
- int type, err;
-
- dprintk("RPC: xprt_create_socket(%s %d)\n",
- (proto == IPPROTO_UDP)? "udp" : "tcp", proto);
-
- type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
-
- if ((err = sock_create_kern(PF_INET, type, proto, &sock)) < 0) {
- printk("RPC: can't create socket (%d).\n", -err);
- return NULL;
- }
-
- /* If the caller has the capability, bind to a reserved port */
- if (resvport && xprt_bindresvport(xprt, sock) < 0) {
- printk("RPC: can't bind to reserved port.\n");
- goto failed;
- }
-
- return sock;
-
-failed:
- sock_release(sock);
- return NULL;
-}
-
-/*
- * Create an RPC client transport given the protocol and peer address.
+/**
+ * xprt_create_proto - create an RPC client transport
+ * @proto: requested transport protocol
+ * @sap: remote peer's address
+ * @to: timeout parameters for new transport
+ *
*/
-struct rpc_xprt *
-xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to)
+struct rpc_xprt *xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to)
{
struct rpc_xprt *xprt;
@@ -1673,46 +974,26 @@ xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to)
return xprt;
}
-/*
- * Prepare for transport shutdown.
- */
-static void
-xprt_shutdown(struct rpc_xprt *xprt)
+static void xprt_shutdown(struct rpc_xprt *xprt)
{
xprt->shutdown = 1;
rpc_wake_up(&xprt->sending);
rpc_wake_up(&xprt->resend);
- rpc_wake_up(&xprt->pending);
+ xprt_wake_pending_tasks(xprt, -EIO);
rpc_wake_up(&xprt->backlog);
- wake_up(&xprt->cong_wait);
del_timer_sync(&xprt->timer);
-
- /* synchronously wait for connect worker to finish */
- cancel_delayed_work(&xprt->sock_connect);
- flush_scheduled_work();
}
-/*
- * Clear the xprt backlog queue
- */
-static int
-xprt_clear_backlog(struct rpc_xprt *xprt) {
- rpc_wake_up_next(&xprt->backlog);
- wake_up(&xprt->cong_wait);
- return 1;
-}
-
-/*
- * Destroy an RPC transport, killing off all requests.
+/**
+ * xprt_destroy - destroy an RPC transport, killing off all requests.
+ * @xprt: transport to destroy
+ *
*/
-int
-xprt_destroy(struct rpc_xprt *xprt)
+int xprt_destroy(struct rpc_xprt *xprt)
{
dprintk("RPC: destroying transport %p\n", xprt);
xprt_shutdown(xprt);
- xprt_disconnect(xprt);
- xprt_close(xprt);
- kfree(xprt->slot);
+ xprt->ops->destroy(xprt);
kfree(xprt);
return 0;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
new file mode 100644
index 00000000000..0a51fd46a84
--- /dev/null
+++ b/net/sunrpc/xprtsock.c
@@ -0,0 +1,1261 @@
+/*
+ * linux/net/sunrpc/xprtsock.c
+ *
+ * Client-side transport implementation for sockets.
+ *
+ * TCP callback races fixes (C) 1998 Red Hat Software <alan@redhat.com>
+ * TCP send fixes (C) 1998 Red Hat Software <alan@redhat.com>
+ * TCP NFS related read + write fixes
+ * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie>
+ *
+ * Rewrite of large parts of the code in order to stabilize TCP stuff.
+ * Fix behaviour when socket buffer is full.
+ * (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no>
+ *
+ * IP socket transport implementation, (C) 2005 Chuck Lever <cel@netapp.com>
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/capability.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/net.h>
+#include <linux/mm.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/file.h>
+
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+
+/*
+ * xprtsock tunables
+ */
+unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE;
+unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE;
+
+unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT;
+unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT;
+
+/*
+ * How many times to try sending a request on a socket before waiting
+ * for the socket buffer to clear.
+ */
+#define XS_SENDMSG_RETRY (10U)
+
+/*
+ * Time out for an RPC UDP socket connect. UDP socket connects are
+ * synchronous, but we set a timeout anyway in case of resource
+ * exhaustion on the local host.
+ */
+#define XS_UDP_CONN_TO (5U * HZ)
+
+/*
+ * Wait duration for an RPC TCP connection to be established. Solaris
+ * NFS over TCP uses 60 seconds, for example, which is in line with how
+ * long a server takes to reboot.
+ */
+#define XS_TCP_CONN_TO (60U * HZ)
+
+/*
+ * Wait duration for a reply from the RPC portmapper.
+ */
+#define XS_BIND_TO (60U * HZ)
+
+/*
+ * Delay if a UDP socket connect error occurs. This is most likely some
+ * kind of resource problem on the local host.
+ */
+#define XS_UDP_REEST_TO (2U * HZ)
+
+/*
+ * The reestablish timeout allows clients to delay for a bit before attempting
+ * to reconnect to a server that just dropped our connection.
+ *
+ * We implement an exponential backoff when trying to reestablish a TCP
+ * transport connection with the server. Some servers like to drop a TCP
+ * connection when they are overworked, so we start with a short timeout and
+ * increase over time if the server is down or not responding.
+ */
+#define XS_TCP_INIT_REEST_TO (3U * HZ)
+#define XS_TCP_MAX_REEST_TO (5U * 60 * HZ)
+
+/*
+ * TCP idle timeout; client drops the transport socket if it is idle
+ * for this long. Note that we also timeout UDP sockets to prevent
+ * holding port numbers when there is no RPC traffic.
+ */
+#define XS_IDLE_DISC_TO (5U * 60 * HZ)
+
+#ifdef RPC_DEBUG
+# undef RPC_DEBUG_DATA
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+#ifdef RPC_DEBUG_DATA
+/*
+ * xs_pktdump - hex-dump the start of a packet through dprintk
+ * @msg: heading printed before the dump
+ * @packet: start of the data to dump
+ * @count: number of valid bytes at @packet
+ *
+ * Dumps at most the first 128 bytes, in groups of four bytes, starting a
+ * new output line (prefixed with the offset) every 32 bytes. Only bytes
+ * below @count are read, so a length that is not a multiple of four does
+ * not cause a read past the end of the buffer.
+ */
+static void xs_pktdump(char *msg, u32 *packet, unsigned int count)
+{
+	u8 *buf = (u8 *) packet;
+	unsigned int j, k;
+
+	dprintk("RPC: %s\n", msg);
+	for (j = 0; j < count && j < 128; j += 4) {
+		if (!(j & 31)) {
+			if (j)
+				dprintk("\n");
+			dprintk("0x%04x ", j);
+		}
+		/* Print this group byte by byte, stopping at the end of the data */
+		for (k = j; k < j + 4 && k < count; k++)
+			dprintk("%02x", buf[k]);
+		dprintk(" ");
+	}
+	dprintk("\n");
+}
+#else
+/* Packet dumping is compiled out unless RPC_DEBUG_DATA is defined. */
+static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count)
+{
+	/* NOP */
+}
+#endif
+
+#define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
+
+/*
+ * Send the part of the head kvec of @xdr that lies between @base and @len.
+ * @addr/@addrlen are handed to the socket as the message name. MSG_MORE is
+ * set whenever the whole buffer is longer than the head.
+ */
+static inline int xs_send_head(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, unsigned int len)
+{
+	struct msghdr hdr = {
+		.msg_name	= addr,
+		.msg_namelen	= addrlen,
+		.msg_flags	= XS_SENDMSG_FLAGS | ((xdr->len > len) ? MSG_MORE : 0),
+	};
+	struct kvec vec;
+
+	vec.iov_base = xdr->head[0].iov_base + base;
+	vec.iov_len = len - base;
+
+	/* Nothing left in the head: issue the send with no data vector */
+	if (unlikely(vec.iov_len == 0))
+		return kernel_sendmsg(sock, &hdr, NULL, 0, 0);
+
+	return kernel_sendmsg(sock, &hdr, &vec, 1, vec.iov_len);
+}
+
+/*
+ * Send the part of the tail kvec of @xdr that lies between @base and @len.
+ * No destination address is supplied and MSG_MORE is never set here.
+ */
+static int xs_send_tail(struct socket *sock, struct xdr_buf *xdr, unsigned int base, unsigned int len)
+{
+	struct msghdr hdr = {
+		.msg_flags	= XS_SENDMSG_FLAGS,
+	};
+	struct kvec vec;
+
+	vec.iov_base = xdr->tail[0].iov_base + base;
+	vec.iov_len = len - base;
+
+	return kernel_sendmsg(sock, &hdr, &vec, 1, vec.iov_len);
+}
+
+/**
+ * xs_sendpages - write pages directly to a socket
+ * @sock: socket to send on
+ * @addr: UDP only -- address of destination
+ * @addrlen: UDP only -- length of destination address
+ * @xdr: buffer containing this request
+ * @base: starting position in the buffer
+ *
+ * Sends the head kvec, then the page data, then the tail kvec of @xdr,
+ * skipping the first @base bytes. Sending stops at the first send call
+ * that does not transmit everything it was given.
+ *
+ * Returns -ENOTCONN if @sock is NULL. Otherwise returns the running total
+ * of bytes sent; if nothing had been sent yet when a send call failed,
+ * that call's result is returned instead.
+ */
+static inline int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base)
+{
+ struct page **ppage = xdr->pages;
+ unsigned int len, pglen = xdr->page_len;
+ int err, ret = 0;
+ ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int);
+
+ if (unlikely(!sock))
+ return -ENOTCONN;
+
+ clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+
+ /*
+  * Head: send whatever part of it lies beyond base. When an address is
+  * given and base is 0 the head send is issued even if the head is empty.
+  */
+ len = xdr->head[0].iov_len;
+ if (base < len || (addr != NULL && base == 0)) {
+ err = xs_send_head(sock, addr, addrlen, xdr, base, len);
+ if (ret == 0)
+ ret = err;
+ else if (err > 0)
+ ret += err;
+ /* Short send or error: report what we have so far */
+ if (err != (len - base))
+ goto out;
+ base = 0;
+ } else
+ base -= len;
+
+ /* From here on base is relative to the start of the page data */
+ if (unlikely(pglen == 0))
+ goto copy_tail;
+ if (unlikely(base >= pglen)) {
+ base -= pglen;
+ goto copy_tail;
+ }
+ /* Turn base into a starting page plus an offset within that page */
+ if (base || xdr->page_base) {
+ pglen -= base;
+ base += xdr->page_base;
+ ppage += base >> PAGE_CACHE_SHIFT;
+ base &= ~PAGE_CACHE_MASK;
+ }
+
+ sendpage = sock->ops->sendpage ? : sock_no_sendpage;
+ do {
+ int flags = XS_SENDMSG_FLAGS;
+
+ /* Send at most up to the end of the current page */
+ len = PAGE_CACHE_SIZE;
+ if (base)
+ len -= base;
+ if (pglen < len)
+ len = pglen;
+
+ /* More data follows unless this is the last chunk and the tail is empty */
+ if (pglen != len || xdr->tail[0].iov_len != 0)
+ flags |= MSG_MORE;
+
+ /* Hmm... We might be dealing with highmem pages */
+ if (PageHighMem(*ppage))
+ sendpage = sock_no_sendpage;
+ err = sendpage(sock, *ppage, base, len, flags);
+ if (ret == 0)
+ ret = err;
+ else if (err > 0)
+ ret += err;
+ if (err != len)
+ goto out;
+ base = 0;
+ ppage++;
+ } while ((pglen -= len) != 0);
+copy_tail:
+ /* Tail: base is now relative to the start of the tail kvec */
+ len = xdr->tail[0].iov_len;
+ if (base < len) {
+ err = xs_send_tail(sock, xdr, base, len);
+ if (ret == 0)
+ ret = err;
+ else if (err > 0)
+ ret += err;
+ }
+out:
+ return ret;
+}
+
+/**
+ * xs_nospace - place task on wait queue if transmit was incomplete
+ * @task: task to put to sleep
+ *
+ * If the socket has SOCK_ASYNC_NOSPACE set, the task either gets
+ * -ENOTCONN (transport no longer connected) or, when SOCK_NOSPACE is
+ * also set, is queued via xprt_wait_for_buffer_space(). Otherwise the
+ * task is simply delayed for HZ>>4 jiffies.
+ */
+static void xs_nospace(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ dprintk("RPC: %4d xmit incomplete (%u left of %u)\n",
+ task->tk_pid, req->rq_slen - req->rq_bytes_sent,
+ req->rq_slen);
+
+ if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) {
+ /* Protect against races with write_space */
+ spin_lock_bh(&xprt->transport_lock);
+
+ /* Don't race with disconnect */
+ if (!xprt_connected(xprt))
+ task->tk_status = -ENOTCONN;
+ else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags))
+ xprt_wait_for_buffer_space(task);
+
+ spin_unlock_bh(&xprt->transport_lock);
+ } else
+ /* Keep holding the socket if it is blocked */
+ rpc_delay(task, HZ>>4);
+}
+
+/**
+ * xs_udp_send_request - write an RPC request to a UDP socket
+ * @task: address of RPC task that manages the state of an RPC request
+ *
+ * Return values:
+ * 0: The request has been sent
+ * -EAGAIN: The socket was blocked, please call again later to
+ * complete the request
+ * -ENOTCONN: Caller needs to invoke connect logic then call again
+ * other: Some other error occurred, the request was not sent
+ */
+static int xs_udp_send_request(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+ struct xdr_buf *xdr = &req->rq_snd_buf;
+ int status;
+
+ xs_pktdump("packet data:",
+ req->rq_svec->iov_base,
+ req->rq_svec->iov_len);
+
+ /* Record the transmit time, then hand the buffer and peer address to the socket */
+ req->rq_xtime = jiffies;
+ status = xs_sendpages(xprt->sock, (struct sockaddr *) &xprt->addr,
+ sizeof(xprt->addr), xdr, req->rq_bytes_sent);
+
+ dprintk("RPC: xs_udp_send_request(%u) = %d\n",
+ xdr->len - req->rq_bytes_sent, status);
+
+ /* The whole request went out in this call */
+ if (likely(status >= (int) req->rq_slen))
+ return 0;
+
+ /* Still some bytes left; set up for a retry later. */
+ if (status > 0)
+ status = -EAGAIN;
+
+ switch (status) {
+ case -ENETUNREACH:
+ case -EPIPE:
+ case -ECONNREFUSED:
+ /* When the server has died, an ICMP port unreachable message
+ * prompts ECONNREFUSED. */
+ break;
+ case -EAGAIN:
+ xs_nospace(task);
+ break;
+ default:
+ dprintk("RPC: sendmsg returned unrecognized error %d\n",
+ -status);
+ break;
+ }
+
+ return status;
+}
+
+/*
+ * Write the record marker into the first word of the head kvec: the length
+ * of everything after the marker, flagged as the last fragment of the
+ * record, in network byte order.
+ */
+static inline void xs_encode_tcp_record_marker(struct xdr_buf *buf)
+{
+	rpc_fraghdr *marker = buf->head[0].iov_base;
+	u32 payload_len = buf->len - sizeof(rpc_fraghdr);
+
+	*marker = htonl(RPC_LAST_STREAM_FRAGMENT | payload_len);
+}
+
+/**
+ * xs_tcp_send_request - write an RPC request to a TCP socket
+ * @task: address of RPC task that manages the state of an RPC request
+ *
+ * Return values:
+ * 0: The request has been sent
+ * -EAGAIN: The socket was blocked, please call again later to
+ * complete the request
+ * -ENOTCONN: Caller needs to invoke connect logic then call again
+ * other: Some other error occurred, the request was not sent
+ *
+ * XXX: In the case of soft timeouts, should we eventually give up
+ * if sendmsg is not able to make progress?
+ */
+static int xs_tcp_send_request(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+ struct xdr_buf *xdr = &req->rq_snd_buf;
+ int status, retry = 0;
+
+ xs_encode_tcp_record_marker(&req->rq_snd_buf);
+
+ xs_pktdump("packet data:",
+ req->rq_svec->iov_base,
+ req->rq_svec->iov_len);
+
+ /* Continue transmitting the packet/record. We must be careful
+ * to cope with writespace callbacks arriving _after_ we have
+ * called sendmsg(). */
+ while (1) {
+ req->rq_xtime = jiffies;
+ status = xs_sendpages(xprt->sock, NULL, 0, xdr,
+ req->rq_bytes_sent);
+
+ dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
+ xdr->len - req->rq_bytes_sent, status);
+
+ if (unlikely(status < 0))
+ break;
+
+ /* If we've sent the entire packet, immediately
+ * reset the count of bytes sent. */
+ req->rq_bytes_sent += status;
+ if (likely(req->rq_bytes_sent >= req->rq_slen)) {
+ req->rq_bytes_sent = 0;
+ return 0;
+ }
+
+ /* Partial send: try again, but give up after XS_SENDMSG_RETRY is exceeded */
+ status = -EAGAIN;
+ if (retry++ > XS_SENDMSG_RETRY)
+ break;
+ }
+
+ switch (status) {
+ case -EAGAIN:
+ xs_nospace(task);
+ break;
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -ENOTCONN:
+ case -EPIPE:
+ /* All of these are reported to the caller as -ENOTCONN */
+ status = -ENOTCONN;
+ break;
+ default:
+ dprintk("RPC: sendmsg returned unrecognized error %d\n",
+ -status);
+ xprt_disconnect(xprt);
+ break;
+ }
+
+ return status;
+}
+
+/**
+ * xs_close - close a socket
+ * @xprt: transport
+ *
+ * This is used when all requests are complete; ie, no DRC state remains
+ * on the server we want to save.
+ *
+ * Does nothing if the transport has no sock attached. Otherwise detaches
+ * the socket from @xprt, restores the callbacks saved in @xprt and
+ * releases the socket.
+ */
+static void xs_close(struct rpc_xprt *xprt)
+{
+ struct socket *sock = xprt->sock;
+ struct sock *sk = xprt->inet;
+
+ if (!sk)
+ return;
+
+ dprintk("RPC: xs_close xprt %p\n", xprt);
+
+ /* Unhook the transport and restore the saved callbacks under the callback lock */
+ write_lock_bh(&sk->sk_callback_lock);
+ xprt->inet = NULL;
+ xprt->sock = NULL;
+
+ sk->sk_user_data = NULL;
+ sk->sk_data_ready = xprt->old_data_ready;
+ sk->sk_state_change = xprt->old_state_change;
+ sk->sk_write_space = xprt->old_write_space;
+ write_unlock_bh(&sk->sk_callback_lock);
+
+ sk->sk_no_check = 0;
+
+ sock_release(sock);
+}
+
+/**
+ * xs_destroy - prepare to shutdown a transport
+ * @xprt: doomed transport
+ *
+ * Cancels and flushes the connect worker, disconnects and closes the
+ * socket, and frees the slot table. @xprt itself is not freed here.
+ */
+static void xs_destroy(struct rpc_xprt *xprt)
+{
+ dprintk("RPC: xs_destroy xprt %p\n", xprt);
+
+ /* Make sure no connect work is pending or running before tearing down */
+ cancel_delayed_work(&xprt->connect_worker);
+ flush_scheduled_work();
+
+ xprt_disconnect(xprt);
+ xs_close(xprt);
+ kfree(xprt->slot);
+}
+
+/* Fetch the transport that was stored in the sock's sk_user_data field. */
+static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
+{
+	struct rpc_xprt *xprt = (struct rpc_xprt *) sk->sk_user_data;
+
+	return xprt;
+}
+
+/**
+ * xs_udp_data_ready - "data ready" callback for UDP sockets
+ * @sk: socket with data to read
+ * @len: how much data to read
+ *
+ * Takes one datagram off the socket, matches its XID against an
+ * outstanding request, copies it into that request's receive buffer
+ * and completes the request. @len is not used by this function.
+ */
+static void xs_udp_data_ready(struct sock *sk, int len)
+{
+ struct rpc_task *task;
+ struct rpc_xprt *xprt;
+ struct rpc_rqst *rovr;
+ struct sk_buff *skb;
+ int err, repsize, copied;
+ u32 _xid, *xp;
+
+ read_lock(&sk->sk_callback_lock);
+ dprintk("RPC: xs_udp_data_ready...\n");
+ if (!(xprt = xprt_from_sock(sk)))
+ goto out;
+
+ if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL)
+ goto out;
+
+ if (xprt->shutdown)
+ goto dropit;
+
+ /* A reply must hold at least the 4-byte XID after the UDP header */
+ repsize = skb->len - sizeof(struct udphdr);
+ if (repsize < 4) {
+ dprintk("RPC: impossible RPC reply size %d!\n", repsize);
+ goto dropit;
+ }
+
+ /* Copy the XID from the skb... */
+ xp = skb_header_pointer(skb, sizeof(struct udphdr),
+ sizeof(_xid), &_xid);
+ if (xp == NULL)
+ goto dropit;
+
+ /* Look up and lock the request corresponding to the given XID */
+ spin_lock(&xprt->transport_lock);
+ rovr = xprt_lookup_rqst(xprt, *xp);
+ if (!rovr)
+ goto out_unlock;
+ task = rovr->rq_task;
+
+ /* Report no more than the receive buffer can hold */
+ if ((copied = rovr->rq_private_buf.buflen) > repsize)
+ copied = repsize;
+
+ /* Suck it into the iovec, verify checksum if not done by hw. */
+ if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb))
+ goto out_unlock;
+
+ /* Something worked... */
+ dst_confirm(skb->dst);
+
+ xprt_adjust_cwnd(task, copied);
+ xprt_update_rtt(task);
+ xprt_complete_rqst(task, copied);
+
+ out_unlock:
+ spin_unlock(&xprt->transport_lock);
+ dropit:
+ skb_free_datagram(sk, skb);
+ out:
+ read_unlock(&sk->sk_callback_lock);
+}
+
+/*
+ * Copy up to @len bytes from the skb described by @desc into @p, limited
+ * to the bytes remaining in @desc. On success the descriptor is advanced
+ * and the number of bytes copied is returned; if skb_copy_bits() fails,
+ * the descriptor is left untouched and 0 is returned.
+ */
+static inline size_t xs_tcp_copy_data(skb_reader_t *desc, void *p, size_t len)
+{
+ if (len > desc->count)
+ len = desc->count;
+ if (skb_copy_bits(desc->skb, desc->offset, p, len)) {
+ dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n",
+ len, desc->count);
+ return 0;
+ }
+ desc->offset += len;
+ desc->count -= len;
+ dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n",
+ len, desc->count);
+ return len;
+}
+
+/*
+ * Read (the rest of) a record marker from the stream into xprt->tcp_recm.
+ * The marker may arrive in pieces; tcp_offset tracks how much of it has
+ * been read so far. Once complete, the fragment length and the
+ * last-fragment flag are decoded and the state switches away from
+ * XPRT_COPY_RECM.
+ */
+static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc)
+{
+ size_t len, used;
+ char *p;
+
+ p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset;
+ len = sizeof(xprt->tcp_recm) - xprt->tcp_offset;
+ used = xs_tcp_copy_data(desc, p, len);
+ xprt->tcp_offset += used;
+ /* Marker still incomplete: wait for more data */
+ if (used != len)
+ return;
+
+ xprt->tcp_reclen = ntohl(xprt->tcp_recm);
+ if (xprt->tcp_reclen & RPC_LAST_STREAM_FRAGMENT)
+ xprt->tcp_flags |= XPRT_LAST_FRAG;
+ else
+ xprt->tcp_flags &= ~XPRT_LAST_FRAG;
+ xprt->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK;
+
+ /* From here on tcp_offset counts bytes consumed within the fragment */
+ xprt->tcp_flags &= ~XPRT_COPY_RECM;
+ xprt->tcp_offset = 0;
+
+ /* Sanity check of the record length */
+ if (unlikely(xprt->tcp_reclen < 4)) {
+ dprintk("RPC: invalid TCP record fragment length\n");
+ xprt_disconnect(xprt);
+ return;
+ }
+ dprintk("RPC: reading TCP record fragment of length %d\n",
+ xprt->tcp_reclen);
+}
+
+/*
+ * Called after consuming fragment data. Once the current fragment has been
+ * fully consumed, arm reading of the next record marker; if that fragment
+ * was the last one of its record, also reset the state so that the next
+ * thing read after the marker is an XID.
+ */
+static void xs_tcp_check_recm(struct rpc_xprt *xprt)
+{
+	dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n",
+			xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags);
+
+	/* Still inside the current fragment: nothing to do */
+	if (xprt->tcp_offset != xprt->tcp_reclen)
+		return;
+
+	xprt->tcp_offset = 0;
+	xprt->tcp_flags |= XPRT_COPY_RECM;
+
+	if (!(xprt->tcp_flags & XPRT_LAST_FRAG))
+		return;
+
+	/* End of record: the next record starts with a fresh XID */
+	xprt->tcp_copied = 0;
+	xprt->tcp_flags &= ~XPRT_COPY_DATA;
+	xprt->tcp_flags |= XPRT_COPY_XID;
+}
+
+/*
+ * Read (the rest of) the XID of the incoming reply into xprt->tcp_xid.
+ * Once the XID is complete, switch from XPRT_COPY_XID to XPRT_COPY_DATA
+ * and account for the XID bytes in tcp_copied.
+ */
+static inline void xs_tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc)
+{
+ size_t len, used;
+ char *p;
+
+ len = sizeof(xprt->tcp_xid) - xprt->tcp_offset;
+ dprintk("RPC: reading XID (%Zu bytes)\n", len);
+ p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset;
+ used = xs_tcp_copy_data(desc, p, len);
+ xprt->tcp_offset += used;
+ /* XID still incomplete: wait for more data */
+ if (used != len)
+ return;
+ xprt->tcp_flags &= ~XPRT_COPY_XID;
+ xprt->tcp_flags |= XPRT_COPY_DATA;
+ xprt->tcp_copied = 4;
+ dprintk("RPC: reading reply for XID %08x\n",
+ ntohl(xprt->tcp_xid));
+ xs_tcp_check_recm(xprt);
+}
+
+/*
+ * Copy reply data for the XID in xprt->tcp_xid from the skb into the
+ * matching request's receive buffer.
+ *
+ * At most the remainder of the current record fragment is copied. If no
+ * request matches the XID, or the copy comes up short, XPRT_COPY_DATA is
+ * cleared so that the rest of the record is not copied into the request.
+ * The request is completed as soon as XPRT_COPY_DATA is clear.
+ */
+static inline void xs_tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
+{
+	struct rpc_rqst *req;
+	struct xdr_buf *rcvbuf;
+	size_t len;
+	ssize_t r;
+
+	/* Find and lock the request corresponding to this xid */
+	spin_lock(&xprt->transport_lock);
+	req = xprt_lookup_rqst(xprt, xprt->tcp_xid);
+	if (!req) {
+		xprt->tcp_flags &= ~XPRT_COPY_DATA;
+		dprintk("RPC: XID %08x request not found!\n",
+				ntohl(xprt->tcp_xid));
+		spin_unlock(&xprt->transport_lock);
+		return;
+	}
+
+	rcvbuf = &req->rq_private_buf;
+	len = desc->count;
+	if (len > xprt->tcp_reclen - xprt->tcp_offset) {
+		skb_reader_t my_desc;
+
+		/*
+		 * The skb holds more than the rest of this fragment: copy
+		 * through a private descriptor clamped to the fragment end.
+		 */
+		len = xprt->tcp_reclen - xprt->tcp_offset;
+		memcpy(&my_desc, desc, sizeof(my_desc));
+		my_desc.count = len;
+		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
+					  &my_desc, xs_tcp_copy_data);
+		/*
+		 * Only advance the caller's descriptor by bytes that were
+		 * really consumed. A non-positive result must not grow
+		 * desc->count or move desc->offset backwards.
+		 */
+		if (r > 0) {
+			desc->count -= r;
+			desc->offset += r;
+		}
+	} else
+		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
+					  desc, xs_tcp_copy_data);
+
+	if (r > 0) {
+		xprt->tcp_copied += r;
+		xprt->tcp_offset += r;
+	}
+	if (r != len) {
+		/* Error when copying to the receive buffer,
+		 * usually because we weren't able to allocate
+		 * additional buffer pages. All we can do now
+		 * is turn off XPRT_COPY_DATA, so the request
+		 * will not receive any additional updates,
+		 * and time out.
+		 * Any remaining data from this record will
+		 * be discarded.
+		 */
+		xprt->tcp_flags &= ~XPRT_COPY_DATA;
+		dprintk("RPC: XID %08x truncated request\n",
+				ntohl(xprt->tcp_xid));
+		dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
+				xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
+		goto out;
+	}
+
+	dprintk("RPC: XID %08x read %Zd bytes\n",
+			ntohl(xprt->tcp_xid), r);
+	dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
+			xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
+
+	/* Done when the receive buffer is full or the last fragment has ended */
+	if (xprt->tcp_copied == req->rq_private_buf.buflen)
+		xprt->tcp_flags &= ~XPRT_COPY_DATA;
+	else if (xprt->tcp_offset == xprt->tcp_reclen) {
+		if (xprt->tcp_flags & XPRT_LAST_FRAG)
+			xprt->tcp_flags &= ~XPRT_COPY_DATA;
+	}
+
+out:
+	if (!(xprt->tcp_flags & XPRT_COPY_DATA))
+		xprt_complete_rqst(req->rq_task, xprt->tcp_copied);
+	spin_unlock(&xprt->transport_lock);
+	xs_tcp_check_recm(xprt);
+}
+
+/*
+ * Skip over data in the current fragment without copying it anywhere:
+ * consume whatever is left of the fragment, limited to what the skb
+ * descriptor still holds.
+ */
+static inline void xs_tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
+{
+	size_t skip = xprt->tcp_reclen - xprt->tcp_offset;
+
+	if (desc->count < skip)
+		skip = desc->count;
+
+	xprt->tcp_offset += skip;
+	desc->offset += skip;
+	desc->count -= skip;
+
+	dprintk("RPC: discarded %Zu bytes\n", skip);
+	xs_tcp_check_recm(xprt);
+}
+
+/*
+ * Receive actor handed to tcp_read_sock(): feeds @len bytes of @skb,
+ * starting at @offset, through the record-marker / XID / data / discard
+ * state machine driven by xprt->tcp_flags. Returns the number of bytes
+ * consumed from the skb.
+ */
+static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len)
+{
+ struct rpc_xprt *xprt = rd_desc->arg.data;
+ skb_reader_t desc = {
+ .skb = skb,
+ .offset = offset,
+ .count = len,
+ .csum = 0
+ };
+
+ dprintk("RPC: xs_tcp_data_recv started\n");
+ do {
+ /* Read in a new fragment marker if necessary */
+ /* Can we ever really expect to get completely empty fragments? */
+ if (xprt->tcp_flags & XPRT_COPY_RECM) {
+ xs_tcp_read_fraghdr(xprt, &desc);
+ continue;
+ }
+ /* Read in the xid if necessary */
+ if (xprt->tcp_flags & XPRT_COPY_XID) {
+ xs_tcp_read_xid(xprt, &desc);
+ continue;
+ }
+ /* Read in the request data */
+ if (xprt->tcp_flags & XPRT_COPY_DATA) {
+ xs_tcp_read_request(xprt, &desc);
+ continue;
+ }
+ /* Skip over any trailing bytes on short reads */
+ xs_tcp_read_discard(xprt, &desc);
+ } while (desc.count);
+ dprintk("RPC: xs_tcp_data_recv done\n");
+ return len - desc.count;
+}
+
+/**
+ * xs_tcp_data_ready - "data ready" callback for TCP sockets
+ * @sk: socket with data to read
+ * @bytes: how much data to read
+ *
+ * Hands the socket to tcp_read_sock() with xs_tcp_data_recv() as the
+ * receive actor, unless no transport is attached to @sk or the transport
+ * is shutting down. @bytes is not used by this function.
+ */
+static void xs_tcp_data_ready(struct sock *sk, int bytes)
+{
+ struct rpc_xprt *xprt;
+ read_descriptor_t rd_desc;
+
+ read_lock(&sk->sk_callback_lock);
+ dprintk("RPC: xs_tcp_data_ready...\n");
+ if (!(xprt = xprt_from_sock(sk)))
+ goto out;
+ if (xprt->shutdown)
+ goto out;
+
+ /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
+ rd_desc.arg.data = xprt;
+ rd_desc.count = 65536;
+ tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
+out:
+ read_unlock(&sk->sk_callback_lock);
+}
+
+/**
+ * xs_tcp_state_change - callback to handle TCP socket state changes
+ * @sk: socket whose state has changed
+ *
+ * On TCP_ESTABLISHED the transport is marked connected, the record
+ * parsing state is reset and pending tasks are woken. TCP_SYN_SENT and
+ * TCP_SYN_RECV are ignored; any other state disconnects the transport.
+ */
+static void xs_tcp_state_change(struct sock *sk)
+{
+ struct rpc_xprt *xprt;
+
+ read_lock(&sk->sk_callback_lock);
+ if (!(xprt = xprt_from_sock(sk)))
+ goto out;
+ dprintk("RPC: xs_tcp_state_change client %p...\n", xprt);
+ dprintk("RPC: state %x conn %d dead %d zapped %d\n",
+ sk->sk_state, xprt_connected(xprt),
+ sock_flag(sk, SOCK_DEAD),
+ sock_flag(sk, SOCK_ZAPPED));
+
+ switch (sk->sk_state) {
+ case TCP_ESTABLISHED:
+ spin_lock_bh(&xprt->transport_lock);
+ /* Only act on the transition from not-connected to connected */
+ if (!xprt_test_and_set_connected(xprt)) {
+ /* Reset TCP record info */
+ xprt->tcp_offset = 0;
+ xprt->tcp_reclen = 0;
+ xprt->tcp_copied = 0;
+ xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID;
+ xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+ xprt_wake_pending_tasks(xprt, 0);
+ }
+ spin_unlock_bh(&xprt->transport_lock);
+ break;
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ /* Connection attempt still in progress: nothing to do */
+ break;
+ default:
+ xprt_disconnect(xprt);
+ break;
+ }
+ out:
+ read_unlock(&sk->sk_callback_lock);
+}
+
+/**
+ * xs_udp_write_space - callback invoked when socket buffer space
+ * becomes available
+ * @sk: socket whose state has changed
+ *
+ * Called when more output buffer space is available for this socket.
+ * We try not to wake our writers until they can make "significant"
+ * progress, otherwise we'll waste resources thrashing kernel_sendmsg
+ * with a bunch of small requests.
+ *
+ * xprt_write_space() is only called when the socket is writeable, a
+ * transport is attached and SOCK_NOSPACE was set on the socket.
+ */
+static void xs_udp_write_space(struct sock *sk)
+{
+ read_lock(&sk->sk_callback_lock);
+
+ /* from net/core/sock.c:sock_def_write_space */
+ if (sock_writeable(sk)) {
+ struct socket *sock;
+ struct rpc_xprt *xprt;
+
+ if (unlikely(!(sock = sk->sk_socket)))
+ goto out;
+ if (unlikely(!(xprt = xprt_from_sock(sk))))
+ goto out;
+ /* Nobody asked to be told about free space */
+ if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)))
+ goto out;
+
+ xprt_write_space(xprt);
+ }
+
+ out:
+ read_unlock(&sk->sk_callback_lock);
+}
+
+/**
+ * xs_tcp_write_space - callback invoked when socket buffer space
+ * becomes available
+ * @sk: socket whose state has changed
+ *
+ * Called when more output buffer space is available for this socket.
+ * We try not to wake our writers until they can make "significant"
+ * progress, otherwise we'll waste resources thrashing kernel_sendmsg
+ * with a bunch of small requests.
+ *
+ * xprt_write_space() is only called when the free stream space has
+ * reached sk_stream_min_wspace(), a transport is attached and
+ * SOCK_NOSPACE was set on the socket.
+ */
+static void xs_tcp_write_space(struct sock *sk)
+{
+ read_lock(&sk->sk_callback_lock);
+
+ /* from net/core/stream.c:sk_stream_write_space */
+ if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+ struct socket *sock;
+ struct rpc_xprt *xprt;
+
+ if (unlikely(!(sock = sk->sk_socket)))
+ goto out;
+ if (unlikely(!(xprt = xprt_from_sock(sk))))
+ goto out;
+ /* Nobody asked to be told about free space */
+ if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)))
+ goto out;
+
+ xprt_write_space(xprt);
+ }
+
+ out:
+ read_unlock(&sk->sk_callback_lock);
+}
+
+/*
+ * Apply the buffer sizes stored in @xprt to its socket. Each non-zero
+ * size is scaled by twice the number of request slots and locked in via
+ * sk_userlocks. Changing the send buffer also invokes the socket's
+ * write_space callback.
+ */
+static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt)
+{
+ struct sock *sk = xprt->inet;
+
+ if (xprt->rcvsize) {
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2;
+ }
+ if (xprt->sndsize) {
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ sk->sk_sndbuf = xprt->sndsize * xprt->max_reqs * 2;
+ sk->sk_write_space(sk);
+ }
+}
+
+/**
+ * xs_udp_set_buffer_size - set send and receive limits
+ * @xprt: generic transport
+ * @sndsize: requested size of send buffer, in bytes
+ * @rcvsize: requested size of receive buffer, in bytes
+ *
+ * Records the requested sizes in @xprt, adding 1024 bytes to each
+ * non-zero request (zero is stored as zero), then applies them to the
+ * socket.
+ */
+static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize)
+{
+	xprt->sndsize = sndsize ? sndsize + 1024 : 0;
+	xprt->rcvsize = rcvsize ? rcvsize + 1024 : 0;
+
+	xs_udp_do_set_buffer_size(xprt);
+}
+
+/**
+ * xs_udp_timer - called when a retransmit timeout occurs on a UDP transport
+ * @task: task that timed out
+ *
+ * Adjust the congestion window after a retransmit timeout has occurred.
+ * The timeout is reported to xprt_adjust_cwnd() as a result of -ETIMEDOUT.
+ */
+static void xs_udp_timer(struct rpc_task *task)
+{
+ xprt_adjust_cwnd(task, -ETIMEDOUT);
+}
+
+/*
+ * Bind @sock to a local port, starting at xprt->port and walking
+ * downwards, wrapping from xprt_min_resvport back to xprt_max_resvport.
+ * The search continues only while bind fails with -EADDRINUSE and stops
+ * once it has come back around to the starting port.
+ *
+ * Returns 0 and records the bound port in xprt->port on success,
+ * otherwise the error from the last bind attempt.
+ */
+static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock)
+{
+ struct sockaddr_in myaddr = {
+ .sin_family = AF_INET,
+ };
+ int err;
+ unsigned short port = xprt->port;
+
+ do {
+ myaddr.sin_port = htons(port);
+ err = sock->ops->bind(sock, (struct sockaddr *) &myaddr,
+ sizeof(myaddr));
+ if (err == 0) {
+ xprt->port = port;
+ dprintk("RPC: xs_bindresvport bound to port %u\n",
+ port);
+ return 0;
+ }
+ /* Pick the next candidate: one lower, wrapping at the bottom of the range */
+ if (port <= xprt_min_resvport)
+ port = xprt_max_resvport;
+ else
+ port--;
+ } while (err == -EADDRINUSE && port != xprt->port);
+
+ dprintk("RPC: can't bind to reserved port (%d).\n", -err);
+ return err;
+}
+
+/**
+ * xs_udp_connect_worker - set up a UDP socket
+ * @args: RPC transport to connect
+ *
+ * Invoked by a work queue tasklet.
+ *
+ * Closes any existing socket, creates a fresh UDP socket, optionally
+ * binds it to a reserved port and installs the transport's callbacks.
+ * Pending tasks are woken with 0 on success or -EIO on failure, and the
+ * connecting state is cleared in either case.
+ */
+static void xs_udp_connect_worker(void *args)
+{
+ struct rpc_xprt *xprt = (struct rpc_xprt *) args;
+ struct socket *sock = xprt->sock;
+ int err, status = -EIO;
+
+ if (xprt->shutdown || xprt->addr.sin_port == 0)
+ goto out;
+
+ dprintk("RPC: xs_udp_connect_worker for xprt %p\n", xprt);
+
+ /* Start by resetting any existing state */
+ xs_close(xprt);
+
+ if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) {
+ dprintk("RPC: can't create UDP transport socket (%d).\n", -err);
+ goto out;
+ }
+
+ if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) {
+ sock_release(sock);
+ goto out;
+ }
+
+ if (!xprt->inet) {
+ struct sock *sk = sock->sk;
+
+ write_lock_bh(&sk->sk_callback_lock);
+
+ /* Save the socket's own callbacks so xs_close() can restore them */
+ sk->sk_user_data = xprt;
+ xprt->old_data_ready = sk->sk_data_ready;
+ xprt->old_state_change = sk->sk_state_change;
+ xprt->old_write_space = sk->sk_write_space;
+ sk->sk_data_ready = xs_udp_data_ready;
+ sk->sk_write_space = xs_udp_write_space;
+ sk->sk_no_check = UDP_CSUM_NORCV;
+
+ xprt_set_connected(xprt);
+
+ /* Reset to new socket */
+ xprt->sock = sock;
+ xprt->inet = sk;
+
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
+ xs_udp_do_set_buffer_size(xprt);
+ status = 0;
+out:
+ xprt_wake_pending_tasks(xprt, status);
+ xprt_clear_connecting(xprt);
+}
+
+/*
+ * We need to preserve the port number so the reply cache on the server can
+ * find our cached RPC replies when we get around to reconnecting.
+ *
+ * The result of the AF_UNSPEC connect is only logged; it is not reported
+ * to the caller.
+ */
+static void xs_tcp_reuse_connection(struct rpc_xprt *xprt)
+{
+ int result;
+ struct socket *sock = xprt->sock;
+ struct sockaddr any;
+
+ dprintk("RPC: disconnecting xprt %p to reuse port\n", xprt);
+
+ /*
+ * Disconnect the transport socket by doing a connect operation
+ * with AF_UNSPEC. This should return immediately...
+ */
+ memset(&any, 0, sizeof(any));
+ any.sa_family = AF_UNSPEC;
+ result = sock->ops->connect(sock, &any, sizeof(any), 0);
+ if (result)
+ dprintk("RPC: AF_UNSPEC connect return code %d\n",
+ result);
+}
+
+/**
+ * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint
+ * @args: RPC transport to connect
+ *
+ * Invoked by a work queue tasklet.
+ *
+ * Creates (or reuses) the transport socket and starts a non-blocking
+ * connect. When the connect is still in progress (-EINPROGRESS or
+ * -EALREADY), pending tasks are not woken here; in every other case they
+ * are woken with the resulting status.
+ */
+static void xs_tcp_connect_worker(void *args)
+{
+ struct rpc_xprt *xprt = (struct rpc_xprt *)args;
+ struct socket *sock = xprt->sock;
+ int err, status = -EIO;
+
+ if (xprt->shutdown || xprt->addr.sin_port == 0)
+ goto out;
+
+ dprintk("RPC: xs_tcp_connect_worker for xprt %p\n", xprt);
+
+ if (!xprt->sock) {
+ /* start from scratch */
+ if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
+ dprintk("RPC: can't create TCP transport socket (%d).\n", -err);
+ goto out;
+ }
+
+ if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) {
+ sock_release(sock);
+ goto out;
+ }
+ } else
+ /* "close" the socket, preserving the local port */
+ xs_tcp_reuse_connection(xprt);
+
+ if (!xprt->inet) {
+ struct sock *sk = sock->sk;
+
+ write_lock_bh(&sk->sk_callback_lock);
+
+ /* Save the socket's own callbacks so xs_close() can restore them */
+ sk->sk_user_data = xprt;
+ xprt->old_data_ready = sk->sk_data_ready;
+ xprt->old_state_change = sk->sk_state_change;
+ xprt->old_write_space = sk->sk_write_space;
+ sk->sk_data_ready = xs_tcp_data_ready;
+ sk->sk_state_change = xs_tcp_state_change;
+ sk->sk_write_space = xs_tcp_write_space;
+
+ /* socket options */
+ sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+ sock_reset_flag(sk, SOCK_LINGER);
+ tcp_sk(sk)->linger2 = 0;
+ tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
+
+ xprt_clear_connected(xprt);
+
+ /* Reset to new socket */
+ xprt->sock = sock;
+ xprt->inet = sk;
+
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
+
+ /* Tell the socket layer to start connecting... */
+ status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr,
+ sizeof(xprt->addr), O_NONBLOCK);
+ dprintk("RPC: %p connect status %d connected %d sock state %d\n",
+ xprt, -status, xprt_connected(xprt), sock->sk->sk_state);
+ if (status < 0) {
+ switch (status) {
+ case -EINPROGRESS:
+ case -EALREADY:
+ /* Connect is under way: leave pending tasks asleep */
+ goto out_clear;
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ /* retry with existing socket, after a delay */
+ break;
+ default:
+ /* get rid of existing socket, and retry */
+ xs_close(xprt);
+ break;
+ }
+ }
+out:
+ xprt_wake_pending_tasks(xprt, status);
+out_clear:
+ xprt_clear_connecting(xprt);
+}
+
+/**
+ * xs_connect - connect a socket to a remote endpoint
+ * @task: address of RPC task that manages state of connect request
+ *
+ * TCP: If the remote end dropped the connection, delay reconnecting.
+ *
+ * UDP socket connects are synchronous, but we use a work queue anyway
+ * to guarantee that even unprivileged user processes can set up a
+ * socket on a privileged port.
+ *
+ * If a UDP socket connect fails, the delay behavior here prevents
+ * retry floods (hard mounts).
+ *
+ * Does nothing if a connect attempt is already in progress.
+ */
+static void xs_connect(struct rpc_task *task)
+{
+ struct rpc_xprt *xprt = task->tk_xprt;
+
+ if (xprt_test_and_set_connecting(xprt))
+ return;
+
+ if (xprt->sock != NULL) {
+ /* A socket already exists: reconnect after a delay, and back off */
+ dprintk("RPC: xs_connect delayed xprt %p for %lu seconds\n",
+ xprt, xprt->reestablish_timeout / HZ);
+ schedule_delayed_work(&xprt->connect_worker,
+ xprt->reestablish_timeout);
+ /* Double the delay for next time, capped at XS_TCP_MAX_REEST_TO */
+ xprt->reestablish_timeout <<= 1;
+ if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO)
+ xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
+ } else {
+ dprintk("RPC: xs_connect scheduled xprt %p\n", xprt);
+ schedule_work(&xprt->connect_worker);
+
+ /* flush_scheduled_work can sleep... */
+ if (!RPC_IS_ASYNC(task))
+ flush_scheduled_work();
+ }
+}
+
+/*
+ * Transport operations for UDP sockets. Installed as xprt->ops by
+ * xs_setup_udp().
+ */
+static struct rpc_xprt_ops xs_udp_ops = {
+ .set_buffer_size = xs_udp_set_buffer_size,
+ .reserve_xprt = xprt_reserve_xprt_cong,
+ .release_xprt = xprt_release_xprt_cong,
+ .connect = xs_connect,
+ .send_request = xs_udp_send_request,
+ .set_retrans_timeout = xprt_set_retrans_timeout_rtt,
+ .timer = xs_udp_timer,
+ .release_request = xprt_release_rqst_cong,
+ .close = xs_close,
+ .destroy = xs_destroy,
+};
+
+/*
+ * Transport operations for TCP sockets. Installed as xprt->ops by
+ * xs_setup_tcp(). Unlike the UDP table, no set_buffer_size, timer or
+ * release_request methods are provided.
+ */
+static struct rpc_xprt_ops xs_tcp_ops = {
+ .reserve_xprt = xprt_reserve_xprt,
+ .release_xprt = xprt_release_xprt,
+ .connect = xs_connect,
+ .send_request = xs_tcp_send_request,
+ .set_retrans_timeout = xprt_set_retrans_timeout_def,
+ .close = xs_close,
+ .destroy = xs_destroy,
+};
+
+/**
+ * xs_setup_udp - Set up transport to use a UDP socket
+ * @xprt: transport to set up
+ * @to: timeout parameters
+ *
+ * Allocates a zeroed slot table of xprt_udp_slot_table_entries slots and
+ * fills in the UDP-specific fields of @xprt. If @to is NULL a default
+ * timeout is set via xprt_set_timeout().
+ *
+ * Returns 0 on success, or -ENOMEM if the slot table cannot be allocated.
+ */
+int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to)
+{
+ size_t slot_table_size;
+
+ dprintk("RPC: setting up udp-ipv4 transport...\n");
+
+ xprt->max_reqs = xprt_udp_slot_table_entries;
+ slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]);
+ xprt->slot = kmalloc(slot_table_size, GFP_KERNEL);
+ if (xprt->slot == NULL)
+ return -ENOMEM;
+ memset(xprt->slot, 0, slot_table_size);
+
+ xprt->prot = IPPROTO_UDP;
+ /* xs_bindresvport() starts its port search from here */
+ xprt->port = xprt_max_resvport;
+ xprt->tsh_size = 0;
+ /* Only bind to a reserved port if the caller has CAP_NET_BIND_SERVICE */
+ xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0;
+ /* XXX: header size can vary due to auth type, IPv6, etc. */
+ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
+
+ INIT_WORK(&xprt->connect_worker, xs_udp_connect_worker, xprt);
+ xprt->bind_timeout = XS_BIND_TO;
+ xprt->connect_timeout = XS_UDP_CONN_TO;
+ xprt->reestablish_timeout = XS_UDP_REEST_TO;
+ xprt->idle_timeout = XS_IDLE_DISC_TO;
+
+ xprt->ops = &xs_udp_ops;
+
+ if (to)
+ xprt->timeout = *to;
+ else
+ xprt_set_timeout(&xprt->timeout, 5, 5 * HZ);
+
+ return 0;
+}
+
+/**
+ * xs_setup_tcp - Set up transport to use a TCP socket
+ * @xprt: transport to set up
+ * @to: timeout parameters
+ *
+ * Allocates a zeroed slot table of xprt_tcp_slot_table_entries slots and
+ * fills in the TCP-specific fields of @xprt. If @to is NULL a default
+ * timeout is set via xprt_set_timeout().
+ *
+ * Returns 0 on success, or -ENOMEM if the slot table cannot be allocated.
+ */
+int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to)
+{
+ size_t slot_table_size;
+
+ dprintk("RPC: setting up tcp-ipv4 transport...\n");
+
+ xprt->max_reqs = xprt_tcp_slot_table_entries;
+ slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]);
+ xprt->slot = kmalloc(slot_table_size, GFP_KERNEL);
+ if (xprt->slot == NULL)
+ return -ENOMEM;
+ memset(xprt->slot, 0, slot_table_size);
+
+ xprt->prot = IPPROTO_TCP;
+ /* xs_bindresvport() starts its port search from here */
+ xprt->port = xprt_max_resvport;
+ /* Size of the record marker, counted in 32-bit words */
+ xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
+ /* Only bind to a reserved port if the caller has CAP_NET_BIND_SERVICE */
+ xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0;
+ xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
+
+ INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt);
+ xprt->bind_timeout = XS_BIND_TO;
+ xprt->connect_timeout = XS_TCP_CONN_TO;
+ xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+ xprt->idle_timeout = XS_IDLE_DISC_TO;
+
+ xprt->ops = &xs_tcp_ops;
+
+ if (to)
+ xprt->timeout = *to;
+ else
+ xprt_set_timeout(&xprt->timeout, 2, 60 * HZ);
+
+ return 0;
+}
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index 3f6e31069c5..55538f6b60f 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -16,18 +16,18 @@
#include <linux/mm.h>
#include <linux/sysctl.h>
+#include <net/sock.h>
+
#ifdef CONFIG_INET
-extern struct ctl_table ipv4_table[];
+#include <net/ip.h>
#endif
-extern struct ctl_table core_table[];
-
#ifdef CONFIG_NET
-extern struct ctl_table ether_table[];
+#include <linux/if_ether.h>
#endif
#ifdef CONFIG_TR
-extern struct ctl_table tr_table[];
+#include <linux/if_tr.h>
#endif
struct ctl_table net_table[] = {
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index d403e34088a..41feca3bef8 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -105,7 +105,7 @@
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/sock.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -2026,14 +2026,6 @@ static struct net_proto_family unix_family_ops = {
.owner = THIS_MODULE,
};
-#ifdef CONFIG_SYSCTL
-extern void unix_sysctl_register(void);
-extern void unix_sysctl_unregister(void);
-#else
-static inline void unix_sysctl_register(void) {}
-static inline void unix_sysctl_unregister(void) {}
-#endif
-
static int __init af_unix_init(void)
{
int rc = -1;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 4bd95c8f593..6ffc64e1712 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -76,11 +76,11 @@
#include <linux/netdevice.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
-#include <linux/tcp.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
+#include <net/tcp_states.h>
/* Internal data structures and random procedures: */
@@ -286,16 +286,16 @@ void unix_gc(void)
skb = skb_peek(&s->sk_receive_queue);
while (skb &&
skb != (struct sk_buff *)&s->sk_receive_queue) {
- nextsk=skb->next;
+ nextsk = skb->next;
/*
* Do we have file descriptors ?
*/
- if(UNIXCB(skb).fp)
- {
- __skb_unlink(skb, skb->list);
- __skb_queue_tail(&hitlist,skb);
+ if (UNIXCB(skb).fp) {
+ __skb_unlink(skb,
+ &s->sk_receive_queue);
+ __skb_queue_tail(&hitlist, skb);
}
- skb=nextsk;
+ skb = nextsk;
}
spin_unlock(&s->sk_receive_queue.lock);
}
diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c
index c974dac4580..690ffa5d5bf 100644
--- a/net/unix/sysctl_net_unix.c
+++ b/net/unix/sysctl_net_unix.c
@@ -12,7 +12,7 @@
#include <linux/mm.h>
#include <linux/sysctl.h>
-extern int sysctl_unix_max_dgram_qlen;
+#include <net/af_unix.h>
static ctl_table unix_table[] = {
{
diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c
index d93b19faaab..596cb96e5f4 100644
--- a/net/wanrouter/af_wanpipe.c
+++ b/net/wanrouter/af_wanpipe.c
@@ -57,7 +57,7 @@
#include <linux/wanpipe.h>
#include <linux/if_wanpipe.h>
#include <linux/pkt_sched.h>
-#include <linux/tcp.h>
+#include <linux/tcp_states.h>
#include <linux/if_wanpipe_common.h>
#include <linux/sdla_x25.h>
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 04bec047fa9..020d73cc841 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -47,7 +47,7 @@
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c
index 36fc3bf6d88..adfe7b8df35 100644
--- a/net/x25/x25_dev.c
+++ b/net/x25/x25_dev.c
@@ -81,7 +81,7 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb)
}
int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *ptype)
+ struct packet_type *ptype, struct net_device *orig_dev)
{
struct sk_buff *nskb;
struct x25_neigh *nb;
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
index b0197c70a9f..26146874b83 100644
--- a/net/x25/x25_in.c
+++ b/net/x25/x25_in.c
@@ -28,7 +28,7 @@
#include <linux/string.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/x25.h>
static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more)
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index 7fd872ad0c2..8be9b8fbc24 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -27,7 +27,7 @@
#include <linux/string.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/x25.h>
/*
@@ -80,7 +80,7 @@ void x25_requeue_frames(struct sock *sk)
if (!skb_prev)
skb_queue_head(&sk->sk_write_queue, skb);
else
- skb_append(skb_prev, skb);
+ skb_append(skb_prev, skb, &sk->sk_write_queue);
skb_prev = skb;
}
}
diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c
index d6a21a3ad80..0a92e1da392 100644
--- a/net/x25/x25_timer.c
+++ b/net/x25/x25_timer.c
@@ -23,7 +23,7 @@
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/x25.h>
static void x25_heartbeat_expiry(unsigned long);
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index c58a6f05a0b..2407a707232 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -12,7 +12,7 @@
#include <net/ip.h>
#include <net/xfrm.h>
-static kmem_cache_t *secpath_cachep;
+static kmem_cache_t *secpath_cachep __read_mostly;
void __secpath_destroy(struct sec_path *sp)
{
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d65ed8684fc..0db9e57013f 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -37,7 +37,7 @@ EXPORT_SYMBOL(xfrm_policy_list);
static DEFINE_RWLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
-static kmem_cache_t *xfrm_dst_cache;
+static kmem_cache_t *xfrm_dst_cache __read_mostly;
static struct work_struct xfrm_policy_gc_work;
static struct list_head xfrm_policy_gc_list =
@@ -163,7 +163,7 @@ static void xfrm_policy_timer(unsigned long data)
if (xp->dead)
goto out;
- dir = xp->index & 7;
+ dir = xfrm_policy_id2dir(xp->index);
if (xp->lft.hard_add_expires_seconds) {
long tmo = xp->lft.hard_add_expires_seconds +
@@ -225,7 +225,7 @@ expired:
* SPD calls.
*/
-struct xfrm_policy *xfrm_policy_alloc(int gfp)
+struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp)
{
struct xfrm_policy *policy;
@@ -417,7 +417,7 @@ struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete)
struct xfrm_policy *pol, **p;
write_lock_bh(&xfrm_policy_lock);
- for (p = &xfrm_policy_list[id & 7]; (pol=*p)!=NULL; p = &pol->next) {
+ for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
if (pol->index == id) {
xfrm_pol_hold(pol);
if (delete)
@@ -765,8 +765,8 @@ restart:
switch (policy->action) {
case XFRM_POLICY_BLOCK:
/* Prohibit the flow */
- xfrm_pol_put(policy);
- return -EPERM;
+ err = -EPERM;
+ goto error;
case XFRM_POLICY_ALLOW:
if (policy->xfrm_nr == 0) {
@@ -782,8 +782,8 @@ restart:
*/
dst = xfrm_find_bundle(fl, policy, family);
if (IS_ERR(dst)) {
- xfrm_pol_put(policy);
- return PTR_ERR(dst);
+ err = PTR_ERR(dst);
+ goto error;
}
if (dst)
@@ -1192,46 +1192,6 @@ int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family)
EXPORT_SYMBOL(xfrm_bundle_ok);
-/* Well... that's _TASK_. We need to scan through transformation
- * list and figure out what mss tcp should generate in order to
- * final datagram fit to mtu. Mama mia... :-)
- *
- * Apparently, some easy way exists, but we used to choose the most
- * bizarre ones. :-) So, raising Kalashnikov... tra-ta-ta.
- *
- * Consider this function as something like dark humour. :-)
- */
-static int xfrm_get_mss(struct dst_entry *dst, u32 mtu)
-{
- int res = mtu - dst->header_len;
-
- for (;;) {
- struct dst_entry *d = dst;
- int m = res;
-
- do {
- struct xfrm_state *x = d->xfrm;
- if (x) {
- spin_lock_bh(&x->lock);
- if (x->km.state == XFRM_STATE_VALID &&
- x->type && x->type->get_max_size)
- m = x->type->get_max_size(d->xfrm, m);
- else
- m += x->props.header_len;
- spin_unlock_bh(&x->lock);
- }
- } while ((d = d->child) != NULL);
-
- if (m <= mtu)
- break;
- res -= (m - mtu);
- if (res < 88)
- return mtu;
- }
-
- return res + dst->header_len;
-}
-
int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
{
int err = 0;
@@ -1252,8 +1212,6 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
dst_ops->negative_advice = xfrm_negative_advice;
if (likely(dst_ops->link_failure == NULL))
dst_ops->link_failure = xfrm_link_failure;
- if (likely(dst_ops->get_mss == NULL))
- dst_ops->get_mss = xfrm_get_mss;
if (likely(afinfo->garbage_collect == NULL))
afinfo->garbage_collect = __xfrm_garbage_collect;
xfrm_policy_afinfo[afinfo->family] = afinfo;
@@ -1281,7 +1239,6 @@ int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
dst_ops->check = NULL;
dst_ops->negative_advice = NULL;
dst_ops->link_failure = NULL;
- dst_ops->get_mss = NULL;
afinfo->garbage_collect = NULL;
}
}
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 9d206c282cf..8b9a4747417 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1026,6 +1026,12 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x)
}
EXPORT_SYMBOL(xfrm_state_delete_tunnel);
+/*
+ * This function is NOT optimal. For example, with ESP it will give an
+ * MTU that's usually two bytes short of being optimal. However, it will
+ * usually give an answer that's a multiple of 4 provided the input is
+ * also a multiple of 4.
+ */
int xfrm_state_mtu(struct xfrm_state *x, int mtu)
{
int res = mtu;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 8da3e25b2c4..c35336a0f71 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1125,9 +1125,8 @@ static int xfrm_exp_state_notify(struct xfrm_state *x, struct km_event *c)
if (build_expire(skb, x, c->data.hard) < 0)
BUG();
- NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
-
- return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = XFRMNLGRP_EXPIRE;
+ return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC);
}
static int xfrm_notify_sa_flush(struct km_event *c)
@@ -1152,7 +1151,8 @@ static int xfrm_notify_sa_flush(struct km_event *c)
nlh->nlmsg_len = skb->tail - b;
- return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = XFRMNLGRP_SA;
+ return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC);
nlmsg_failure:
kfree_skb(skb);
@@ -1226,7 +1226,8 @@ static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c)
nlh->nlmsg_len = skb->tail - b;
- return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = XFRMNLGRP_SA;
+ return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC);
nlmsg_failure:
rtattr_failure:
@@ -1304,9 +1305,8 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
if (build_acquire(skb, x, xt, xp, dir) < 0)
BUG();
- NETLINK_CB(skb).dst_groups = XFRMGRP_ACQUIRE;
-
- return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_ACQUIRE, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = XFRMNLGRP_ACQUIRE;
+ return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_ACQUIRE, GFP_ATOMIC);
}
/* User gives us xfrm_user_policy_info followed by an array of 0
@@ -1405,9 +1405,8 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve
if (build_polexpire(skb, xp, dir, c->data.hard) < 0)
BUG();
- NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
-
- return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = XFRMNLGRP_EXPIRE;
+ return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC);
}
static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c)
@@ -1455,7 +1454,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *
nlh->nlmsg_len = skb->tail - b;
- return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = XFRMNLGRP_POLICY;
+ return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC);
nlmsg_failure:
rtattr_failure:
@@ -1480,7 +1480,8 @@ static int xfrm_notify_policy_flush(struct km_event *c)
nlh->nlmsg_len = skb->tail - b;
- return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = XFRMNLGRP_POLICY;
+ return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC);
nlmsg_failure:
kfree_skb(skb);
@@ -1519,7 +1520,8 @@ static int __init xfrm_user_init(void)
{
printk(KERN_INFO "Initializing IPsec netlink socket\n");
- xfrm_nl = netlink_kernel_create(NETLINK_XFRM, xfrm_netlink_rcv);
+ xfrm_nl = netlink_kernel_create(NETLINK_XFRM, XFRMNLGRP_MAX,
+ xfrm_netlink_rcv, THIS_MODULE);
if (xfrm_nl == NULL)
return -ENOMEM;
@@ -1537,3 +1539,4 @@ static void __exit xfrm_user_exit(void)
module_init(xfrm_user_init);
module_exit(xfrm_user_exit);
MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM);