Linux Networking and Network Devices APIs

Linux Networking

Networking Base Types

enum sock_type

Socket types

Constants

SOCK_STREAM

stream (connection) socket

SOCK_DGRAM

datagram (connectionless) socket

SOCK_RAW

raw socket

SOCK_RDM

reliably-delivered message

SOCK_SEQPACKET

sequential packet socket

SOCK_DCCP

Datagram Congestion Control Protocol socket

SOCK_PACKET

Linux-specific way of getting packets at the device level. Intended for writing RARP and other similar things at the user level.

Description

When adding a new socket type, please grep for ARCH_HAS_SOCKET_TYPE in include/asm-*/socket.h; at least MIPS overrides this enum for binary compatibility reasons.

enum sock_shutdown_cmd

Shutdown types

Constants

SHUT_RD

shutdown receptions

SHUT_WR

shutdown transmissions

SHUT_RDWR

shutdown receptions/transmissions

struct socket

general BSD socket

Definition

struct socket {
  socket_state state;
  short type;
  unsigned long           flags;
  struct file             *file;
  struct sock             *sk;
  const struct proto_ops  *ops;
  struct socket_wq        wq;
};

Members

state

socket state (SS_CONNECTED, etc)

type

socket type (SOCK_STREAM, etc)

flags

socket flags (SOCK_NOSPACE, etc)

file

File back pointer for gc

sk

internal networking protocol agnostic socket representation

ops

protocol specific socket operations

wq

wait queue for several uses

Socket Buffer Functions

unsigned int skb_frag_size(const skb_frag_t *frag)

Returns the size of a skb fragment

Parameters

const skb_frag_t *frag

skb fragment

void skb_frag_size_set(skb_frag_t *frag, unsigned int size)

Sets the size of a skb fragment

Parameters

skb_frag_t *frag

skb fragment

unsigned int size

size of fragment

void skb_frag_size_add(skb_frag_t *frag, int delta)

Increments the size of a skb fragment by delta

Parameters

skb_frag_t *frag

skb fragment

int delta

value to add

void skb_frag_size_sub(skb_frag_t *frag, int delta)

Decrements the size of a skb fragment by delta

Parameters

skb_frag_t *frag

skb fragment

int delta

value to subtract

bool skb_frag_must_loop(struct page *p)

Test if p is a high memory page

Parameters

struct page *p

fragment’s page

skb_frag_foreach_page

skb_frag_foreach_page (f, f_off, f_len, p, p_off, p_len, copied)

loop over pages in a fragment

Parameters

f

skb frag to operate on

f_off

offset from start of f->bv_page

f_len

length from f_off to loop over

p

(temp var) current page

p_off

(temp var) offset from start of current page, non-zero only on first page.

p_len

(temp var) length in current page, < PAGE_SIZE only on first and last page.

copied

(temp var) length so far, excluding current p_len.

A fragment can hold a compound page, in which case per-page operations, notably kmap_atomic, must be called for each regular page.

struct skb_shared_hwtstamps

hardware time stamps

Definition

struct skb_shared_hwtstamps {
  ktime_t hwtstamp;
};

Members

hwtstamp

hardware time stamp transformed into duration since arbitrary point in time

Description

Software time stamps generated by ktime_get_real() are stored in skb->tstamp.

hwtstamps can only be compared against other hwtstamps from the same device.

This structure is attached to packets as part of the skb_shared_info. Use skb_hwtstamps() to get a pointer.

struct sk_buff

socket buffer

Definition

struct sk_buff {
  union {
    struct {
      struct sk_buff          *next;
      struct sk_buff          *prev;
      union {
        struct net_device       *dev;
        unsigned long           dev_scratch;
      };
    };
    struct rb_node          rbnode;
    struct list_head        list;
  };
  union {
    struct sock             *sk;
    int ip_defrag_offset;
  };
  union {
    ktime_t tstamp;
    u64 skb_mstamp_ns;
  };
  char cb[48] ;
  union {
    struct {
      unsigned long   _skb_refdst;
      void (*destructor)(struct sk_buff *skb);
    };
    struct list_head        tcp_tsorted_anchor;
#ifdef CONFIG_NET_SOCK_MSG;
    unsigned long           _sk_redir;
#endif;
  };
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE);
  unsigned long            _nfct;
#endif;
  unsigned int            len, data_len;
  __u16 mac_len, hdr_len;
  __u16 queue_mapping;
#ifdef __BIG_ENDIAN_BITFIELD;
#define CLONED_MASK     (1 << 7);
#else;
#define CLONED_MASK     1;
#endif;
#define CLONED_OFFSET()         offsetof(struct sk_buff, __cloned_offset);
  __u8 cloned:1,nohdr:1,fclone:2,peeked:1,head_frag:1,pfmemalloc:1, pp_recycle:1;
#ifdef CONFIG_SKB_EXTENSIONS;
  __u8 active_extensions;
#endif;
#ifdef __BIG_ENDIAN_BITFIELD;
#define PKT_TYPE_MAX    (7 << 5);
#else;
#define PKT_TYPE_MAX    7;
#endif;
#define PKT_TYPE_OFFSET()       offsetof(struct sk_buff, __pkt_type_offset);
  __u8 pkt_type:3;
  __u8 ignore_df:1;
  __u8 nf_trace:1;
  __u8 ip_summed:2;
  __u8 ooo_okay:1;
  __u8 l4_hash:1;
  __u8 sw_hash:1;
  __u8 wifi_acked_valid:1;
  __u8 wifi_acked:1;
  __u8 no_fcs:1;
  __u8 encapsulation:1;
  __u8 encap_hdr_csum:1;
  __u8 csum_valid:1;
#ifdef __BIG_ENDIAN_BITFIELD;
#define PKT_VLAN_PRESENT_BIT    7;
#else;
#define PKT_VLAN_PRESENT_BIT    0;
#endif;
#define PKT_VLAN_PRESENT_OFFSET()       offsetof(struct sk_buff, __pkt_vlan_present_offset);
  __u8 vlan_present:1;
  __u8 csum_complete_sw:1;
  __u8 csum_level:2;
  __u8 csum_not_inet:1;
  __u8 dst_pending_confirm:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE;
  __u8 ndisc_nodetype:2;
#endif;
  __u8 ipvs_property:1;
  __u8 inner_protocol_type:1;
  __u8 remcsum_offload:1;
#ifdef CONFIG_NET_SWITCHDEV;
  __u8 offload_fwd_mark:1;
  __u8 offload_l3_fwd_mark:1;
#endif;
#ifdef CONFIG_NET_CLS_ACT;
  __u8 tc_skip_classify:1;
  __u8 tc_at_ingress:1;
#endif;
  __u8 redirected:1;
#ifdef CONFIG_NET_REDIRECT;
  __u8 from_ingress:1;
#endif;
#ifdef CONFIG_NETFILTER_SKIP_EGRESS;
  __u8 nf_skip_egress:1;
#endif;
#ifdef CONFIG_TLS_DEVICE;
  __u8 decrypted:1;
#endif;
  __u8 slow_gro:1;
#ifdef CONFIG_NET_SCHED;
  __u16 tc_index;
#endif;
  union {
    __wsum csum;
    struct {
      __u16 csum_start;
      __u16 csum_offset;
    };
  };
  __u32 priority;
  int skb_iif;
  __u32 hash;
  __be16 vlan_proto;
  __u16 vlan_tci;
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS);
  union {
    unsigned int    napi_id;
    unsigned int    sender_cpu;
  };
#endif;
#ifdef CONFIG_NETWORK_SECMARK;
  __u32 secmark;
#endif;
  union {
    __u32 mark;
    __u32 reserved_tailroom;
  };
  union {
    __be16 inner_protocol;
    __u8 inner_ipproto;
  };
  __u16 inner_transport_header;
  __u16 inner_network_header;
  __u16 inner_mac_header;
  __be16 protocol;
  __u16 transport_header;
  __u16 network_header;
  __u16 mac_header;
#ifdef CONFIG_KCOV;
  u64 kcov_handle;
#endif;
  sk_buff_data_t tail;
  sk_buff_data_t end;
  unsigned char           *head, *data;
  unsigned int            truesize;
  refcount_t users;
#ifdef CONFIG_SKB_EXTENSIONS;
  struct skb_ext          *extensions;
#endif;
};

Members

{unnamed_union}

anonymous

{unnamed_struct}

anonymous

next

Next buffer in list

prev

Previous buffer in list

{unnamed_union}

anonymous

dev

Device we arrived on/are leaving by

dev_scratch

(aka dev) alternate use of dev when dev would be NULL

rbnode

RB tree node, alternative to next/prev for netem/tcp

list

queue head

{unnamed_union}

anonymous

sk

Socket we are owned by

ip_defrag_offset

(aka sk) alternate use of sk, used in fragmentation management

{unnamed_union}

anonymous

tstamp

Time we arrived/left

skb_mstamp_ns

(aka tstamp) earliest departure time; start point for retransmit timer

cb

Control buffer. Free for use by every layer. Put private vars here

{unnamed_union}

anonymous

{unnamed_struct}

anonymous

_skb_refdst

destination entry (with norefcount bit)

destructor

Destruct function

tcp_tsorted_anchor

list structure for TCP (tp->tsorted_sent_queue)

_sk_redir

socket redirection information for skmsg

_nfct

Associated connection, if any (with nfctinfo bits)

len

Length of actual data

data_len

Data length

mac_len

Length of link layer header

hdr_len

writable header length of cloned skb

queue_mapping

Queue mapping for multiqueue devices

cloned

Head may be cloned (check refcnt to be sure)

nohdr

Payload reference only, must not modify header

fclone

skbuff clone status

peeked

this packet has been seen already, so stats have been done for it, don’t do them again

head_frag

skb was allocated from page fragments, not allocated by kmalloc() or vmalloc().

pfmemalloc

skbuff was allocated from PFMEMALLOC reserves

pp_recycle

mark the packet for recycling instead of freeing (implies page_pool support on driver)

active_extensions

active extensions (skb_ext_id types)

pkt_type

Packet class

ignore_df

allow local fragmentation

nf_trace

netfilter packet trace flag

ip_summed

Driver fed us an IP checksum

ooo_okay

allow the mapping of a socket to a queue to be changed

l4_hash

indicate hash is a canonical 4-tuple hash over transport ports.

sw_hash

indicates hash was computed in software stack

wifi_acked_valid

wifi_acked was set

wifi_acked

whether frame was acked on wifi or not

no_fcs

Request NIC to treat last 4 bytes as Ethernet FCS

encapsulation

indicates the inner headers in the skbuff are valid

encap_hdr_csum

software checksum is needed

csum_valid

checksum is already valid

vlan_present

VLAN tag is present

csum_complete_sw

checksum was completed by software

csum_level

indicates the number of consecutive checksums found in the packet minus one that have been verified as CHECKSUM_UNNECESSARY (max 3)

csum_not_inet

use CRC32c to resolve CHECKSUM_PARTIAL

dst_pending_confirm

need to confirm neighbour

ndisc_nodetype

router type (from link layer)

ipvs_property

skbuff is owned by ipvs

inner_protocol_type

whether the inner protocol is ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO

remcsum_offload

remote checksum offload is enabled

offload_fwd_mark

Packet was L2-forwarded in hardware

offload_l3_fwd_mark

Packet was L3-forwarded in hardware

tc_skip_classify

do not classify packet. set by IFB device

tc_at_ingress

used within tc_classify to distinguish in/egress

redirected

packet was redirected by packet classifier

from_ingress

packet was redirected from the ingress path

nf_skip_egress

packet shall skip nf egress - see netfilter_netdev.h

decrypted

Decrypted SKB

slow_gro

state present at GRO time, slower prepare step required

tc_index

Traffic control index

{unnamed_union}

anonymous

csum

Checksum (must include start/offset pair)

{unnamed_struct}

anonymous

csum_start

Offset from skb->head where checksumming should start

csum_offset

Offset from csum_start where checksum should be stored

priority

Packet queueing priority

skb_iif

ifindex of device we arrived on

hash

the packet hash

vlan_proto

vlan encapsulation protocol

vlan_tci

vlan tag control information

{unnamed_union}

anonymous

napi_id

id of the NAPI struct this skb came from

sender_cpu

(aka napi_id) source CPU in XPS

secmark

security marking

{unnamed_union}

anonymous

mark

Generic packet mark

reserved_tailroom

(aka mark) number of bytes of free space available at the tail of an sk_buff

{unnamed_union}

anonymous

inner_protocol

Protocol (encapsulation)

inner_ipproto

(aka inner_protocol) stores ipproto when skb->inner_protocol_type == ENCAP_TYPE_IPPROTO.

inner_transport_header

Inner transport layer header (encapsulation)

inner_network_header

Inner network layer header (encapsulation)

inner_mac_header

Inner link layer header (encapsulation)

protocol

Packet protocol from driver

transport_header

Transport layer header

network_header

Network layer header

mac_header

Link layer header

kcov_handle

KCOV remote handle for remote coverage collection

tail

Tail pointer

end

End pointer

head

Head of buffer

data

Data head pointer

truesize

Buffer size

users

User count - see {datagram,tcp}.c

extensions

allocated extensions, valid if active_extensions is nonzero

bool skb_pfmemalloc(const struct sk_buff *skb)

Test if the skb was allocated from PFMEMALLOC reserves

Parameters

const struct sk_buff *skb

buffer

struct dst_entry *skb_dst(const struct sk_buff *skb)

returns skb dst_entry

Parameters

const struct sk_buff *skb

buffer

Description

Returns skb dst_entry, regardless of reference taken or not.

void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)

sets skb dst

Parameters

struct sk_buff *skb

buffer

struct dst_entry *dst

dst entry

Description

Sets skb dst, assuming a reference was taken on dst and should be released by skb_dst_drop()

void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)

sets skb dst, hopefully, without taking reference

Parameters

struct sk_buff *skb

buffer

struct dst_entry *dst

dst entry

Description

Sets skb dst, assuming a reference was not taken on dst. If dst entry is cached, we do not take reference and dst_release will be avoided by refdst_drop. If dst entry is not cached, we take reference, so that last dst_release can destroy the dst immediately.

bool skb_dst_is_noref(const struct sk_buff *skb)

Test if skb dst isn’t refcounted

Parameters

const struct sk_buff *skb

buffer

struct rtable *skb_rtable(const struct sk_buff *skb)

Returns the skb rtable

Parameters

const struct sk_buff *skb

buffer

unsigned int skb_napi_id(const struct sk_buff *skb)

Returns the skb’s NAPI id

Parameters

const struct sk_buff *skb

buffer

bool skb_unref(struct sk_buff *skb)

decrement the skb’s reference count

Parameters

struct sk_buff *skb

buffer

Description

Returns true if we can free the skb.

struct sk_buff *alloc_skb(unsigned int size, gfp_t priority)

allocate a network buffer

Parameters

unsigned int size

size to allocate

gfp_t priority

allocation mask

Description

This function is a convenient wrapper around __alloc_skb().

bool skb_fclone_busy(const struct sock *sk, const struct sk_buff *skb)

check if fclone is busy

Parameters

const struct sock *sk

socket

const struct sk_buff *skb

buffer

Description

Returns true if skb is a fast clone, and its clone is not freed. Some drivers call skb_orphan() in their ndo_start_xmit(), so we also check that this didn’t happen.

struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority)

allocate a network buffer from fclone cache

Parameters

unsigned int size

size to allocate

gfp_t priority

allocation mask

Description

This function is a convenient wrapper around __alloc_skb().

int skb_pad(struct sk_buff *skb, int pad)

zero pad the tail of an skb

Parameters

struct sk_buff *skb

buffer to pad

int pad

space to pad

Ensure that a buffer is followed by a padding area that is zero filled. Used by network drivers which may DMA or transfer data beyond the buffer end onto the wire.

May return error in out of memory cases. The skb is freed on error.

int skb_queue_empty(const struct sk_buff_head *list)

check if a queue is empty

Parameters

const struct sk_buff_head *list

queue head

Returns true if the queue is empty, false otherwise.

bool skb_queue_empty_lockless(const struct sk_buff_head *list)

check if a queue is empty

Parameters

const struct sk_buff_head *list

queue head

Returns true if the queue is empty, false otherwise. This variant can be used in lockless contexts.

bool skb_queue_is_last(const struct sk_buff_head *list, const struct sk_buff *skb)

check if skb is the last entry in the queue

Parameters

const struct sk_buff_head *list

queue head

const struct sk_buff *skb

buffer

Returns true if skb is the last buffer on the list.

bool skb_queue_is_first(const struct sk_buff_head *list, const struct sk_buff *skb)

check if skb is the first entry in the queue

Parameters

const struct sk_buff_head *list

queue head

const struct sk_buff *skb

buffer

Returns true if skb is the first buffer on the list.

struct sk_buff *skb_queue_next(const struct sk_buff_head *list, const struct sk_buff *skb)

return the next packet in the queue

Parameters

const struct sk_buff_head *list

queue head

const struct sk_buff *skb

current buffer

Return the next packet in list after skb. It is only valid to call this if skb_queue_is_last() evaluates to false.

struct sk_buff *skb_queue_prev(const struct sk_buff_head *list, const struct sk_buff *skb)

return the prev packet in the queue

Parameters

const struct sk_buff_head *list

queue head

const struct sk_buff *skb

current buffer

Return the prev packet in list before skb. It is only valid to call this if skb_queue_is_first() evaluates to false.

struct sk_buff *skb_get(struct sk_buff *skb)

reference buffer

Parameters

struct sk_buff *skb

buffer to reference

Makes another reference to a socket buffer and returns a pointer to the buffer.

int skb_cloned(const struct sk_buff *skb)

is the buffer a clone

Parameters

const struct sk_buff *skb

buffer to check

Returns true if the buffer was generated with skb_clone() and is one of multiple shared copies of the buffer. Cloned buffers are shared data so must not be written to under normal circumstances.

int skb_header_cloned(const struct sk_buff *skb)

is the header a clone

Parameters

const struct sk_buff *skb

buffer to check

Returns true if modifying the header part of the buffer requires the data to be copied.

void __skb_header_release(struct sk_buff *skb)

release reference to header

Parameters

struct sk_buff *skb

buffer to operate on

int skb_shared(const struct sk_buff *skb)

is the buffer shared

Parameters

const struct sk_buff *skb

buffer to check

Returns true if more than one person has a reference to this buffer.

struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)

check if buffer is shared and if so clone it

Parameters

struct sk_buff *skb

buffer to check

gfp_t pri

priority for memory allocation

If the buffer is shared the buffer is cloned and the old copy drops a reference. A new clone with a single reference is returned. If the buffer is not shared the original buffer is returned. When called from interrupt context or with spinlocks held, pri must be GFP_ATOMIC.

NULL is returned on a memory allocation failure.

struct sk_buff *skb_unshare(struct sk_buff *skb, gfp_t pri)

make a copy of a shared buffer

Parameters

struct sk_buff *skb

buffer to check

gfp_t pri

priority for memory allocation

If the socket buffer is a clone then this function creates a new copy of the data, drops a reference count on the old copy and returns the new copy with the reference count at 1. If the buffer is not a clone the original buffer is returned. When called with a spinlock held or from interrupt state pri must be GFP_ATOMIC

NULL is returned on a memory allocation failure.

struct sk_buff *skb_peek(const struct sk_buff_head *list_)

peek at the head of an sk_buff_head

Parameters

const struct sk_buff_head *list_

list to peek at

Peek an sk_buff. Unlike most other operations you _MUST_ be careful with this one. A peek leaves the buffer on the list and someone else may run off with it. You must hold the appropriate locks or have a private queue to do this.

Returns NULL for an empty list or a pointer to the head element. The reference count is not incremented and the reference is therefore volatile. Use with caution.

struct sk_buff *__skb_peek(const struct sk_buff_head *list_)

peek at the head of a non-empty sk_buff_head

Parameters

const struct sk_buff_head *list_

list to peek at

Like skb_peek(), but the caller knows that the list is not empty.

struct sk_buff *skb_peek_next(struct sk_buff *skb, const struct sk_buff_head *list_)

peek skb following the given one from a queue

Parameters

struct sk_buff *skb

skb to start from

const struct sk_buff_head *list_

list to peek at

Returns NULL when the end of the list is met or a pointer to the next element. The reference count is not incremented and the reference is therefore volatile. Use with caution.

struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_)

peek at the tail of an sk_buff_head

Parameters

const struct sk_buff_head *list_

list to peek at

Peek an sk_buff. Unlike most other operations you _MUST_ be careful with this one. A peek leaves the buffer on the list and someone else may run off with it. You must hold the appropriate locks or have a private queue to do this.

Returns NULL for an empty list or a pointer to the tail element. The reference count is not incremented and the reference is therefore volatile. Use with caution.

__u32 skb_queue_len(const struct sk_buff_head *list_)

get queue length

Parameters

const struct sk_buff_head *list_

list to measure

Return the length of an sk_buff queue.

__u32 skb_queue_len_lockless(const struct sk_buff_head *list_)

get queue length

Parameters

const struct sk_buff_head *list_

list to measure

Return the length of an sk_buff queue. This variant can be used in lockless contexts.

void __skb_queue_head_init(struct sk_buff_head *list)

initialize non-spinlock portions of sk_buff_head

Parameters

struct sk_buff_head *list

queue to initialize

This initializes only the list and queue length aspects of an sk_buff_head object. This allows initializing the list aspects of an sk_buff_head without reinitializing things like the spinlock. It can also be used for on-stack sk_buff_head objects where the spinlock is known to not be used.

void skb_queue_splice(const struct sk_buff_head *list, struct sk_buff_head *head)

join two skb lists, this is designed for stacks

Parameters

const struct sk_buff_head *list

the new list to add

struct sk_buff_head *head

the place to add it in the first list

void skb_queue_splice_init(struct sk_buff_head *list, struct sk_buff_head *head)

join two skb lists and reinitialise the emptied list

Parameters

struct sk_buff_head *list

the new list to add

struct sk_buff_head *head

the place to add it in the first list

The list at list is reinitialised

void skb_queue_splice_tail(const struct sk_buff_head *list, struct sk_buff_head *head)

join two skb lists, each list being a queue

Parameters

const struct sk_buff_head *list

the new list to add

struct sk_buff_head *head

the place to add it in the first list

void skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head)

join two skb lists and reinitialise the emptied list

Parameters

struct sk_buff_head *list

the new list to add

struct sk_buff_head *head

the place to add it in the first list

Each of the lists is a queue. The list at list is reinitialised

void __skb_queue_after(struct sk_buff_head *list, struct sk_buff *prev, struct sk_buff *newsk)

queue a buffer after a given buffer in the list

Parameters

struct sk_buff_head *list

list to use

struct sk_buff *prev

place after this buffer

struct sk_buff *newsk

buffer to queue

Queue a buffer in the middle of a list. This function takes no locks and you must therefore hold required locks before calling it.

A buffer cannot be placed on two lists at the same time.

void __skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)

queue a buffer at the list head

Parameters

struct sk_buff_head *list

list to use

struct sk_buff *newsk

buffer to queue

Queue a buffer at the start of a list. This function takes no locks and you must therefore hold required locks before calling it.

A buffer cannot be placed on two lists at the same time.

void __skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)

queue a buffer at the list tail

Parameters

struct sk_buff_head *list

list to use

struct sk_buff *newsk

buffer to queue

Queue a buffer at the end of a list. This function takes no locks and you must therefore hold required locks before calling it.

A buffer cannot be placed on two lists at the same time.

struct sk_buff *__skb_dequeue(struct sk_buff_head *list)

remove from the head of the queue

Parameters

struct sk_buff_head *list

list to dequeue from

Remove the head of the list. This function does not take any locks so must be used with appropriate locks held only. The head item is returned or NULL if the list is empty.

struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list)

remove from the tail of the queue

Parameters

struct sk_buff_head *list

list to dequeue from

Remove the tail of the list. This function does not take any locks so must be used with appropriate locks held only. The tail item is returned or NULL if the list is empty.

void __skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size)

initialise a paged fragment in an skb

Parameters

struct sk_buff *skb

buffer containing fragment to be initialised

int i

paged fragment index to initialise

struct page *page

the page to use for this fragment

int off

the offset to the data within page

int size

the length of the data

Description

Initialises the i’th fragment of skb to point to size bytes at offset off within page.

Does not take any additional reference on the fragment.

void skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size)

initialise a paged fragment in an skb

Parameters

struct sk_buff *skb

buffer containing fragment to be initialised

int i

paged fragment index to initialise

struct page *page

the page to use for this fragment

int off

the offset to the data within page

int size

the length of the data

Description

As per __skb_fill_page_desc() – initialises the i’th fragment of skb to point to size bytes at offset off within page. In addition updates skb such that i is the last fragment.

Does not take any additional reference on the fragment.

unsigned int skb_headroom(const struct sk_buff *skb)

bytes at buffer head

Parameters

const struct sk_buff *skb

buffer to check

Return the number of bytes of free space at the head of an sk_buff.

int skb_tailroom(const struct sk_buff *skb)

bytes at buffer end

Parameters

const struct sk_buff *skb

buffer to check

Return the number of bytes of free space at the tail of an sk_buff

int skb_availroom(const struct sk_buff *skb)

bytes at buffer end

Parameters

const struct sk_buff *skb

buffer to check

Return the number of bytes of free space at the tail of an sk_buff allocated by sk_stream_alloc()

void skb_reserve(struct sk_buff *skb, int len)

adjust headroom

Parameters

struct sk_buff *skb

buffer to alter

int len

bytes to move

Increase the headroom of an empty sk_buff by reducing the tail room. This is only allowed for an empty buffer.

void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu, unsigned int needed_tailroom)

adjust reserved_tailroom

Parameters

struct sk_buff *skb

buffer to alter

unsigned int mtu

maximum amount of headlen permitted

unsigned int needed_tailroom

minimum amount of reserved_tailroom

Set reserved_tailroom so that headlen can be as large as possible but not larger than mtu and tailroom cannot be smaller than needed_tailroom. The required headroom should already have been reserved before using this function.

void pskb_trim_unique(struct sk_buff *skb, unsigned int len)

remove end from a paged unique (not cloned) buffer

Parameters

struct sk_buff *skb

buffer to alter

unsigned int len

new length

This is identical to pskb_trim except that the caller knows that the skb is not cloned so we should never get an error due to out-of-memory.

void skb_orphan(struct sk_buff *skb)

orphan a buffer

Parameters

struct sk_buff *skb

buffer to orphan

If a buffer currently has an owner then we call the owner’s destructor function and make the skb unowned. The buffer continues to exist but is no longer charged to its former owner.

int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)

orphan the frags contained in a buffer

Parameters

struct sk_buff *skb

buffer to orphan frags from

gfp_t gfp_mask

allocation mask for replacement pages

For each frag in the SKB which needs a destructor (i.e. has an owner) create a copy of that frag and release the original page by calling the destructor.

void __skb_queue_purge(struct sk_buff_head *list)

empty a list

Parameters

struct sk_buff_head *list

list to empty

Delete all buffers on an sk_buff list. Each buffer is removed from the list and one reference dropped. This function does not take the list lock and the caller must hold the relevant locks to use it.

void *netdev_alloc_frag(unsigned int fragsz)

allocate a page fragment

Parameters

unsigned int fragsz

fragment size

Description

Allocates a frag from a page for receive buffer. Uses GFP_ATOMIC allocations.

struct sk_buff *netdev_alloc_skb(struct net_device *dev, unsigned int length)

allocate an skbuff for rx on a specific device

Parameters

struct net_device *dev

network device to receive on

unsigned int length

length to allocate

Allocate a new sk_buff and assign it a usage count of one. The buffer has unspecified headroom built in. Users should allocate the headroom they think they need without accounting for the built in space. The built in space is used for optimisations.

NULL is returned if there is no free memory. Although this function allocates memory it can be called from an interrupt.

struct page *__dev_alloc_pages(gfp_t gfp_mask, unsigned int order)

allocate page for network Rx

Parameters

gfp_t gfp_mask

allocation priority. Set __GFP_NOMEMALLOC if not for network Rx

unsigned int order

size of the allocation

Description

Allocate a new page.

NULL is returned if there is no free memory.

struct page *__dev_alloc_page(gfp_t gfp_mask)

allocate a page for network Rx

Parameters

gfp_t gfp_mask

allocation priority. Set __GFP_NOMEMALLOC if not for network Rx

Description

Allocate a new page.

NULL is returned if there is no free memory.

bool dev_page_is_reusable(const struct page *page)

check whether a page can be reused for network Rx

Parameters

const struct page *page

the page to test

Description

A page shouldn’t be considered for reusing/recycling if it was allocated under memory pressure or at a distant memory node.

Returns false if this page should be returned to page allocator, true otherwise.

void skb_propagate_pfmemalloc(const struct page *page, struct sk_buff *skb)

Propagate pfmemalloc if skb is allocated after RX page

Parameters

const struct page *page

The page that was allocated from skb_alloc_page

struct sk_buff *skb

The skb that may need pfmemalloc set

unsigned int skb_frag_off(const skb_frag_t *frag)

Returns the offset of a skb fragment

Parameters

const skb_frag_t *frag

the paged fragment

void skb_frag_off_add(skb_frag_t *frag, int delta)

Increments the offset of a skb fragment by delta

Parameters

skb_frag_t *frag

skb fragment

int delta

value to add

void skb_frag_off_set(skb_frag_t *frag, unsigned int offset)

Sets the offset of a skb fragment

Parameters

skb_frag_t *frag

skb fragment

unsigned int offset

offset of fragment

void skb_frag_off_copy(skb_frag_t *fragto, const skb_frag_t *fragfrom)

Sets the offset of a skb fragment from another fragment

Parameters

skb_frag_t *fragto

skb fragment where offset is set

const skb_frag_t *fragfrom

skb fragment offset is copied from

struct page *skb_frag_page(const skb_frag_t *frag)

retrieve the page referred to by a paged fragment

Parameters

const skb_frag_t *frag

the paged fragment

Description

Returns the struct page associated with frag.

void __skb_frag_ref(skb_frag_t *frag)

take an additional reference on a paged fragment.

Parameters

skb_frag_t *frag

the paged fragment

Description

Takes an additional reference on the paged fragment frag.

void skb_frag_ref(struct sk_buff *skb, int f)

take an additional reference on a paged fragment of an skb.

Parameters

struct sk_buff *skb

the buffer

int f

the fragment index.

Description

Takes an additional reference on the f’th paged fragment of skb.

void __skb_frag_unref(skb_frag_t *frag, bool recycle)

release a reference on a paged fragment.

Parameters

skb_frag_t *frag

the paged fragment

bool recycle

recycle the page if allocated via page_pool

Description

Releases a reference on the paged fragment frag or recycles the page via the page_pool API.

void skb_frag_unref(struct sk_buff *skb, int f)

release a reference on a paged fragment of an skb.

Parameters

struct sk_buff *skb

the buffer

int f

the index of the fragment within the skb's fragment array

Description

Releases a reference on the f’th paged fragment of skb.

void *skb_frag_address(const skb_frag_t *frag)

gets the address of the data contained in a paged fragment

Parameters

const skb_frag_t *frag

the paged fragment buffer

Description

Returns the address of the data within frag. The page must already be mapped.

void *skb_frag_address_safe(const skb_frag_t *frag)

gets the address of the data contained in a paged fragment

Parameters

const skb_frag_t *frag

the paged fragment buffer

Description

Returns the address of the data within frag. Checks that the page is mapped and returns NULL otherwise.

void skb_frag_page_copy(skb_frag_t *fragto, const skb_frag_t *fragfrom)

sets the page in a fragment from another fragment

Parameters

skb_frag_t *fragto

skb fragment where page is set

const skb_frag_t *fragfrom

skb fragment page is copied from

void __skb_frag_set_page(skb_frag_t *frag, struct page *page)

sets the page contained in a paged fragment

Parameters

skb_frag_t *frag

the paged fragment

struct page *page

the page to set

Description

Sets the fragment frag to contain page.

void skb_frag_set_page(struct sk_buff *skb, int f, struct page *page)

sets the page contained in a paged fragment of an skb

Parameters

struct sk_buff *skb

the buffer

int f

the index of the fragment within the skb's fragment array

struct page *page

the page to set

Description

Sets the f’th fragment of skb to contain page.

dma_addr_t skb_frag_dma_map(struct device *dev, const skb_frag_t *frag, size_t offset, size_t size, enum dma_data_direction dir)

maps a paged fragment via the DMA API

Parameters

struct device *dev

the device to map the fragment to

const skb_frag_t *frag

the paged fragment to map

size_t offset

the offset within the fragment (starting at the fragment’s own offset)

size_t size

the number of bytes to map

enum dma_data_direction dir

the direction of the mapping (DMA_*, e.g. DMA_TO_DEVICE; historically PCI_DMA_*)

Description

Maps the page associated with frag to device.

int skb_clone_writable(const struct sk_buff *skb, unsigned int len)

is the header of a clone writable

Parameters

const struct sk_buff *skb

buffer to check

unsigned int len

length up to which to write

Returns true if modifying the header part of the cloned buffer does not require the data to be copied.

int skb_cow(struct sk_buff *skb, unsigned int headroom)

copy header of skb when it is required

Parameters

struct sk_buff *skb

buffer to cow

unsigned int headroom

needed headroom

If the skb passed lacks sufficient headroom or its data part is shared, data is reallocated. If reallocation fails, an error is returned and original skb is not changed.

The result is skb with writable area skb->head…skb->tail and at least headroom of space at head.

int skb_cow_head(struct sk_buff *skb, unsigned int headroom)

skb_cow but only making the head writable

Parameters

struct sk_buff *skb

buffer to cow

unsigned int headroom

needed headroom

This function is identical to skb_cow except that we replace the skb_cloned check by skb_header_cloned. It should be used when you only need to push on some header and do not need to modify the data.

int skb_padto(struct sk_buff *skb, unsigned int len)

pad an skbuff up to a minimal size

Parameters

struct sk_buff *skb

buffer to pad

unsigned int len

minimal length

Pads up a buffer to ensure the trailing bytes exist and are blanked. If the buffer already contains sufficient data it is untouched. Otherwise it is extended. Returns zero on success. The skb is freed on error.

int __skb_put_padto(struct sk_buff *skb, unsigned int len, bool free_on_error)

increase size and pad an skbuff up to a minimal size

Parameters

struct sk_buff *skb

buffer to pad

unsigned int len

minimal length

bool free_on_error

free buffer on error

Pads up a buffer to ensure the trailing bytes exist and are blanked. If the buffer already contains sufficient data it is untouched. Otherwise it is extended. Returns zero on success. The skb is freed on error if free_on_error is true.

int skb_put_padto(struct sk_buff *skb, unsigned int len)

increase size and pad an skbuff up to a minimal size

Parameters

struct sk_buff *skb

buffer to pad

unsigned int len

minimal length

Pads up a buffer to ensure the trailing bytes exist and are blanked. If the buffer already contains sufficient data it is untouched. Otherwise it is extended. Returns zero on success. The skb is freed on error.

int skb_linearize(struct sk_buff *skb)

convert paged skb to linear one

Parameters

struct sk_buff *skb

buffer to linearize

If there is no free memory -ENOMEM is returned, otherwise zero is returned and the old skb data released.

bool skb_has_shared_frag(const struct sk_buff *skb)

can any frag be overwritten

Parameters

const struct sk_buff *skb

buffer to test

Description

Return true if the skb has at least one frag that might be modified by an external entity (as in vmsplice()/sendfile())

int skb_linearize_cow(struct sk_buff *skb)

make sure skb is linear and writable

Parameters

struct sk_buff *skb

buffer to process

If there is no free memory -ENOMEM is returned, otherwise zero is returned and the old skb data released.

void skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len)

update checksum for received skb after pull

Parameters

struct sk_buff *skb

buffer to update

const void *start

start of data before pull

unsigned int len

length of data pulled

After doing a pull on a received packet, you need to call this to update the CHECKSUM_COMPLETE checksum, or set ip_summed to CHECKSUM_NONE so that it can be recomputed from scratch.

void skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len)

update checksum for received skb after push

Parameters

struct sk_buff *skb

buffer to update

const void *start

start of data after push

unsigned int len

length of data pushed

After doing a push on a received packet, you need to call this to update the CHECKSUM_COMPLETE checksum.

void *skb_push_rcsum(struct sk_buff *skb, unsigned int len)

push skb and update receive checksum

Parameters

struct sk_buff *skb

buffer to update

unsigned int len

length of data pushed

This function performs an skb_push on the packet and updates the CHECKSUM_COMPLETE checksum. It should be used on receive path processing instead of skb_push unless you know that the checksum difference is zero (e.g., a valid IP header) or you are setting ip_summed to CHECKSUM_NONE.

int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)

trim received skb and update checksum

Parameters

struct sk_buff *skb

buffer to trim

unsigned int len

new length

This is exactly the same as pskb_trim except that it ensures the checksum of received packets is still valid after the operation. It can change skb pointers.

bool skb_needs_linearize(struct sk_buff *skb, netdev_features_t features)

check if we need to linearize a given skb depending on the given device features.

Parameters

struct sk_buff *skb

socket buffer to check

netdev_features_t features

net device features

Returns true if either: 1. skb has frag_list and the device doesn’t support FRAGLIST, or 2. skb is fragmented and the device does not support SG.

void skb_get_timestamp(const struct sk_buff *skb, struct __kernel_old_timeval *stamp)

get timestamp from a skb

Parameters

const struct sk_buff *skb

skb to get stamp from

struct __kernel_old_timeval *stamp

pointer to struct __kernel_old_timeval to store stamp in

Timestamps are stored in the skb as offsets to a base timestamp. This function converts the offset back to a struct timeval and stores it in stamp.

void skb_complete_tx_timestamp(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps)

deliver cloned skb with tx timestamps

Parameters

struct sk_buff *skb

clone of the original outgoing packet

struct skb_shared_hwtstamps *hwtstamps

hardware time stamps

Description

PHY drivers may accept clones of transmitted packets for timestamping via their phy_driver.txtstamp method. These drivers must call this function to return the skb back to the stack with a timestamp.

void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps)

queue clone of skb with send time stamps

Parameters

struct sk_buff *orig_skb

the original outgoing packet

struct skb_shared_hwtstamps *hwtstamps

hardware time stamps, may be NULL if not available

Description

If the skb has a socket associated, then this function clones the skb (thus sharing the actual data and optional structures), stores the optional hardware time stamping information (if non NULL) or generates a software time stamp (otherwise), then queues the clone to the error queue of the socket. Errors are silently ignored.

void skb_tx_timestamp(struct sk_buff *skb)

Driver hook for transmit timestamping

Parameters

struct sk_buff *skb

A socket buffer.

Description

Ethernet MAC Drivers should call this function in their hard_xmit() function immediately before giving the sk_buff to the MAC hardware.

Specifically, one should make absolutely sure that this function is called before TX completion of this packet can trigger. Otherwise the packet could potentially already be freed.

void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)

deliver skb with wifi status

Parameters

struct sk_buff *skb

the original outgoing packet

bool acked

ack status

__sum16 skb_checksum_complete(struct sk_buff *skb)

Calculate checksum of an entire packet

Parameters

struct sk_buff *skb

packet to process

This function calculates the checksum over the entire packet plus the value of skb->csum. The latter can be used to supply the checksum of a pseudo header as used by TCP/UDP. It returns the checksum.

For protocols that contain complete checksums such as ICMP/TCP/UDP, this function can be used to verify that checksum on received packets. In that case the function should return zero if the checksum is correct. In particular, this function will return zero if skb->ip_summed is CHECKSUM_UNNECESSARY which indicates that the hardware has already verified the correctness of the checksum.

struct skb_ext

sk_buff extensions

Definition

struct skb_ext {
  refcount_t refcnt;
  u8 offset[SKB_EXT_NUM];
  u8 chunks;
  char data[] ;
};

Members

refcnt

1 on allocation, deallocated on 0

offset

offset to add to data to obtain extension address

chunks

size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units

data

start of extension data, variable sized

Note

Offsets/lengths are stored in chunks of 8 bytes; this allows the use of ‘u8’ types while still supporting up to 2kb worth of extension data.

void skb_checksum_none_assert(const struct sk_buff *skb)

make sure skb ip_summed is CHECKSUM_NONE

Parameters

const struct sk_buff *skb

skb to check

Description

fresh skbs have their ip_summed set to CHECKSUM_NONE. Instead of forcing ip_summed to CHECKSUM_NONE, we can use this helper, to document places where we make this assertion.

bool skb_head_is_locked(const struct sk_buff *skb)

Determine if the skb->head is locked down

Parameters

const struct sk_buff *skb

skb to check

Description

The head on skbs built around a head frag can be removed if they are not cloned. This function returns true if the skb head is locked down due to either being allocated via kmalloc, or by being a clone with multiple references to the head.

struct sock_common

minimal network layer representation of sockets

Definition

struct sock_common {
  union {
    __addrpair skc_addrpair;
    struct {
      __be32 skc_daddr;
      __be32 skc_rcv_saddr;
    };
  };
  union {
    unsigned int    skc_hash;
    __u16 skc_u16hashes[2];
  };
  union {
    __portpair skc_portpair;
    struct {
      __be16 skc_dport;
      __u16 skc_num;
    };
  };
  unsigned short          skc_family;
  volatile unsigned char  skc_state;
  unsigned char           skc_reuse:4;
  unsigned char           skc_reuseport:1;
  unsigned char           skc_ipv6only:1;
  unsigned char           skc_net_refcnt:1;
  int skc_bound_dev_if;
  union {
    struct hlist_node       skc_bind_node;
    struct hlist_node       skc_portaddr_node;
  };
  struct proto            *skc_prot;
  possible_net_t skc_net;
#if IS_ENABLED(CONFIG_IPV6);
  struct in6_addr         skc_v6_daddr;
  struct in6_addr         skc_v6_rcv_saddr;
#endif;
  atomic64_t skc_cookie;
  union {
    unsigned long   skc_flags;
    struct sock     *skc_listener;
    struct inet_timewait_death_row *skc_tw_dr;
  };
  union {
    struct hlist_node       skc_node;
    struct hlist_nulls_node skc_nulls_node;
  };
  unsigned short          skc_tx_queue_mapping;
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING;
  unsigned short          skc_rx_queue_mapping;
#endif;
  union {
    int skc_incoming_cpu;
    u32 skc_rcv_wnd;
    u32 skc_tw_rcv_nxt;
  };
  refcount_t skc_refcnt;
};

Members

{unnamed_union}

anonymous

skc_addrpair

8-byte-aligned __u64 union of skc_daddr & skc_rcv_saddr

{unnamed_struct}

anonymous

skc_daddr

Foreign IPv4 addr

skc_rcv_saddr

Bound local IPv4 addr

{unnamed_union}

anonymous

skc_hash

hash value used with various protocol lookup tables

skc_u16hashes

two u16 hash values used by UDP lookup tables

{unnamed_union}

anonymous

skc_portpair

__u32 union of skc_dport & skc_num

{unnamed_struct}

anonymous

skc_dport

placeholder for inet_dport/tw_dport

skc_num

placeholder for inet_num/tw_num

skc_family

network address family

skc_state

Connection state

skc_reuse

SO_REUSEADDR setting

skc_reuseport

SO_REUSEPORT setting

skc_ipv6only

socket is IPV6 only

skc_net_refcnt

socket is using net ref counting

skc_bound_dev_if

bound device index if != 0

{unnamed_union}

anonymous

skc_bind_node

bind hash linkage for various protocol lookup tables

skc_portaddr_node

second hash linkage for UDP/UDP-Lite protocol

skc_prot

protocol handlers inside a network family

skc_net

reference to the network namespace of this socket

skc_v6_daddr

IPV6 destination address

skc_v6_rcv_saddr

IPV6 source address

skc_cookie

socket’s cookie value

{unnamed_union}

anonymous

skc_flags

place holder for sk_flags SO_LINGER (l_onoff), SO_BROADCAST, SO_KEEPALIVE, SO_OOBINLINE settings, SO_TIMESTAMPING settings

skc_listener

connection request listener socket (aka rsk_listener) [union with skc_flags]

skc_tw_dr

(aka tw_dr) ptr to struct inet_timewait_death_row [union with skc_flags]

{unnamed_union}

anonymous

skc_node

main hash linkage for various protocol lookup tables

skc_nulls_node

main hash linkage for TCP/UDP/UDP-Lite protocol

skc_tx_queue_mapping

tx queue number for this connection

skc_rx_queue_mapping

rx queue number for this connection

{unnamed_union}

anonymous

skc_incoming_cpu

record/match cpu processing incoming packets

skc_rcv_wnd

(aka rsk_rcv_wnd) TCP receive window size (possibly scaled) [union with skc_incoming_cpu]

skc_tw_rcv_nxt

(aka tw_rcv_nxt) TCP window next expected seq number [union with skc_incoming_cpu]

skc_refcnt

reference count

This is the minimal network layer representation of sockets, the header for struct sock and struct inet_timewait_sock.

struct sock

network layer representation of sockets

Definition

struct sock {
  struct sock_common      __sk_common;
#define sk_node                 __sk_common.skc_node;
#define sk_nulls_node           __sk_common.skc_nulls_node;
#define sk_refcnt               __sk_common.skc_refcnt;
#define sk_tx_queue_mapping     __sk_common.skc_tx_queue_mapping;
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING;
#define sk_rx_queue_mapping     __sk_common.skc_rx_queue_mapping;
#endif;
#define sk_dontcopy_begin       __sk_common.skc_dontcopy_begin;
#define sk_dontcopy_end         __sk_common.skc_dontcopy_end;
#define sk_hash                 __sk_common.skc_hash;
#define sk_portpair             __sk_common.skc_portpair;
#define sk_num                  __sk_common.skc_num;
#define sk_dport                __sk_common.skc_dport;
#define sk_addrpair             __sk_common.skc_addrpair;
#define sk_daddr                __sk_common.skc_daddr;
#define sk_rcv_saddr            __sk_common.skc_rcv_saddr;
#define sk_family               __sk_common.skc_family;
#define sk_state                __sk_common.skc_state;
#define sk_reuse                __sk_common.skc_reuse;
#define sk_reuseport            __sk_common.skc_reuseport;
#define sk_ipv6only             __sk_common.skc_ipv6only;
#define sk_net_refcnt           __sk_common.skc_net_refcnt;
#define sk_bound_dev_if         __sk_common.skc_bound_dev_if;
#define sk_bind_node            __sk_common.skc_bind_node;
#define sk_prot                 __sk_common.skc_prot;
#define sk_net                  __sk_common.skc_net;
#define sk_v6_daddr             __sk_common.skc_v6_daddr;
#define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr;
#define sk_cookie               __sk_common.skc_cookie;
#define sk_incoming_cpu         __sk_common.skc_incoming_cpu;
#define sk_flags                __sk_common.skc_flags;
#define sk_rxhash               __sk_common.skc_rxhash;
  socket_lock_t sk_lock;
  atomic_t sk_drops;
  int sk_rcvlowat;
  struct sk_buff_head     sk_error_queue;
  struct sk_buff_head     sk_receive_queue;
  struct {
    atomic_t rmem_alloc;
    int len;
    struct sk_buff  *head;
    struct sk_buff  *tail;
  } sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_alloc;
  int sk_forward_alloc;
  u32 sk_reserved_mem;
#ifdef CONFIG_NET_RX_BUSY_POLL;
  unsigned int            sk_ll_usec;
  unsigned int            sk_napi_id;
#endif;
  int sk_rcvbuf;
  struct sk_filter __rcu  *sk_filter;
  union {
    struct socket_wq __rcu  *sk_wq;
  };
#ifdef CONFIG_XFRM;
  struct xfrm_policy __rcu *sk_policy[2];
#endif;
  struct dst_entry        *sk_rx_dst;
  int sk_rx_dst_ifindex;
  u32 sk_rx_dst_cookie;
  struct dst_entry __rcu  *sk_dst_cache;
  atomic_t sk_omem_alloc;
  int sk_sndbuf;
  int sk_wmem_queued;
  refcount_t sk_wmem_alloc;
  unsigned long           sk_tsq_flags;
  union {
    struct sk_buff  *sk_send_head;
    struct rb_root  tcp_rtx_queue;
  };
  struct sk_buff_head     sk_write_queue;
  __s32 sk_peek_off;
  int sk_write_pending;
  __u32 sk_dst_pending_confirm;
  u32 sk_pacing_status;
  long sk_sndtimeo;
  struct timer_list       sk_timer;
  __u32 sk_priority;
  __u32 sk_mark;
  unsigned long           sk_pacing_rate;
  unsigned long           sk_max_pacing_rate;
  struct page_frag        sk_frag;
  netdev_features_t sk_route_caps;
  netdev_features_t sk_route_nocaps;
  netdev_features_t sk_route_forced_caps;
  int sk_gso_type;
  unsigned int            sk_gso_max_size;
  gfp_t sk_allocation;
  __u32 sk_txhash;
  u8 sk_padding : 1,sk_kern_sock : 1,sk_no_check_tx : 1,sk_no_check_rx : 1, sk_userlocks : 4;
  u8 sk_pacing_shift;
  u16 sk_type;
  u16 sk_protocol;
  u16 sk_gso_max_segs;
  unsigned long           sk_lingertime;
  struct proto            *sk_prot_creator;
  rwlock_t sk_callback_lock;
  int sk_err, sk_err_soft;
  u32 sk_ack_backlog;
  u32 sk_max_ack_backlog;
  kuid_t sk_uid;
#ifdef CONFIG_NET_RX_BUSY_POLL;
  u8 sk_prefer_busy_poll;
  u16 sk_busy_poll_budget;
#endif;
  spinlock_t sk_peer_lock;
  struct pid              *sk_peer_pid;
  const struct cred       *sk_peer_cred;
  long sk_rcvtimeo;
  ktime_t sk_stamp;
#if BITS_PER_LONG==32;
  seqlock_t sk_stamp_seq;
#endif;
  u16 sk_tsflags;
  int sk_bind_phc;
  u8 sk_shutdown;
  u32 sk_tskey;
  atomic_t sk_zckey;
  u8 sk_clockid;
  u8 sk_txtime_deadline_mode : 1,sk_txtime_report_errors : 1, sk_txtime_unused : 6;
  struct socket           *sk_socket;
  void *sk_user_data;
#ifdef CONFIG_SECURITY;
  void *sk_security;
#endif;
  struct sock_cgroup_data sk_cgrp_data;
  struct mem_cgroup       *sk_memcg;
  void (*sk_state_change)(struct sock *sk);
  void (*sk_data_ready)(struct sock *sk);
  void (*sk_write_space)(struct sock *sk);
  void (*sk_error_report)(struct sock *sk);
  int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb);
#ifdef CONFIG_SOCK_VALIDATE_XMIT;
  struct sk_buff*         (*sk_validate_xmit_skb)(struct sock *sk,struct net_device *dev, struct sk_buff *skb);
#endif;
  void (*sk_destruct)(struct sock *sk);
  struct sock_reuseport __rcu     *sk_reuseport_cb;
#ifdef CONFIG_BPF_SYSCALL;
  struct bpf_local_storage __rcu  *sk_bpf_storage;
#endif;
  struct rcu_head         sk_rcu;
};

Members

__sk_common

shared layout with inet_timewait_sock

sk_lock

synchronizer

sk_drops

raw/udp drops counter

sk_rcvlowat

SO_RCVLOWAT setting

sk_error_queue

rarely used

sk_receive_queue

incoming packets

sk_backlog

always used with the per-socket spinlock held

sk_forward_alloc

space allocated forward

sk_reserved_mem

space reserved and non-reclaimable for the socket

sk_ll_usec

usecs to busypoll when there is no data

sk_napi_id

id of the last napi context to receive data for sk

sk_rcvbuf

size of receive buffer in bytes

sk_filter

socket filtering instructions

{unnamed_union}

anonymous

sk_wq

sock wait queue and async head

sk_policy

flow policy

sk_rx_dst

receive input route used by early demux

sk_rx_dst_ifindex

ifindex for sk_rx_dst

sk_rx_dst_cookie

cookie for sk_rx_dst

sk_dst_cache

destination cache

sk_omem_alloc

“o” is “option” or “other”

sk_sndbuf

size of send buffer in bytes

sk_wmem_queued

persistent queue size

sk_wmem_alloc

transmit queue bytes committed

sk_tsq_flags

TCP Small Queues flags

{unnamed_union}

anonymous

sk_send_head

front of stuff to transmit

tcp_rtx_queue

TCP re-transmit queue [union with sk_send_head]

sk_write_queue

Packet sending queue

sk_peek_off

current peek_offset value

sk_write_pending

a write to stream socket waits to start

sk_dst_pending_confirm

need to confirm neighbour

sk_pacing_status

Pacing status (requested, handled by sch_fq)

sk_sndtimeo

SO_SNDTIMEO setting

sk_timer

sock cleanup timer

sk_priority

SO_PRIORITY setting

sk_mark

generic packet mark

sk_pacing_rate

Pacing rate (if supported by transport/packet scheduler)

sk_max_pacing_rate

Maximum pacing rate (SO_MAX_PACING_RATE)

sk_frag

cached page frag

sk_route_caps

route capabilities (e.g. NETIF_F_TSO)

sk_route_nocaps

forbidden route capabilities (e.g NETIF_F_GSO_MASK)

sk_route_forced_caps

static, forced route capabilities (set in tcp_init_sock())

sk_gso_type

GSO type (e.g. SKB_GSO_TCPV4)

sk_gso_max_size

Maximum GSO segment size to build

sk_allocation

allocation mode

sk_txhash

computed flow hash for use on transmit

sk_padding

unused element for alignment

sk_kern_sock

True if sock is using kernel lock classes

sk_no_check_tx

SO_NO_CHECK setting, set checksum in TX packets

sk_no_check_rx

allow zero checksum in RX packets

sk_userlocks

SO_SNDBUF and SO_RCVBUF settings

sk_pacing_shift

scaling factor for TCP Small Queues

sk_type

socket type (SOCK_STREAM, etc)

sk_protocol

which protocol this socket belongs to in this network family

sk_gso_max_segs

Maximum number of GSO segments

sk_lingertime

SO_LINGER l_linger setting

sk_prot_creator

sk_prot of original sock creator (see ipv6_setsockopt, IPV6_ADDRFORM for instance)

sk_callback_lock

used with the callbacks in the end of this struct

sk_err

last error

sk_err_soft

errors that don’t cause failure but are the cause of a persistent failure not just ‘timed out’

sk_ack_backlog

current listen backlog

sk_max_ack_backlog

listen backlog set in listen()

sk_uid

user id of owner

sk_prefer_busy_poll

prefer busypolling over softirq processing

sk_busy_poll_budget

napi processing budget when busypolling

sk_peer_lock

lock protecting sk_peer_pid and sk_peer_cred

sk_peer_pid

struct pid for this socket’s peer

sk_peer_cred

SO_PEERCRED setting

sk_rcvtimeo

SO_RCVTIMEO setting

sk_stamp

time stamp of last packet received

sk_stamp_seq

lock for accessing sk_stamp on 32 bit architectures only

sk_tsflags

SO_TIMESTAMPING flags

sk_bind_phc

SO_TIMESTAMPING bind PHC index of PTP virtual clock for timestamping

sk_shutdown

mask of SEND_SHUTDOWN and/or RCV_SHUTDOWN

sk_tskey

counter to disambiguate concurrent tstamp requests

sk_zckey

counter to order MSG_ZEROCOPY notifications

sk_clockid

clockid used by time-based scheduling (SO_TXTIME)

sk_txtime_deadline_mode

set deadline mode for SO_TXTIME

sk_txtime_report_errors

set report errors mode for SO_TXTIME

sk_txtime_unused

unused txtime flags

sk_socket

Identd and reporting IO signals

sk_user_data

RPC layer private data

sk_security

used by security modules

sk_cgrp_data

cgroup data for this cgroup

sk_memcg

this socket’s memory cgroup association

sk_state_change

callback to indicate change in the state of the sock

sk_data_ready

callback to indicate there is data to be processed

sk_write_space

callback to indicate there is buffer space available for sending

sk_error_report

callback to indicate errors (e.g. MSG_ERRQUEUE)

sk_backlog_rcv

callback to process the backlog

sk_validate_xmit_skb

ptr to an optional validate function

sk_destruct

called at sock freeing time, i.e. when all refcnt == 0

sk_reuseport_cb

reuseport group container

sk_bpf_storage

ptr to cache and control for bpf_sk_storage

sk_rcu

used during RCU grace period

bool sk_user_data_is_nocopy(const struct sock *sk)

Test if sk_user_data pointer must not be copied

Parameters

const struct sock *sk

socket

sk_for_each_entry_offset_rcu

sk_for_each_entry_offset_rcu (tpos, pos, head, offset)

iterate over a list at a given struct offset

Parameters

tpos

the type * to use as a loop cursor.

pos

the struct hlist_node to use as a loop cursor.

head

the head for your list.

offset

offset of hlist_node within the struct.

bool lock_sock_fast(struct sock *sk)

fast version of lock_sock

Parameters

struct sock *sk

socket

Description

This version should be used for very small sections, where the process won’t block. Returns false if the fast path is taken:

sk_lock.slock locked, owned = 0, BH disabled

return true if slow path is taken:

sk_lock.slock unlocked, owned = 1, BH enabled

void unlock_sock_fast(struct sock *sk, bool slow)

complement of lock_sock_fast

Parameters

struct sock *sk

socket

bool slow

slow mode

Description

fast unlock socket for user context. If slow mode is on, we call regular release_sock()

int sk_wmem_alloc_get(const struct sock *sk)

returns write allocations

Parameters

const struct sock *sk

socket

Return

sk_wmem_alloc minus initial offset of one

int sk_rmem_alloc_get(const struct sock *sk)

returns read allocations

Parameters

const struct sock *sk

socket

Return

sk_rmem_alloc

bool sk_has_allocations(const struct sock *sk)

check if allocations are outstanding

Parameters

const struct sock *sk

socket

Return

true if socket has write or read allocations

bool skwq_has_sleeper(struct socket_wq *wq)

check if there are any waiting processes

Parameters

struct socket_wq *wq

struct socket_wq

Return

true if socket_wq has waiting processes

Description

The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory barrier call. They were added due to the race found within the tcp code.

Consider following tcp code paths:

CPU1                CPU2
sys_select          receive packet
...                 ...
__add_wait_queue    update tp->rcv_nxt
...                 ...
tp->rcv_nxt check   sock_def_readable
...                 {
schedule               rcu_read_lock();
                       wq = rcu_dereference(sk->sk_wq);
                       if (wq && waitqueue_active(&wq->wait))
                           wake_up_interruptible(&wq->wait)
                       ...
                    }

The race for tcp fires when the __add_wait_queue changes done by CPU1 stay in its cache, and so does the tp->rcv_nxt update on CPU2 side. CPU1 could then end up calling schedule and sleep forever if there is no more data on the socket.

void sock_poll_wait(struct file *filp, struct socket *sock, poll_table *p)

place memory barrier behind the poll_wait call.

Parameters

struct file *filp

file

struct socket *sock

socket to wait on

poll_table *p

poll_table

Description

See the comments in the skwq_has_sleeper function.

struct page_frag *sk_page_frag(struct sock *sk)

return an appropriate page_frag

Parameters

struct sock *sk

socket

Description

Use the per task page_frag instead of the per socket one for optimization when we know that we’re in the normal context and own everything that’s associated with current.

gfpflags_allow_blocking() isn’t enough here as direct reclaim may nest inside other socket operations and end up recursing into sk_page_frag() while it’s already in use.

Return

a per task page_frag if context allows that, otherwise a per socket one.

void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, __u8 *tx_flags, __u32 *tskey)

checks whether the outgoing packet is to be time stamped

Parameters

struct sock *sk

socket sending this packet

__u16 tsflags

timestamping flags to use

__u8 *tx_flags

completed with instructions for time stamping

__u32 *tskey

filled in with next sk_tskey (not for TCP, which uses seqno)

Note

callers should take care of initial *tx_flags value (usually 0)

void sk_eat_skb(struct sock *sk, struct sk_buff *skb)

Release a skb if it is no longer needed

Parameters

struct sock *sk

socket to eat this skb from

struct sk_buff *skb

socket buffer to eat

Description

This routine must be called with interrupts disabled or with the socket locked so that the sk_buff queue operation is ok.

struct sock *skb_steal_sock(struct sk_buff *skb, bool *refcounted)

steal a socket from an sk_buff

Parameters

struct sk_buff *skb

sk_buff to steal the socket from

bool *refcounted

is set to true if the socket is reference-counted

struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)

Bind a socket to a file

Parameters

struct socket *sock

socket

int flags

file status flags

const char *dname

protocol name

Returns the file bound with sock, implicitly storing it in sock->file. If dname is NULL, it is set to “”. On failure the return is an ERR pointer (see linux/err.h). This function uses GFP_KERNEL internally.

struct socket *sock_from_file(struct file *file)

Return the socket bound to file.

Parameters

struct file *file

file

On failure returns NULL.

struct socket *sockfd_lookup(int fd, int *err)

Go from a file number to its socket slot

Parameters

int fd

file handle

int *err

pointer to an error code return

The file handle passed in is locked and the socket it is bound to is returned. If an error occurs the err pointer is overwritten with a negative errno code and NULL is returned. The function checks for both invalid handles and passing a handle which is not a socket.

On a success the socket object pointer is returned.

struct socket *sock_alloc(void)

allocate a socket

Parameters

void

no arguments

Description

Allocate a new inode and socket object. The two are bound together and initialised. The socket is then returned. If we are out of inodes NULL is returned. This function uses GFP_KERNEL internally.

void sock_release(struct socket *sock)

close a socket

Parameters

struct socket *sock

socket to close

The socket is released from the protocol stack if it has a release callback, and the inode is then released if the socket is bound to an inode not a file.

int sock_sendmsg(struct socket *sock, struct msghdr *msg)

send a message through sock

Parameters

struct socket *sock

socket

struct msghdr *msg

message to send

Sends msg through sock, passing through LSM. Returns the number of bytes sent, or an error code.

int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size)

send a message through sock (kernel-space)

Parameters

struct socket *sock

socket

struct msghdr *msg

message header

struct kvec *vec

kernel vec

size_t num

vec array length

size_t size

total message data size

Builds the message data with vec and sends it through sock. Returns the number of bytes sent, or an error code.

int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, struct kvec *vec, size_t num, size_t size)

send a message through sock (kernel-space)

Parameters

struct sock *sk

sock

struct msghdr *msg

message header

struct kvec *vec

output s/g array

size_t num

output s/g array length

size_t size

total message data size

Builds the message data with vec and sends it through sock. Returns the number of bytes sent, or an error code. Caller must hold the lock on sk.

int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)

receive a message from sock

Parameters

struct socket *sock

socket

struct msghdr *msg

message to receive

int flags

message flags

Receives msg from sock, passing through LSM. Returns the total number of bytes received, or an error.

int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags)

Receive a message from a socket (kernel space)

Parameters

struct socket *sock

The socket to receive the message from

struct msghdr *msg

Received message

struct kvec *vec

Input s/g array for message data

size_t num

Size of input s/g array

size_t size

Number of bytes to read

int flags

Message flags (MSG_DONTWAIT, etc…)

On return the msg structure contains the scatter/gather array passed in the vec argument. The array is modified so that it consists of the unfilled portion of the original array.

The returned value is the total number of bytes received, or an error.

int sock_create_lite(int family, int type, int protocol, struct socket **res)

creates a socket

Parameters

int family

protocol family (AF_INET, …)

int type

communication type (SOCK_STREAM, …)

int protocol

protocol (0, …)

struct socket **res

new socket

Creates a new socket and assigns it to res, passing through LSM. The new socket initialization is not complete, see kernel_accept(). Returns 0 or an error. On failure res is set to NULL. This function internally uses GFP_KERNEL.

int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern)

creates a socket

Parameters

struct net *net

net namespace

int family

protocol family (AF_INET, …)

int type

communication type (SOCK_STREAM, …)

int protocol

protocol (0, …)

struct socket **res

new socket

int kern

boolean for kernel space sockets

Creates a new socket and assigns it to res, passing through LSM. Returns 0 or an error. On failure res is set to NULL. kern must be set to true if the socket resides in kernel space. This function internally uses GFP_KERNEL.

int sock_create(int family, int type, int protocol, struct socket **res)

creates a socket

Parameters

int family

protocol family (AF_INET, …)

int type

communication type (SOCK_STREAM, …)

int protocol

protocol (0, …)

struct socket **res

new socket

A wrapper around __sock_create(). Returns 0 or an error. This function internally uses GFP_KERNEL.

int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res)

creates a socket (kernel space)

Parameters

struct net *net

net namespace

int family

protocol family (AF_INET, …)

int type

communication type (SOCK_STREAM, …)

int protocol

protocol (0, …)

struct socket **res

new socket

A wrapper around __sock_create(). Returns 0 or an error. This function internally uses GFP_KERNEL.

int sock_register(const struct net_proto_family *ops)

add a socket protocol handler

Parameters

const struct net_proto_family *ops

description of protocol

This function is called by a protocol handler that wants to advertise its address family, and have it linked into the socket interface. The value ops->family corresponds to the socket system call protocol family.

void sock_unregister(int family)

remove a protocol handler

Parameters

int family

protocol family to remove

This function is called by a protocol handler that wants to remove its address family, and have it unlinked from the new socket creation.

If protocol handler is a module, then it can use module reference counts to protect against new references. If protocol handler is not a module then it needs to provide its own protection in the ops->create routine.

int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)

bind an address to a socket (kernel space)

Parameters

struct socket *sock

socket

struct sockaddr *addr

address

int addrlen

length of address

Returns 0 or an error.

int kernel_listen(struct socket *sock, int backlog)

move socket to listening state (kernel space)

Parameters

struct socket *sock

socket

int backlog

pending connections queue size

Returns 0 or an error.

int kernel_accept(struct socket *sock, struct socket **newsock, int flags)

accept a connection (kernel space)

Parameters

struct socket *sock

listening socket

struct socket **newsock

new connected socket

int flags

flags

flags must be SOCK_CLOEXEC, SOCK_NONBLOCK or 0. If it fails, newsock is guaranteed to be NULL. Returns 0 or an error.

int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags)

connect a socket (kernel space)

Parameters

struct socket *sock

socket

struct sockaddr *addr

address

int addrlen

address length

int flags

flags (O_NONBLOCK, …)

For datagram sockets, addr is the address to which datagrams are sent by default, and the only address from which datagrams are received. For stream sockets, attempts to connect to addr. Returns 0 or an error code.

int kernel_getsockname(struct socket *sock, struct sockaddr *addr)

get the address to which the socket is bound (kernel space)

Parameters

struct socket *sock

socket

struct sockaddr *addr

address holder

Fills the addr pointer with the address to which the socket is bound. Returns 0 or an error code.

int kernel_getpeername(struct socket *sock, struct sockaddr *addr)

get the address to which the socket is connected (kernel space)

Parameters

struct socket *sock

socket

struct sockaddr *addr

address holder

Fills the addr pointer with the address to which the socket is connected. Returns 0 or an error code.

int kernel_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)

send a page through a socket (kernel space)

Parameters

struct socket *sock

socket

struct page *page

page

int offset

page offset

size_t size

total size in bytes

int flags

flags (MSG_DONTWAIT, …)

Returns the total amount sent in bytes or an error.

int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags)

send a page through the locked sock (kernel space)

Parameters

struct sock *sk

sock

struct page *page

page

int offset

page offset

size_t size

total size in bytes

int flags

flags (MSG_DONTWAIT, …)

Returns the total amount sent in bytes or an error. Caller must hold the lock on sk.

int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)

shut down part of a full-duplex connection (kernel space)

Parameters

struct socket *sock

socket

enum sock_shutdown_cmd how

connection part

Returns 0 or an error.

u32 kernel_sock_ip_overhead(struct sock *sk)

returns the IP overhead imposed by a socket

Parameters

struct sock *sk

socket

This routine returns the IP overhead imposed by a socket i.e. the length of the underlying IP header, depending on whether this is an IPv4 or IPv6 socket and the length from IP options turned on at the socket. Assumes that the caller has a lock on the socket.

struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size)

build a network buffer around provided skb

Parameters

struct sk_buff *skb

sk_buff provided by caller, must be memset cleared

void *data

data buffer provided by caller

unsigned int frag_size

size of data, or 0 if head was kmalloced

struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)

build a network buffer

Parameters

void *data

data buffer provided by caller

unsigned int frag_size

size of data, or 0 if head was kmalloced

Description

Version of __napi_build_skb() that takes care of skb->head_frag and skb->pfmemalloc when the data is a page or page fragment.

Returns a new sk_buff on success, NULL on allocation failure.

struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, int flags, int node)

allocate a network buffer

Parameters

unsigned int size

size to allocate

gfp_t gfp_mask

allocation mask

int flags

If SKB_ALLOC_FCLONE is set, allocate from fclone cache instead of head cache and allocate a cloned (child) skb. If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for allocations in case the data is required for writeback

int node

numa node to allocate memory on

Allocate a new sk_buff. The returned buffer has no headroom and a tail room of at least size bytes. The object has a reference count of one. The return is the buffer. On a failure the return is NULL.

Buffers may only be allocated from interrupts using a gfp_mask of GFP_ATOMIC.

struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, gfp_t gfp_mask)

allocate an skbuff for rx on a specific device

Parameters

struct net_device *dev

network device to receive on

unsigned int len

length to allocate

gfp_t gfp_mask

get_free_pages mask, passed to alloc_skb

Allocate a new sk_buff and assign it a usage count of one. The buffer has NET_SKB_PAD headroom built in. Users should allocate the headroom they think they need without accounting for the built in space. The built in space is used for optimisations.

NULL is returned if there is no free memory.

struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask)

allocate skbuff for rx in a specific NAPI instance

Parameters

struct napi_struct *napi

napi instance this buffer was allocated for

unsigned int len

length to allocate

gfp_t gfp_mask

get_free_pages mask, passed to alloc_skb and alloc_pages

Allocate a new sk_buff for use in NAPI receive. This buffer will attempt to allocate the head from a special reserved region used only for NAPI Rx allocation. By doing this we can save several CPU cycles by avoiding having to disable and re-enable IRQs.

NULL is returned if there is no free memory.

void __kfree_skb(struct sk_buff *skb)

private function

Parameters

struct sk_buff *skb

buffer

Free an sk_buff. Release anything attached to the buffer. Clean the state. This is an internal helper function. Users should always call kfree_skb.

void kfree_skb(struct sk_buff *skb)

free an sk_buff

Parameters

struct sk_buff *skb

buffer to free

Drop a reference to the buffer and free it if the usage count has hit zero.

void skb_tx_error(struct sk_buff *skb)

report an sk_buff xmit error

Parameters

struct sk_buff *skb

buffer that triggered an error

Report xmit error if a device callback is tracking this skb. skb must be freed afterwards.

void consume_skb(struct sk_buff *skb)

free an skbuff

Parameters

struct sk_buff *skb

buffer to free

Drop a ref to the buffer and free it if the usage count has hit zero. Functions identically to kfree_skb, but kfree_skb assumes that the frame is being dropped after a failure and notes that.

struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)

allocate sk_buff to wrap frag list forming a msg

Parameters

struct sk_buff *first

first sk_buff of the msg

struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)

morph one skb into another

Parameters

struct sk_buff *dst

the skb to receive the contents

struct sk_buff *src

the skb to supply the contents

This is identical to skb_clone except that the target skb is supplied by the user.

The target skb is returned upon exit.

int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)

copy userspace skb frags buffers to kernel

Parameters

struct sk_buff *skb

the skb to modify

gfp_t gfp_mask

allocation priority

This must be called on skb with SKBFL_ZEROCOPY_ENABLE. It will copy all frags into kernel and drop the reference to userspace pages.

If this function is called from an interrupt, gfp_mask must be GFP_ATOMIC.

Returns 0 on success or a negative error code on failure to allocate kernel memory to copy to.

struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)

duplicate an sk_buff

Parameters

struct sk_buff *skb

buffer to clone

gfp_t gfp_mask

allocation priority

Duplicate an sk_buff. The new one is not owned by a socket. Both copies share the same packet data but not structure. The new buffer has a reference count of 1. If the allocation fails the function returns NULL otherwise the new buffer is returned.

If this function is called from an interrupt, gfp_mask must be GFP_ATOMIC.

struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)

create private copy of an sk_buff

Parameters

const struct sk_buff *skb

buffer to copy

gfp_t gfp_mask

allocation priority

Make a copy of both an sk_buff and its data. This is used when the caller wishes to modify the data and needs a private copy of the data to alter. Returns NULL on failure or the pointer to the buffer on success. The returned buffer has a reference count of 1.

As a by-product, this function converts a non-linear sk_buff to a linear one, so that the sk_buff becomes completely private and the caller is allowed to modify all the data of the returned buffer. This means that this function is not recommended for use in circumstances when only the header is going to be modified. Use pskb_copy() instead.

struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, gfp_t gfp_mask, bool fclone)

create copy of an sk_buff with private head.

Parameters

struct sk_buff *skb

buffer to copy

int headroom

headroom of new skb

gfp_t gfp_mask

allocation priority

bool fclone

if true allocate the copy of the skb from the fclone cache instead of the head cache; it is recommended to set this to true for the cases where the copy will likely be cloned

Make a copy of both an sk_buff and part of its data, located in header. Fragmented data remain shared. This is used when the caller wishes to modify only header of sk_buff and needs private copy of the header to alter. Returns NULL on failure or the pointer to the buffer on success. The returned buffer has a reference count of 1.

int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask)

reallocate header of sk_buff

Parameters

struct sk_buff *skb

buffer to reallocate

int nhead

room to add at head

int ntail

room to add at tail

gfp_t gfp_mask

allocation priority

Expands (or creates an identical copy, if nhead and ntail are zero) the header of skb. sk_buff itself is not changed. sk_buff MUST have a reference count of 1. Returns zero on success, or an error code if the expansion failed. In the latter case, sk_buff is not changed.

All the pointers pointing into skb header may change and must be reloaded after call to this function.

struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)

reallocate header of sk_buff

Parameters

struct sk_buff *skb

buffer to reallocate

unsigned int headroom

needed headroom

Unlike skb_realloc_headroom, this one does not allocate a new skb if possible; copies skb->sk to new skb as needed and frees original skb in case of failures.

It expects increased headroom and generates a warning otherwise.

struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t gfp_mask)

copy and expand sk_buff

Parameters

const struct sk_buff *skb

buffer to copy

int newheadroom

new free bytes at head

int newtailroom

new free bytes at tail

gfp_t gfp_mask

allocation priority

Make a copy of both an sk_buff and its data and while doing so allocate additional space.

This is used when the caller wishes to modify the data and needs a private copy of the data to alter as well as more space for new fields. Returns NULL on failure or the pointer to the buffer on success. The returned buffer has a reference count of 1.

You must pass GFP_ATOMIC as the allocation priority if this function is called from an interrupt.

int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)

zero pad the tail of an skb

Parameters

struct sk_buff *skb

buffer to pad

int pad

space to pad

bool free_on_error

free buffer on error

Ensure that a buffer is followed by a padding area that is zero filled. Used by network drivers which may DMA or transfer data beyond the buffer end onto the wire.

May return error in out of memory cases. The skb is freed on error if free_on_error is true.

void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)

add data to the tail of a potentially fragmented buffer

Parameters

struct sk_buff *skb

start of the buffer to use

struct sk_buff *tail

tail fragment of the buffer to use

int len

amount of data to add

This function extends the used data area of the potentially fragmented buffer. tail must be the last fragment of skb – or skb itself. If this would exceed the total buffer size the kernel will panic. A pointer to the first byte of the extra data is returned.

void *skb_put(struct sk_buff *skb, unsigned int len)

add data to a buffer

Parameters

struct sk_buff *skb

buffer to use

unsigned int len

amount of data to add

This function extends the used data area of the buffer. If this would exceed the total buffer size the kernel will panic. A pointer to the first byte of the extra data is returned.

void *skb_push(struct sk_buff *skb, unsigned int len)

add data to the start of a buffer

Parameters

struct sk_buff *skb

buffer to use

unsigned int len

amount of data to add

This function extends the used data area of the buffer at the buffer start. If this would exceed the total buffer headroom the kernel will panic. A pointer to the first byte of the extra data is returned.

void *skb_pull(struct sk_buff *skb, unsigned int len)

remove data from the start of a buffer

Parameters

struct sk_buff *skb

buffer to use

unsigned int len

amount of data to remove

This function removes data from the start of a buffer, returning the memory to the headroom. A pointer to the next data in the buffer is returned. Once the data has been pulled future pushes will overwrite the old data.

void skb_trim(struct sk_buff *skb, unsigned int len)

remove end from a buffer

Parameters

struct sk_buff *skb

buffer to alter

unsigned int len

new length

Cut the length of a buffer down by removing data from the tail. If the buffer is already under the length specified it is not modified. The skb must be linear.

void *__pskb_pull_tail(struct sk_buff *skb, int delta)

advance tail of skb header

Parameters

struct sk_buff *skb

buffer to reallocate

int delta

number of bytes to advance tail

The function makes sense only on a fragmented sk_buff; it expands the header, moving its tail forward and copying the necessary data from the fragmented part.

sk_buff MUST have reference count of 1.

Returns NULL (and sk_buff does not change) if pull failed or value of new tail of skb in the case of success.

All the pointers pointing into skb header may change and must be reloaded after call to this function.

int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)

copy bits from skb to kernel buffer

Parameters

const struct sk_buff *skb

source skb

int offset

offset in source

void *to

destination buffer

int len

number of bytes to copy

Copy the specified number of bytes from the source skb to the destination buffer.

CAUTION ! :

If its prototype is ever changed, check arch/{*}/net/{*}.S files, since it is called from BPF assembly code.

int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)

store bits from kernel buffer to skb

Parameters

struct sk_buff *skb

destination buffer

int offset

offset in destination

const void *from

source buffer

int len

number of bytes to copy

Copy the specified number of bytes from the source buffer to the destination skb. This function handles all the messy bits of traversing fragment lists and such.

int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)

Zero copy skb to skb

Parameters

struct sk_buff *to

destination buffer

struct sk_buff *from

source buffer

int len

number of bytes to copy from source buffer

int hlen

size of linear headroom in destination buffer

Copies up to len bytes from from to to by creating references to the frags in the source buffer.

The hlen as calculated by skb_zerocopy_headlen() specifies the headroom in the to buffer.

Return value: 0: everything is OK; -ENOMEM: couldn’t orphan frags of from due to lack of memory; -EFAULT: skb_copy_bits() found some problem with skb geometry.

struct sk_buff *skb_dequeue(struct sk_buff_head *list)

remove from the head of the queue

Parameters

struct sk_buff_head *list

list to dequeue from

Remove the head of the list. The list lock is taken so the function may be used safely with other locking list functions. The head item is returned or NULL if the list is empty.

struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)

remove from the tail of the queue

Parameters

struct sk_buff_head *list

list to dequeue from

Remove the tail of the list. The list lock is taken so the function may be used safely with other locking list functions. The tail item is returned or NULL if the list is empty.

void skb_queue_purge(struct sk_buff_head *list)

empty a list

Parameters

struct sk_buff_head *list

list to empty

Delete all buffers on an sk_buff list. Each buffer is removed from the list and one reference dropped. This function takes the list lock and is atomic with respect to other list locking functions.

void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)

queue a buffer at the list head

Parameters

struct sk_buff_head *list

list to use

struct sk_buff *newsk

buffer to queue

Queue a buffer at the start of the list. This function takes the list lock and can be used safely with other locking sk_buff functions.

A buffer cannot be placed on two lists at the same time.

void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)

queue a buffer at the list tail

Parameters

struct sk_buff_head *list

list to use

struct sk_buff *newsk

buffer to queue

Queue a buffer at the tail of the list. This function takes the list lock and can be used safely with other locking sk_buff functions.

A buffer cannot be placed on two lists at the same time.

void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)

remove a buffer from a list

Parameters

struct sk_buff *skb

buffer to remove

struct sk_buff_head *list

list to use

Remove a packet from a list. The list locks are taken and this function is atomic with respect to other list locked calls.

You must know what list the SKB is on.

void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)

append a buffer

Parameters

struct sk_buff *old

buffer to insert after

struct sk_buff *newsk

buffer to insert

struct sk_buff_head *list

list to use

Place a packet after a given packet in a list. The list locks are taken and this function is atomic with respect to other list locked calls. A buffer cannot be placed on two lists at the same time.

void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)

Split fragmented skb to two parts at length len.

Parameters

struct sk_buff *skb

the buffer to split

struct sk_buff *skb1

the buffer to receive the second part

const u32 len

new length for skb

void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, unsigned int to, struct skb_seq_state *st)

Prepare a sequential read of skb data

Parameters

struct sk_buff *skb

the buffer to read

unsigned int from

lower offset of data to be read

unsigned int to

upper offset of data to be read

struct skb_seq_state *st

state variable

Description

Initializes the specified state variable. Must be called before invoking skb_seq_read() for the first time.

unsigned int skb_seq_read(unsigned int consumed, const u8 **data, struct skb_seq_state *st)

Sequentially read skb data

Parameters

unsigned int consumed

number of bytes consumed by the caller so far

const u8 **data

destination pointer for data to be returned

struct skb_seq_state *st

state variable

Description

Reads a block of skb data at consumed relative to the lower offset specified to skb_prepare_seq_read(). Assigns the head of the data block to data and returns the length of the block or 0 if the end of the skb data or the upper offset has been reached.

The caller is not required to consume all of the data returned, i.e. consumed is typically set to the number of bytes already consumed and the next call to skb_seq_read() will return the remaining part of the block.

Note 1: The size of each block of data returned can be arbitrary; this limitation is the cost for zerocopy sequential reads of potentially non-linear data.

Note 2: Fragment lists within fragments are not implemented at the moment; state->root_skb could be replaced with a stack for this purpose.

void skb_abort_seq_read(struct skb_seq_state *st)

Abort a sequential read of skb data

Parameters

struct skb_seq_state *st

state variable

Description

Must be called if skb_seq_read() was not called until it returned 0.

unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, unsigned int to, struct ts_config *config)

Find a text pattern in skb data

Parameters

struct sk_buff *skb

the buffer to look in

unsigned int from

search offset

unsigned int to

search limit

struct ts_config *config

textsearch configuration

Description

Finds a pattern in the skb data according to the specified textsearch configuration. Use textsearch_next() to retrieve subsequent occurrences of the pattern. Returns the offset to the first occurrence or UINT_MAX if no match was found.

void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)

pull skb and update receive checksum

Parameters

struct sk_buff *skb

buffer to update

unsigned int len

length of data pulled

This function performs an skb_pull on the packet and updates the CHECKSUM_COMPLETE checksum. It should be used on receive path processing instead of skb_pull unless you know that the checksum difference is zero (e.g., a valid IP header) or you are setting ip_summed to CHECKSUM_NONE.

struct sk_buff *skb_segment(struct sk_buff *head_skb, netdev_features_t features)

Perform protocol segmentation on skb.

Parameters

struct sk_buff *head_skb

buffer to segment

netdev_features_t features

features for the output path (see dev->features)

This function performs segmentation on the given skb. It returns a pointer to the first in a list of new skbs for the segments. In case of error it returns ERR_PTR(err).

int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)

Fill a scatter-gather list from a socket buffer

Parameters

struct sk_buff *skb

Socket buffer containing the buffers to be mapped

struct scatterlist *sg

The scatter-gather list to map into

int offset

The offset into the buffer’s contents to start mapping

int len

Length of buffer space to be mapped

Fill the specified scatter-gather list with mappings/pointers into a region of the buffer space attached to a socket buffer. Returns either the number of scatterlist items used, or -EMSGSIZE if the contents could not fit.

int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)

Check that a socket buffer’s data buffers are writable

Parameters

struct sk_buff *skb

The socket buffer to check.

int tailbits

Amount of trailing space to be added

struct sk_buff **trailer

Returned pointer to the skb where the tailbits space begins

Make sure that the data buffers attached to a socket buffer are writable. If they are not, private copies are made of the data buffers and the socket buffer is set to use these instead.

If tailbits is given, make sure that there is space to write tailbits bytes of data beyond current end of socket buffer. trailer will be set to point to the skb in which this space begins.

The number of scatterlist elements required to completely map the COW’d and extended socket buffer will be returned.

struct sk_buff *skb_clone_sk(struct sk_buff *skb)

create clone of skb, and take reference to socket

Parameters

struct sk_buff *skb

the skb to clone

Description

This function creates a clone of a buffer that holds a reference on sk_refcnt. Buffers created via this function are meant to be returned using sock_queue_err_skb, or freed via kfree_skb.

When passing buffers allocated with this function to sock_queue_err_skb it is necessary to wrap the call with sock_hold/sock_put in order to prevent the socket from being released prior to being enqueued on the sk_error_queue.

bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)

set up and verify partial csum values for packet

Parameters

struct sk_buff *skb

the skb to set

u16 start

the number of bytes after skb->data to start checksumming.

u16 off

the offset from start to place the checksum.

Description

For untrusted partially-checksummed packets, we need to make sure the values for skb->csum_start and skb->csum_offset are valid so we don’t oops.

This function checks and sets those values and skb->ip_summed: if this returns false you should drop the packet.

int skb_checksum_setup(struct sk_buff *skb, bool recalculate)

set up partial checksum offset

Parameters

struct sk_buff *skb

the skb to set up

bool recalculate

if true the pseudo-header checksum will be recalculated

struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, unsigned int transport_len, __sum16 (*skb_chkf)(struct sk_buff *skb))

validate checksum of an skb

Parameters

struct sk_buff *skb

the skb to check

unsigned int transport_len

the data length beyond the network header

__sum16(*skb_chkf)(struct sk_buff *skb)

checksum function to use

Description

Applies the given checksum function skb_chkf to the provided skb. Returns a checked and maybe trimmed skb. Returns NULL on error.

If the skb has data beyond the given transport length, then a trimmed & cloned skb is checked and returned.

Caller needs to set the skb transport header and free any returned skb if it differs from the provided skb.

bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize)

try to merge skb to prior one

Parameters

struct sk_buff *to

prior buffer

struct sk_buff *from

buffer to add

bool *fragstolen

pointer to boolean

int *delta_truesize

how much more was allocated than was requested

void skb_scrub_packet(struct sk_buff *skb, bool xnet)

scrub an skb

Parameters

struct sk_buff *skb

buffer to clean

bool xnet

packet is crossing netns

Description

skb_scrub_packet can be used after encapsulating or decapsulating a packet into/from a tunnel. Some information has to be cleared during these operations. skb_scrub_packet can also be used to clean a skb before injecting it in another namespace (xnet == true). We have to clear all information in the skb that could impact namespace isolation.

bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)

Will a split GSO skb fit into a given MTU?

Parameters

const struct sk_buff *skb

GSO skb

unsigned int mtu

MTU to validate against

Description

skb_gso_validate_network_len validates if a given skb will fit a wanted MTU once split. It considers L3 headers, L4 headers, and the payload.

bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)

Will a split GSO skb fit in a given length?

Parameters

const struct sk_buff *skb

GSO skb

unsigned int len

length to validate against

Description

skb_gso_validate_mac_len validates if a given skb will fit a wanted length once split, including L2, L3 and L4 headers and the payload.

int skb_eth_pop(struct sk_buff *skb)

Drop the Ethernet header at the head of a packet

Parameters

struct sk_buff *skb

Socket buffer to modify

Description

Drop the Ethernet header of skb.

Expects that skb->data points to the mac header and that no VLAN tags are present.

Returns 0 on success, -errno otherwise.

int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, const unsigned char *src)

Add a new Ethernet header at the head of a packet

Parameters

struct sk_buff *skb

Socket buffer to modify

const unsigned char *dst

Destination MAC address of the new header

const unsigned char *src

Source MAC address of the new header

Description

Prepend skb with a new Ethernet header.

Expects that skb->data points to the mac header, which must be empty.

Returns 0 on success, -errno otherwise.

int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, int mac_len, bool ethernet)

push a new MPLS header after mac_len bytes from start of the packet

Parameters

struct sk_buff *skb

buffer

__be32 mpls_lse

MPLS label stack entry to push

__be16 mpls_proto

ethertype of the new MPLS header (expects 0x8847 or 0x8848)

int mac_len

length of the MAC header

bool ethernet

flag to indicate if the resulting packet after skb_mpls_push is ethernet

Description

Expects skb->data at mac header.

Returns 0 on success, -errno otherwise.

int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, bool ethernet)

pop the outermost MPLS header

Parameters

struct sk_buff *skb

buffer

__be16 next_proto

ethertype of header after popped MPLS header

int mac_len

length of the MAC header

bool ethernet

flag to indicate if the packet is ethernet

Description

Expects skb->data at mac header.

Returns 0 on success, -errno otherwise.

int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)

modify outermost MPLS header and update csum

Parameters

struct sk_buff *skb

buffer

__be32 mpls_lse

new MPLS label stack entry to update to

Description

Expects skb->data at mac header.

Returns 0 on success, -errno otherwise.

int skb_mpls_dec_ttl(struct sk_buff *skb)

decrement the TTL of the outermost MPLS header

Parameters

struct sk_buff *skb

buffer

Description

Expects skb->data at mac header.

Returns 0 on success, -errno otherwise.

struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, int max_page_order, int *errcode, gfp_t gfp_mask)

allocate skb with page frags

Parameters

unsigned long header_len

size of linear part

unsigned long data_len

needed length in frags

int max_page_order

max page order desired.

int *errcode

pointer to error code if any

gfp_t gfp_mask

allocation mask

Description

This can be used to allocate a paged skb, given a maximal order for frags.

void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)

allocate space for given extension, COW if needed

Parameters

struct sk_buff *skb

buffer

enum skb_ext_id id

extension to allocate space for

Description

Allocates enough space for the given extension. If the extension is already present, a pointer to that extension is returned.

If the skb was cloned, COW applies and the returned memory can be modified without changing the extension space of cloned buffers.

Returns pointer to the extension or NULL on allocation failure.

bool sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap)

General socket capability test

Parameters

const struct sock *sk

Socket to use a capability on or through

struct user_namespace *user_ns

The user namespace of the capability to use

int cap

The capability to use

Description

Test to see if the opener of the socket had the capability cap in the user namespace user_ns when the socket was created, and if the current process has it there as well.

bool sk_capable(const struct sock *sk, int cap)

Socket global capability test

Parameters

const struct sock *sk

Socket to use a capability on or through

int cap

The global capability to use

Description

Test to see if the opener of the socket had the capability cap in all user namespaces when the socket was created, and if the current process has it as well.

bool sk_net_capable(const struct sock *sk, int cap)

Network namespace socket capability test

Parameters

const struct sock *sk

Socket to use a capability on or through

int cap

The capability to use

Description

Test to see if the opener of the socket had the capability cap over the network namespace the socket is a member of when the socket was created, and if the current process has it as well.

void sk_set_memalloc(struct sock *sk)

sets SOCK_MEMALLOC

Parameters

struct sock *sk

socket to set it on

Description

Set SOCK_MEMALLOC on a socket for access to emergency reserves. It’s the responsibility of the admin to adjust min_free_kbytes to meet the requirements

struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern)

All socket objects are allocated here

Parameters

struct net *net

the applicable net namespace

int family

protocol family

gfp_t priority

for allocation (GFP_KERNEL, GFP_ATOMIC, etc)

struct proto *prot

struct proto associated with this new sock instance

int kern

is this to be a kernel socket?

struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)

clone a socket, and lock its clone

Parameters

const struct sock *sk

the socket to clone

const gfp_t priority

for allocation (GFP_KERNEL, GFP_ATOMIC, etc)

Caller must unlock socket even in error path (bh_unlock_sock(newsk))

bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)

check that a page_frag contains enough room

Parameters

unsigned int sz

minimum size of the fragment we want to get

struct page_frag *pfrag

pointer to page_frag

gfp_t gfp

priority for memory allocation

Note

While this allocator tries to use high order pages, there is no guarantee that allocations succeed. Therefore, sz MUST be less than or equal to PAGE_SIZE.

int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)

wait for data to arrive at sk_receive_queue

Parameters

struct sock *sk

sock to wait on

long *timeo

for how long

const struct sk_buff *skb

last skb seen on sk_receive_queue

Description

Now socket state including sk->sk_err is changed only under lock, hence we may omit checks after joining wait queue. We check receive queue before schedule() only as optimization; it is very likely that release_sock() added new data.

int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)

increase memory_allocated

Parameters

struct sock *sk

socket

int size

memory size to allocate

int amt

pages to allocate

int kind

allocation type

Similar to __sk_mem_schedule(), but does not update sk_forward_alloc

int __sk_mem_schedule(struct sock *sk, int size, int kind)

increase sk_forward_alloc and memory_allocated

Parameters

struct sock *sk

socket

int size

memory size to allocate

int kind

allocation type

If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means rmem allocation. This function assumes that protocols which have memory_pressure use sk_wmem_queued as write buffer accounting.

void __sk_mem_reduce_allocated(struct sock *sk, int amount)

reclaim memory_allocated

Parameters

struct sock *sk

socket

int amount

number of quanta

Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc

void __sk_mem_reclaim(struct sock *sk, int amount)

reclaim sk_forward_alloc and memory_allocated

Parameters

struct sock *sk

socket

int amount

number of bytes (rounded down to a SK_MEM_QUANTUM multiple)

struct sk_buff *__skb_try_recv_datagram(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last)

Receive a datagram skbuff

Parameters

struct sock *sk

socket

struct sk_buff_head *queue

socket queue from which to receive

unsigned int flags

MSG_ flags

int *off

an offset in bytes to peek skb from. Returns an offset within an skb where data actually starts

int *err

error code returned

struct sk_buff **last

set to last peeked message to inform the wait function what to look for when peeking

Get a datagram skbuff, understands the peeking, nonblocking wakeups and possible races. This replaces identical code in packet, raw and udp, as well as the IPX, AX.25 and Appletalk. It also finally fixes the long standing peek and read race for datagram sockets. If you alter this routine remember it must be re-entrant.

This function will lock the socket if a skb is returned, so the caller needs to unlock the socket in that case (usually by calling skb_free_datagram). Returns NULL with err set to -EAGAIN if no data was available or to some other value if an error was detected.

It does not lock socket since today. This function is free of race conditions. This measure should/can improve significantly datagram socket latencies at high loads, when data copying to user space takes lots of time. (BTW I’ve just killed the last cli() in IP/IPv6/core/netlink/packet 8) Great win.) –ANK (980729)

The order of the tests when we find no data waiting is specified quite explicitly by POSIX 1003.1g, don’t change them without having the standard around please.

int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)

Free a datagram skbuff forcibly

Parameters

struct sock *sk

socket

struct sk_buff *skb

datagram skbuff

unsigned int flags

MSG_ flags

This function frees a datagram skbuff that was received by skb_recv_datagram. The flags argument must match the one used for skb_recv_datagram.

If the MSG_PEEK flag is set, and the packet is still on the receive queue of the socket, it will be taken off the queue before it is freed.

This function currently only disables BH when acquiring the sk_receive_queue lock. Therefore it must not be used in a context where that lock is acquired in an IRQ context.

It returns 0 if the packet was removed by us.

int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, struct ahash_request *hash)

Copy datagram to an iovec iterator and update a hash.

Parameters

const struct sk_buff *skb

buffer to copy

int offset

offset in the buffer to start copying from

struct iov_iter *to

iovec iterator to copy to

int len

amount of data to copy from buffer to iovec

struct ahash_request *hash

hash request to update

int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len)

Copy a datagram to an iovec iterator.

Parameters

const struct sk_buff *skb

buffer to copy

int offset

offset in the buffer to start copying from

struct iov_iter *to

iovec iterator to copy to

int len

amount of data to copy from buffer to iovec

int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len)

Copy a datagram from an iov_iter.

Parameters

struct sk_buff *skb

buffer to copy

int offset

offset in the buffer to start copying to

struct iov_iter *from

the copy source

int len

amount of data to copy to buffer from iovec

Returns 0 or -EFAULT.

int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)

Build a zerocopy datagram from an iov_iter

Parameters

struct sk_buff *skb

buffer to copy

struct iov_iter *from

the source to copy from

The function will first copy up to headlen, and then pin the userspace pages and build frags through them.

Returns 0, -EFAULT or -EMSGSIZE.

int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg)

Copy and checksum skb to user iovec.

Parameters

struct sk_buff *skb

skbuff

int hlen

hardware length

struct msghdr *msg

destination

Caller _must_ check that skb will fit to this iovec.

Return

0 - success.

-EINVAL - checksum failure. -EFAULT - fault during copy.

__poll_t datagram_poll(struct file *file, struct socket *sock, poll_table *wait)

generic datagram poll

Parameters

struct file *file

file struct

struct socket *sock

socket

poll_table *wait

poll table

Datagram poll: Again totally generic. This also handles sequenced packet sockets providing the socket receive queue is only ever holding data ready to receive.

Note

when you don’t use this routine for this protocol, and you use a different write policy from sock_writeable() then please supply your own write_space callback.

int sk_stream_wait_connect(struct sock *sk, long *timeo_p)

Wait for a socket to get into the connected state

Parameters

struct sock *sk

sock to wait on

long *timeo_p

for how long to wait

Description

Must be called with the socket locked.

int sk_stream_wait_memory(struct sock *sk, long *timeo_p)

Wait for more memory for a socket

Parameters

struct sock *sk

socket to wait for memory

long *timeo_p

for how long

Socket Filter

int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)

run a packet through a socket filter

Parameters

struct sock *sk

sock associated with sk_buff

struct sk_buff *skb

buffer to filter

unsigned int cap

limit on how short the eBPF program may trim the packet

Description

Run the eBPF program and then cut skb->data to correct size returned by the program. If pkt_len is 0 we toss packet. If skb->len is smaller than pkt_len we keep whole skb->data. This is the socket level wrapper to bpf_prog_run. It returns 0 if the packet should be accepted or -EPERM if the packet should be tossed.

int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)

create an unattached filter

Parameters

struct bpf_prog **pfp

the unattached filter that is created

struct sock_fprog_kern *fprog

the filter program

Description

Create a filter independent of any socket. We first run some sanity checks on it to make sure it does not explode on us later. If an error occurs or there is insufficient memory for the filter a negative errno code is returned. On success the return is zero.

int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig)

create an unattached filter from user buffer

Parameters

struct bpf_prog **pfp

the unattached filter that is created

struct sock_fprog *fprog

the filter program

bpf_aux_classic_check_t trans

post-classic verifier transformation handler

bool save_orig

save classic BPF program

Description

This function effectively does the same as bpf_prog_create(), only that it builds up its insns buffer from user space provided buffer. It also allows for passing a bpf_aux_classic_check_t handler.

int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)

attach a socket filter

Parameters

struct sock_fprog *fprog

the filter program

struct sock *sk

the socket to use

Description

Attach the user’s filter code. We first run some sanity checks on it to make sure it does not explode on us later. If an error occurs or there is insufficient memory for the filter a negative errno code is returned. On success the return is zero.

Generic Network Statistics

struct gnet_stats_basic

byte/packet throughput statistics

Definition

struct gnet_stats_basic {
  __u64 bytes;
  __u32 packets;
};

Members

bytes

number of seen bytes

packets

number of seen packets

struct gnet_stats_rate_est

rate estimator

Definition

struct gnet_stats_rate_est {
  __u32 bps;
  __u32 pps;
};

Members

bps

current byte rate

pps

current packet rate

struct gnet_stats_rate_est64

rate estimator

Definition

struct gnet_stats_rate_est64 {
  __u64 bps;
  __u64 pps;
};

Members

bps

current byte rate

pps

current packet rate

struct gnet_stats_queue

queuing statistics

Definition

struct gnet_stats_queue {
  __u32 qlen;
  __u32 backlog;
  __u32 drops;
  __u32 requeues;
  __u32 overlimits;
};

Members

qlen

queue length

backlog

backlog size of queue

drops

number of dropped packets

requeues

number of requeues

overlimits

number of enqueues over the limit

struct gnet_estimator

rate estimator configuration

Definition

struct gnet_estimator {
  signed char     interval;
  unsigned char   ewma_log;
};

Members

interval

sampling period

ewma_log

the log of measurement window weight

int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, int xstats_type, spinlock_t *lock, struct gnet_dump *d, int padattr)

start dumping procedure in compatibility mode

Parameters

struct sk_buff *skb

socket buffer to put statistics TLVs into

int type

TLV type for top level statistic TLV

int tc_stats_type

TLV type for backward compatibility struct tc_stats TLV

int xstats_type

TLV type for backward compatibility xstats TLV

spinlock_t *lock

statistics lock

struct gnet_dump *d

dumping handle

int padattr

padding attribute

Description

Initializes the dumping handle, grabs the statistic lock and appends an empty TLV header to the socket buffer for use as a container for all other statistic TLVs.

The dumping handle is marked to be in backward compatibility mode telling all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats.

Returns 0 on success or -1 if the room in the socket buffer was not sufficient.

int gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, struct gnet_dump *d, int padattr)

start dumping procedure

Parameters

struct sk_buff *skb

socket buffer to put statistics TLVs into

int type

TLV type for top level statistic TLV

spinlock_t *lock

statistics lock

struct gnet_dump *d

dumping handle

int padattr

padding attribute

Description

Initializes the dumping handle, grabs the statistic lock and appends an empty TLV header to the socket buffer for use as a container for all other statistic TLVs.

Returns 0 on success or -1 if the room in the socket buffer was not sufficient.

int gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_sync __percpu *cpu, struct gnet_stats_basic_sync *b, bool running)

copy basic statistics into statistic TLV

Parameters

struct gnet_dump *d

dumping handle

struct gnet_stats_basic_sync __percpu *cpu

copy statistic per cpu

struct gnet_stats_basic_sync *b

basic statistics

bool running

true if b represents a running qdisc, thus b’s internal values might change during basic reads. Only used if cpu is NULL

Context

task; must not be run from IRQ or BH contexts

Description

Appends the basic statistics to the top level TLV created by gnet_stats_start_copy().

Returns 0 on success or -1 with the statistic lock released if the room in the socket buffer was not sufficient.

int gnet_stats_copy_basic_hw(struct gnet_dump *d, struct gnet_stats_basic_sync __percpu *cpu, struct gnet_stats_basic_sync *b, bool running)

copy basic hw statistics into statistic TLV

Parameters

struct gnet_dump *d

dumping handle

struct gnet_stats_basic_sync __percpu *cpu

copy statistic per cpu

struct gnet_stats_basic_sync *b

basic statistics

bool running

true if b represents a running qdisc, thus b’s internal values might change during basic reads. Only used if cpu is NULL

Context

task; must not be run from IRQ or BH contexts

Description

Appends the basic statistics to the top level TLV created by gnet_stats_start_copy().

Returns 0 on success or -1 with the statistic lock released if the room in the socket buffer was not sufficient.

int gnet_stats_copy_rate_est(struct gnet_dump *d, struct net_rate_estimator __rcu **rate_est)

copy rate estimator statistics into statistics TLV

Parameters

struct gnet_dump *d

dumping handle

struct net_rate_estimator __rcu **rate_est

rate estimator

Description

Appends the rate estimator statistics to the top level TLV created by gnet_stats_start_copy().

Returns 0 on success or -1 with the statistic lock released if the room in the socket buffer was not sufficient.

int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue __percpu *cpu_q, struct gnet_stats_queue *q, __u32 qlen)

copy queue statistics into statistics TLV

Parameters

struct gnet_dump *d

dumping handle

struct gnet_stats_queue __percpu *cpu_q

per cpu queue statistics

struct gnet_stats_queue *q

queue statistics

__u32 qlen

queue length statistics

Description

Appends the queue statistics to the top level TLV created by gnet_stats_start_copy(). Using per cpu queue statistics if they are available.

Returns 0 on success or -1 with the statistic lock released if the room in the socket buffer was not sufficient.

int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)

copy application specific statistics into statistics TLV

Parameters

struct gnet_dump *d

dumping handle

void *st

application specific statistics data

int len

length of data

Description

Appends the application specific statistics to the top level TLV created by gnet_stats_start_copy() and remembers the data for XSTATS if the dumping handle is in backward compatibility mode.

Returns 0 on success or -1 with the statistic lock released if the room in the socket buffer was not sufficient.

int gnet_stats_finish_copy(struct gnet_dump *d)

finish dumping procedure

Parameters

struct gnet_dump *d

dumping handle

Description

Corrects the length of the top level TLV to include all TLVs added by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs if gnet_stats_start_copy_compat() was used and releases the statistics lock.

Returns 0 on success or -1 with the statistic lock released if the room in the socket buffer was not sufficient.

int gen_new_estimator(struct gnet_stats_basic_sync *bstats, struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, bool running, struct nlattr *opt)

create a new rate estimator

Parameters

struct gnet_stats_basic_sync *bstats

basic statistics

struct gnet_stats_basic_sync __percpu *cpu_bstats

bstats per cpu

struct net_rate_estimator __rcu **rate_est

rate estimator statistics

spinlock_t *lock

lock for statistics and control path

bool running

true if bstats represents a running qdisc, thus bstats’ internal values might change during basic reads. Only used if cpu_bstats is NULL

struct nlattr *opt

rate estimator configuration TLV

Description

Creates a new rate estimator with bstats as source and rate_est as destination. A new timer with the interval specified in the configuration TLV is created. Upon each interval, the latest statistics will be read from bstats and the estimated rate will be stored in rate_est with the statistics lock grabbed during this period.

Returns 0 on success or a negative error code.

void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)

remove a rate estimator

Parameters

struct net_rate_estimator __rcu **rate_est

rate estimator

Description

Removes the rate estimator.

int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, bool running, struct nlattr *opt)

replace rate estimator configuration

Parameters

struct gnet_stats_basic_sync *bstats

basic statistics

struct gnet_stats_basic_sync __percpu *cpu_bstats

bstats per cpu

struct net_rate_estimator __rcu **rate_est

rate estimator statistics

spinlock_t *lock

lock for statistics and control path

bool running

true if bstats represents a running qdisc, thus bstats’ internal values might change during basic reads. Only used if cpu_bstats is NULL

struct nlattr *opt

rate estimator configuration TLV

Description

Replaces the configuration of a rate estimator by calling gen_kill_estimator() and gen_new_estimator().

Returns 0 on success or a negative error code.

bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est)

test if estimator is currently in use

Parameters

struct net_rate_estimator __rcu **rate_est

rate estimator

Description

Returns true if estimator is active, and false if not.

SUN RPC subsystem

__be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int nbytes)

Encode fixed length opaque data

Parameters

__be32 *p

pointer to current position in XDR buffer.

const void *ptr

pointer to data to encode (or NULL)

unsigned int nbytes

size of data.

Description

Copy the array of data of length nbytes at ptr to the XDR buffer at position p, then align to the next 32-bit boundary by padding with zero bytes (see RFC1832). Returns the updated current XDR buffer position

Note

if ptr is NULL, only the padding is performed.

__be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int nbytes)

Encode variable length opaque data

Parameters

__be32 *p

pointer to current position in XDR buffer.

const void *ptr

pointer to data to encode (or NULL)

unsigned int nbytes

size of data.

Description

Returns the updated current XDR buffer position

void xdr_terminate_string(const struct xdr_buf *buf, const u32 len)

‘\0’-terminate a string residing in an xdr_buf

Parameters

const struct xdr_buf *buf

XDR buffer where string resides

const u32 len

length of string, in bytes

void xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, struct page **pages, unsigned int base, unsigned int len)

Prepare receive buffer for a large reply

Parameters

struct xdr_buf *xdr

xdr_buf into which reply will be placed

unsigned int offset

expected offset where data payload will start, in bytes

struct page **pages

vector of struct page pointers

unsigned int base

offset in first page where receive should start, in bytes

unsigned int len

expected size of the upper layer data payload, in bytes

void _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)

Parameters

char *p

pointer to destination

struct page **pages

array of pages

size_t pgbase

offset of source data

size_t len

length

Description

Copies data into an arbitrary memory location from an array of pages. The copy is assumed to be non-overlapping.

unsigned int xdr_stream_pos(const struct xdr_stream *xdr)

Return the current offset from the start of the xdr_stream

Parameters

const struct xdr_stream *xdr

pointer to struct xdr_stream

unsigned int xdr_page_pos(const struct xdr_stream *xdr)

Return the current offset from the start of the xdr pages

Parameters

const struct xdr_stream *xdr

pointer to struct xdr_stream

void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst)

Initialize a struct xdr_stream for sending data.

Parameters

struct xdr_stream *xdr

pointer to xdr_stream struct

struct xdr_buf *buf

pointer to XDR buffer in which to encode data

__be32 *p

current pointer inside XDR buffer

struct rpc_rqst *rqst

pointer to controlling rpc_rqst, for debugging

Note

at the moment the RPC client only passes the length of our scratch buffer in the xdr_buf’s header kvec. Previously this meant we needed to call xdr_adjust_iovec() after encoding the data. With the new scheme, the xdr_stream manages the details of the buffer length, and takes care of adjusting the kvec length for us.

void xdr_commit_encode(struct xdr_stream *xdr)

Ensure all data is written to buffer

Parameters

struct xdr_stream *xdr

pointer to xdr_stream

Description

We handle encoding across page boundaries by giving the caller a temporary location to write to, then later copying the data into place; xdr_commit_encode does that copying.

Normally the caller doesn’t need to call this directly, as the following xdr_reserve_space will do it. But an explicit call may be required at the end of encoding, or any other time when the xdr_buf data might be read.

__be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes)

Reserve buffer space for sending

Parameters

struct xdr_stream *xdr

pointer to xdr_stream

size_t nbytes

number of bytes to reserve

Description

Checks that we have enough buffer space to encode ‘nbytes’ more bytes of data. If so, update the total xdr_buf length, and adjust the length of the current kvec.

int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbytes)

Reserves a large amount of buffer space for sending

Parameters

struct xdr_stream *xdr

pointer to xdr_stream

struct kvec *vec

pointer to a kvec array

size_t nbytes

number of bytes to reserve

Description

Reserves enough buffer space to encode ‘nbytes’ of data and stores the pointers in ‘vec’. The size argument passed to xdr_reserve_space() is determined based on the number of bytes remaining in the current page to avoid invalidating iov_base pointers when xdr_commit_encode() is called.

void xdr_truncate_encode(struct xdr_stream *xdr, size_t len)

truncate an encode buffer

Parameters

struct xdr_stream *xdr

pointer to xdr_stream

size_t len

new length of buffer

Description

Truncates the xdr stream, so that xdr->buf->len == len, and xdr->p points at offset len from the start of the buffer, and head, tail, and page lengths are adjusted to correspond.

If this means moving xdr->p to a different buffer, we assume that the end pointer should be set to the end of the current page, except in the case of the head buffer when we assume the head buffer’s current length represents the end of the available buffer.

This is not safe to use on a buffer that already has inlined page cache pages (as in a zero-copy server read reply), except for the simple case of truncating from one position in the tail to another.

int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen)

decrease available buffer space

Parameters

struct xdr_stream *xdr

pointer to xdr_stream

int newbuflen

new maximum number of bytes available

Description

Adjust our idea of how much space is available in the buffer. If we’ve already used too much space in the buffer, returns -1. If the available space is already smaller than newbuflen, returns 0 and does nothing. Otherwise, adjusts xdr->buf->buflen to newbuflen and ensures xdr->end is set at most offset newbuflen from the start of the buffer.

void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len)

Insert a list of pages into an XDR buffer for sending

Parameters

struct xdr_stream *xdr

pointer to xdr_stream

struct page **pages

list of pages

unsigned int base

offset of first byte

unsigned int len

length of data in bytes

void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst)

Initialize an xdr_stream for decoding data.

Parameters

struct xdr_stream *xdr

pointer to xdr_stream struct

struct xdr_buf *buf

pointer to XDR buffer from which to decode data

__be32 *p

current pointer inside XDR buffer

struct rpc_rqst *rqst

pointer to controlling rpc_rqst, for debugging

void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, struct page **pages, unsigned int len)

Initialize an xdr_stream for decoding into pages

Parameters

struct xdr_stream *xdr

pointer to xdr_stream struct

struct xdr_buf *buf

pointer to XDR buffer from which to decode data

struct page **pages

list of pages to decode into

unsigned int len

length in bytes of buffer in pages

__be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)

Retrieve XDR data to decode

Parameters

struct xdr_stream *xdr

pointer to xdr_stream struct

size_t nbytes

number of bytes of data to decode

Description

Check if the input buffer is long enough to enable us to decode ‘nbytes’ more bytes of data starting at the current position. If so return the current pointer, then update the current pointer position.

unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len)

align page-based XDR data to current pointer position

Parameters

struct xdr_stream *xdr

pointer to xdr_stream struct

unsigned int len

number of bytes of page data

Description

Moves data beyond the current pointer position from the XDR head[] buffer into the page list. Any data that lies beyond current position + len bytes is moved into the XDR tail[]. The xdr_stream current position is then advanced past that data to align to the next XDR object in the tail.

Returns the number of XDR encoded bytes now contained in the pages

void xdr_enter_page(struct xdr_stream *xdr, unsigned int len)

decode data from the XDR page

Parameters

struct xdr_stream *xdr

pointer to xdr_stream struct

unsigned int len

number of bytes of page data

Description

Moves data beyond the current pointer position from the XDR head[] buffer into the page list. Any data that lies beyond current position + “len” bytes is moved into the XDR tail[]. The current pointer is then repositioned at the beginning of the first XDR page.

int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf, unsigned int base, unsigned int len)

set subbuf to a portion of buf

Parameters

const struct xdr_buf *buf

an xdr buffer

struct xdr_buf *subbuf

the result buffer

unsigned int base

beginning of range in bytes

unsigned int len

length of range in bytes

Description

sets subbuf to an xdr buffer representing the portion of buf of length len starting at offset base.

buf and subbuf may be pointers to the same struct xdr_buf.

Returns -1 if base or length are out of bounds.

bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf, unsigned int nbytes)

set subbuf to a portion of xdr

Parameters

struct xdr_stream *xdr

an xdr_stream set up for decoding

struct xdr_buf *subbuf

the result buffer

unsigned int nbytes

length of xdr to extract, in bytes

Description

Sets up subbuf to represent a portion of xdr. The portion starts at the current offset in xdr, and extends for a length of nbytes. If this is successful, xdr is advanced to the next XDR data item following that portion.

Return values:

true: subbuf has been initialized, and xdr has been advanced; false: a bounds error has occurred.

void xdr_buf_trim(struct xdr_buf *buf, unsigned int len)

lop at most “len” bytes off the end of “buf”

Parameters

struct xdr_buf *buf

buf to be trimmed

unsigned int len

number of bytes to reduce “buf” by

Description

Trim an xdr_buf by the given number of bytes by fixing up the lengths. Note that it’s possible that we’ll trim less than that amount if the xdr_buf is too small, or if (for instance) it’s all in the head and the parser has already read too far into it.

ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, size_t size)

Decode variable length opaque

Parameters

struct xdr_stream *xdr

pointer to xdr_stream

void *ptr

location to store opaque data

size_t size

size of storage buffer ptr

Description

Return values:

On success, returns size of object stored in *ptr; -EBADMSG on XDR buffer overflow; -EMSGSIZE on overflow of storage buffer ptr.

ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr, size_t maxlen, gfp_t gfp_flags)

Decode and duplicate variable length opaque

Parameters

struct xdr_stream *xdr

pointer to xdr_stream

void **ptr

location to store pointer to opaque data

size_t maxlen

maximum acceptable object size

gfp_t gfp_flags

GFP mask to use

Description

Return values:

On success, returns size of object stored in *ptr; -EBADMSG on XDR buffer overflow; -EMSGSIZE if the size of the object would exceed maxlen; -ENOMEM on memory allocation failure.

ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, size_t size)

Decode variable length string

Parameters

struct xdr_stream *xdr

pointer to xdr_stream

char *str

location to store string

size_t size

size of storage buffer str

Description

Return values:

On success, returns length of NUL-terminated string stored in *str; -EBADMSG on XDR buffer overflow; -EMSGSIZE on overflow of storage buffer str.

ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, size_t maxlen, gfp_t gfp_flags)

Decode and duplicate variable length string

Parameters

struct xdr_stream *xdr

pointer to xdr_stream

char **str

location to store pointer to string

size_t maxlen

maximum acceptable string length

gfp_t gfp_flags

GFP mask to use

Description

Return values:

On success, returns length of NUL-terminated string stored in *str; -EBADMSG on XDR buffer overflow; -EMSGSIZE if the size of the string would exceed maxlen; -ENOMEM on memory allocation failure.

void svc_xprt_deferred_close(struct svc_xprt *xprt)

Close a transport

Parameters

struct svc_xprt *xprt

transport instance

Description

Used in contexts that need to defer the work of shutting down the transport to an nfsd thread.

void svc_xprt_received(struct svc_xprt *xprt)

start next receiver thread

Parameters

struct svc_xprt *xprt

controlling transport

Description

The caller must hold the XPT_BUSY bit and must not thereafter touch transport data.

Note

XPT_DATA only gets cleared when a read-attempt finds no (or insufficient) data.

char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)

Format rq_addr field for printing

Parameters

struct svc_rqst *rqstp

svc_rqst struct containing address to print

char *buf

target buffer for formatted address

size_t len

length of target buffer

void svc_reserve(struct svc_rqst *rqstp, int space)

change the space reserved for the reply to a request.

Parameters

struct svc_rqst *rqstp

The request in question

int space

new max space to reserve

Description

Each request reserves some space on the output queue of the transport to make sure the reply fits. This function reduces that reserved space to be the amount of space used already, plus space.

struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, struct net *net, const sa_family_t af, const unsigned short port)

find an RPC transport instance

Parameters

struct svc_serv *serv

pointer to svc_serv to search

const char *xcl_name

C string containing transport’s class name

struct net *net

owner net pointer

const sa_family_t af

Address family of transport’s local address

const unsigned short port

transport’s IP port number

Description

Return the transport instance pointer for the endpoint accepting connections/peer traffic from the specified transport class, address family and port.

Specifying 0 for the address family or port is effectively a wild-card, and will result in matching the first transport in the service’s list that has a matching class name.

int svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen)

format a buffer with a list of transport names

Parameters

struct svc_serv *serv

pointer to an RPC service

char *buf

pointer to a buffer to be filled in

const int buflen

length of buffer to be filled in

Description

Fills in buf with a string containing a list of transport names, each name terminated with ‘\n’.

Returns positive length of the filled-in string on success; otherwise a negative errno value is returned if an error occurs.

int xprt_register_transport(struct xprt_class *transport)

register a transport implementation

Parameters

struct xprt_class *transport

transport to register

Description

If a transport implementation is loaded as a kernel module, it can call this interface to make itself known to the RPC client.

Return

0: transport successfully registered; -EEXIST: transport already registered; -EINVAL: transport module being unloaded.

int xprt_unregister_transport(struct xprt_class *transport)

unregister a transport implementation

Parameters

struct xprt_class *transport

transport to unregister

Return

0: transport successfully unregistered; -ENOENT: transport never registered.

int xprt_find_transport_ident(const char *netid)

convert a netid into a transport identifier

Parameters

const char *netid

transport to load

Return

> 0: transport identifier; -ENOENT: transport module not available.

int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)

serialize write access to transports

Parameters

struct rpc_xprt *xprt

pointer to the target transport

struct rpc_task *task

task that is requesting access to the transport

Description

This prevents mixing the payload of separate requests, and prevents transport connects from colliding with writes. No congestion control is provided.

void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)

allow other requests to use a transport

Parameters

struct rpc_xprt *xprt

transport with other tasks potentially waiting

struct rpc_task *task

task that is releasing access to the transport

Description

Note that “task” can be NULL. No congestion control is provided.

void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)

allow other requests to use a transport

Parameters

struct rpc_xprt *xprt

transport with other tasks potentially waiting

struct rpc_task *task

task that is releasing access to the transport

Description

Note that “task” can be NULL. Another task is awoken to use the transport if the transport’s congestion window allows it.

bool xprt_request_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)

Request congestion control credits

Parameters

struct rpc_xprt *xprt

pointer to transport

struct rpc_rqst *req

pointer to RPC request

Description

Useful for transports that require congestion control.

void xprt_release_rqst_cong(struct rpc_task *task)

housekeeping when request is complete

Parameters

struct rpc_task *task

RPC request that recently completed

Description

Useful for transports that require congestion control.

void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result)

adjust transport congestion window

Parameters

struct rpc_xprt *xprt

pointer to xprt

struct rpc_task *task

recently completed RPC request used to adjust window

int result

result code of completed RPC request

Description

The transport code maintains an estimate on the maximum number of outstanding RPC requests, using a smoothed version of the congestion avoidance implemented in 4.4BSD. This is basically the Van Jacobson congestion algorithm: If a retransmit occurs, the congestion window is halved; otherwise, it is incremented by 1/cwnd when

  • a reply is received and

  • a full number of requests are outstanding and

  • the congestion window hasn’t been updated recently.

void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status)

wake all tasks on a transport’s pending queue

Parameters

struct rpc_xprt *xprt

transport with waiting tasks

int status

result code to plant in each task before waking it

void xprt_wait_for_buffer_space(struct rpc_xprt *xprt)

wait for transport output buffer to clear

Parameters

struct rpc_xprt *xprt

transport

Description

Note that we only set the timer for the case of RPC_IS_SOFT(), since we don’t in general want to force a socket disconnection due to an incomplete RPC call transmission.

bool xprt_write_space(struct rpc_xprt *xprt)

wake the task waiting for transport output buffer space

Parameters

struct rpc_xprt *xprt

transport with waiting tasks

Description

Can be called in a soft IRQ context, so xprt_write_space never sleeps.

void xprt_disconnect_done(struct rpc_xprt *xprt)

mark a transport as disconnected

Parameters

struct rpc_xprt *xprt

transport to flag for disconnect

void xprt_force_disconnect(struct rpc_xprt *xprt)

force a transport to disconnect

Parameters

struct rpc_xprt *xprt

transport to disconnect

unsigned long xprt_reconnect_delay(const struct rpc_xprt *xprt)

compute the wait before scheduling a connect

Parameters

const struct rpc_xprt *xprt

transport instance

void xprt_reconnect_backoff(struct rpc_xprt *xprt, unsigned long init_to)

compute the new re-establish timeout

Parameters

struct rpc_xprt *xprt

transport instance

unsigned long init_to

initial reestablish timeout

struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)

find an RPC request corresponding to an XID

Parameters

struct rpc_xprt *xprt

transport on which the original request was transmitted

__be32 xid

RPC XID of incoming reply

Description

Caller holds xprt->queue_lock.

void xprt_pin_rqst(struct rpc_rqst *req)

Pin a request on the transport receive list

Parameters

struct rpc_rqst *req

Request to pin

Description

Caller must ensure this is atomic with the call to xprt_lookup_rqst() so should be holding xprt->queue_lock.

void xprt_unpin_rqst(struct rpc_rqst *req)

Unpin a request on the transport receive list

Parameters

struct rpc_rqst *req

Request to unpin

Description

Caller should be holding xprt->queue_lock.

void xprt_update_rtt(struct rpc_task *task)

Update RPC RTT statistics

Parameters

struct rpc_task *task

RPC request that recently completed

Description

Caller holds xprt->queue_lock.

void xprt_complete_rqst(struct rpc_task *task, int copied)

called when reply processing is complete

Parameters

struct rpc_task *task

RPC request that recently completed

int copied

actual number of bytes received from the transport

Description

Caller holds xprt->queue_lock.

void xprt_wait_for_reply_request_def(struct rpc_task *task)

wait for reply

Parameters

struct rpc_task *task

pointer to rpc_task

Description

Set a request’s retransmit timeout based on the transport’s default timeout parameters, and put the task to sleep on the pending queue. Used by transports that don’t adjust the retransmit timeout based on round-trip time estimation.

void xprt_wait_for_reply_request_rtt(struct rpc_task *task)

wait for reply using RTT estimator

Parameters

struct rpc_task *task

pointer to rpc_task

Description

Set a request’s retransmit timeout using the RTT estimator, and put the task to sleep on the pending queue.

struct rpc_xprt *xprt_get(struct rpc_xprt *xprt)

return a reference to an RPC transport.

Parameters

struct rpc_xprt *xprt

pointer to the transport

void xprt_put(struct rpc_xprt *xprt)

release a reference to an RPC transport.

Parameters

struct rpc_xprt *xprt

pointer to the transport

void rpc_wake_up(struct rpc_wait_queue *queue)

wake up all rpc_tasks

Parameters

struct rpc_wait_queue *queue

rpc_wait_queue on which the tasks are sleeping

Description

Grabs queue->lock

void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)

wake up all rpc_tasks and set their status value.

Parameters

struct rpc_wait_queue *queue

rpc_wait_queue on which the tasks are sleeping

int status

status value to set

Description

Grabs queue->lock

int rpc_malloc(struct rpc_task *task)

allocate RPC buffer resources

Parameters

struct rpc_task *task

RPC task

Description

A single memory region is allocated, which is split between the RPC call and RPC reply that this task is being used for. When this RPC is retired, the memory is released by calling rpc_free.

To prevent rpciod from hanging, this allocator never sleeps, returning -ENOMEM and suppressing warning if the request cannot be serviced immediately. The caller can arrange to sleep in a way that is safe for rpciod.

Most requests are ‘small’ (under 2KiB) and can be serviced from a mempool, ensuring that NFS reads and writes can always proceed, and that there is good locality of reference for these buffers.

void rpc_free(struct rpc_task *task)

free RPC buffer resources allocated via rpc_malloc

Parameters

struct rpc_task *task

RPC task

int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)

checksum and copy data

Parameters

struct xdr_buf *xdr

target XDR buffer

struct sk_buff *skb

source skb

Description

We have set things up such that we perform the checksum of the UDP packet in parallel with the copies into the RPC client iovec. -DaveM

struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt)

allocate an rpc_iostats structure

Parameters

struct rpc_clnt *clnt

RPC program, version, and xprt

void rpc_free_iostats(struct rpc_iostats *stats)

release an rpc_iostats structure

Parameters

struct rpc_iostats *stats

doomed rpc_iostats structure

void rpc_count_iostats_metrics(const struct rpc_task *task, struct rpc_iostats *op_metrics)

tally up per-task stats

Parameters

const struct rpc_task *task

completed rpc_task

struct rpc_iostats *op_metrics

stat structure for OP that will accumulate stats from task

void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats)

tally up per-task stats

Parameters

const struct rpc_task *task

completed rpc_task

struct rpc_iostats *stats

array of stat structures

Description

Uses the statidx from task

int rpc_queue_upcall(struct rpc_pipe *pipe, struct rpc_pipe_msg *msg)

queue an upcall message to userspace

Parameters

struct rpc_pipe *pipe

upcall pipe on which to queue given message

struct rpc_pipe_msg *msg

message to queue

Description

Call with an inode created by rpc_mkpipe() to queue an upcall. A userspace process may then later read the upcall by performing a read on an open file for this inode. It is up to the caller to initialize the fields of msg (other than msg->list) appropriately.

struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, void *private, struct rpc_pipe *pipe)

make an rpc_pipefs file for kernel<->userspace communication

Parameters

struct dentry *parent

dentry of directory to create new “pipe” in

const char *name

name of pipe

void *private

private data to associate with the pipe, for the caller’s use

struct rpc_pipe *pipe

rpc_pipe containing input parameters

Description

Data is made available for userspace to read by calls to rpc_queue_upcall(). The actual reads will result in calls to ops->upcall, which will be called with the file pointer, message, and userspace buffer to copy to.

Writes can come at any time, and do not necessarily have to be responses to upcalls. They will result in calls to msg->downcall.

The private argument passed here will be available to all these methods from the file pointer, via RPC_I(file_inode(file))->private.

int rpc_unlink(struct dentry *dentry)

remove a pipe

Parameters

struct dentry *dentry

dentry for the pipe, as returned from rpc_mkpipe

Description

After this call, lookups will no longer find the pipe, and any attempts to read or write using preexisting opens of the pipe will return -EPIPE.

void rpc_init_pipe_dir_head(struct rpc_pipe_dir_head *pdh)

initialise a struct rpc_pipe_dir_head

Parameters

struct rpc_pipe_dir_head *pdh

pointer to struct rpc_pipe_dir_head

void rpc_init_pipe_dir_object(struct rpc_pipe_dir_object *pdo, const struct rpc_pipe_dir_object_ops *pdo_ops, void *pdo_data)

initialise a struct rpc_pipe_dir_object

Parameters

struct rpc_pipe_dir_object *pdo

pointer to struct rpc_pipe_dir_object

const struct rpc_pipe_dir_object_ops *pdo_ops

pointer to const struct rpc_pipe_dir_object_ops

void *pdo_data

pointer to caller-defined data

int rpc_add_pipe_dir_object(struct net *net, struct rpc_pipe_dir_head *pdh, struct rpc_pipe_dir_object *pdo)

associate a rpc_pipe_dir_object to a directory

Parameters

struct net *net

pointer to struct net

struct rpc_pipe_dir_head *pdh

pointer to struct rpc_pipe_dir_head

struct rpc_pipe_dir_object *pdo

pointer to struct rpc_pipe_dir_object

void rpc_remove_pipe_dir_object(struct net *net, struct rpc_pipe_dir_head *pdh, struct rpc_pipe_dir_object *pdo)

remove a rpc_pipe_dir_object from a directory

Parameters

struct net *net

pointer to struct net

struct rpc_pipe_dir_head *pdh

pointer to struct rpc_pipe_dir_head

struct rpc_pipe_dir_object *pdo

pointer to struct rpc_pipe_dir_object

struct rpc_pipe_dir_object *rpc_find_or_alloc_pipe_dir_object(struct net *net, struct rpc_pipe_dir_head *pdh, int (*match)(struct rpc_pipe_dir_object*, void*), struct rpc_pipe_dir_object *(*alloc)(void*), void *data)

Parameters

struct net *net

pointer to struct net

struct rpc_pipe_dir_head *pdh

pointer to struct rpc_pipe_dir_head

int (*match)(struct rpc_pipe_dir_object *, void *)

match struct rpc_pipe_dir_object to data

struct rpc_pipe_dir_object *(*alloc)(void *)

allocate a new struct rpc_pipe_dir_object

void *data

user defined data for match() and alloc()

void rpcb_getport_async(struct rpc_task *task)

obtain the port for a given RPC service on a given host

Parameters

struct rpc_task *task

task that is waiting for portmapper request

Description

This one can be called for an ongoing RPC request, and can be used in an async (rpciod) context.

struct rpc_clnt *rpc_create(struct rpc_create_args *args)

create an RPC client and transport with one call

Parameters

struct rpc_create_args *args

rpc_clnt create argument structure

Description

Creates and initializes an RPC transport and an RPC client.

It can ping the server in order to determine if it is up, and to see if it supports this program and version. RPC_CLNT_CREATE_NOPING disables this behavior so asynchronous tasks can also use rpc_create.

struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt)

Clone an RPC client structure

Parameters

struct rpc_clnt *clnt

RPC client whose parameters are copied

Description

Returns a fresh RPC client or an ERR_PTR.

struct rpc_clnt *rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor)

Clone an RPC client structure and set its auth

Parameters

struct rpc_clnt *clnt

RPC client whose parameters are copied

rpc_authflavor_t flavor

security flavor for new client

Description

Returns a fresh RPC client or an ERR_PTR.

int rpc_switch_client_transport(struct rpc_clnt *clnt, struct xprt_create *args, const struct rpc_timeout *timeout)

switch the RPC transport on the fly

Parameters

struct rpc_clnt *clnt

pointer to a struct rpc_clnt

struct xprt_create *args

pointer to the new transport arguments

const struct rpc_timeout *timeout

pointer to the new timeout parameters

Description

This function allows the caller to switch the RPC transport for the rpc_clnt structure ‘clnt’ to allow it to connect to a mirrored NFS server, for instance. It assumes that the caller has ensured that there are no active RPC tasks by using some form of locking.

Returns zero if “clnt” is now using the new xprt. Otherwise a negative errno is returned, and “clnt” continues to use the old xprt.

int rpc_clnt_iterate_for_each_xprt(struct rpc_clnt *clnt, int (*fn)(struct rpc_clnt*, struct rpc_xprt*, void*), void *data)

Apply a function to all transports

Parameters

struct rpc_clnt *clnt

pointer to client

int (*fn)(struct rpc_clnt *, struct rpc_xprt *, void *)

function to apply

void *data

void pointer to function data

Description

Iterates through the list of RPC transports currently attached to the client and applies the function fn(clnt, xprt, data).

On error, the iteration stops, and the function returns the error value.

struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old, const struct rpc_program *program, u32 vers)

bind a new RPC program to an existing client

Parameters

struct rpc_clnt *old

old rpc_client

const struct rpc_program *program

rpc program to set

u32 vers

rpc program version

Description

Clones the rpc client and sets up a new RPC program. This is mainly of use for enabling different RPC programs to share the same transport. The Sun NFSv2/v3 ACL protocol can do this.

struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)

Allocate a new RPC task, then run rpc_execute against it

Parameters

const struct rpc_task_setup *task_setup_data

pointer to task initialisation data

int rpc_call_sync(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags)

Perform a synchronous RPC call

Parameters

struct rpc_clnt *clnt

pointer to RPC client

const struct rpc_message *msg

RPC call parameters

int flags

RPC call flags

int rpc_call_async(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags, const struct rpc_call_ops *tk_ops, void *data)

Perform an asynchronous RPC call

Parameters

struct rpc_clnt *clnt

pointer to RPC client

const struct rpc_message *msg

RPC call parameters

int flags

RPC call flags

const struct rpc_call_ops *tk_ops

RPC call ops

void *data

user call data

void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages, unsigned int base, unsigned int len, unsigned int hdrsize)

Prepare to receive a reply data payload into pages

Parameters

struct rpc_rqst *req

RPC request to prepare

struct page **pages

vector of struct page pointers

unsigned int base

offset in first page where receive should start, in bytes

unsigned int len

expected size of the upper layer data payload, in bytes

unsigned int hdrsize

expected size of upper layer reply header, in XDR words

size_t rpc_peeraddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t bufsize)

extract remote peer address from clnt’s xprt

Parameters

struct rpc_clnt *clnt

RPC client structure

struct sockaddr *buf

target buffer

size_t bufsize

length of target buffer

Description

Returns the number of bytes that are actually in the stored address.

const char *rpc_peeraddr2str(struct rpc_clnt *clnt, enum rpc_display_format_t format)

return remote peer address in printable format

Parameters

struct rpc_clnt *clnt

RPC client structure

enum rpc_display_format_t format

address format

Description

NB: the lifetime of the memory referenced by the returned pointer is the same as the rpc_xprt itself. As long as the caller uses this pointer, it must hold the RCU read lock.

int rpc_localaddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t buflen)

discover local endpoint address for an RPC client

Parameters

struct rpc_clnt *clnt

RPC client structure

struct sockaddr *buf

target buffer

size_t buflen

size of target buffer, in bytes

Description

Returns zero and fills in “buf” and “buflen” if successful; otherwise, a negative errno is returned.

This works even if the underlying transport is not currently connected, or if the upper layer never previously provided a source address.

The result of this function call is transient: multiple calls in succession may give different results, depending on how local networking configuration changes over time.

struct net *rpc_net_ns(struct rpc_clnt *clnt)

Get the network namespace for this RPC client

Parameters

struct rpc_clnt *clnt

RPC client to query

size_t rpc_max_payload(struct rpc_clnt *clnt)

Get maximum payload size for a transport, in bytes

Parameters

struct rpc_clnt *clnt

RPC client to query

Description

For stream transports, this is one RPC record fragment (see RFC 1831), as we don’t support multi-record requests yet. For datagram transports, this is the size of an IP packet minus the IP, UDP, and RPC header sizes.

size_t rpc_max_bc_payload(struct rpc_clnt *clnt)

Get maximum backchannel payload size, in bytes

Parameters

struct rpc_clnt *clnt

RPC client to query

void rpc_force_rebind(struct rpc_clnt *clnt)

force transport to check that remote port is unchanged

Parameters

struct rpc_clnt *clnt

client to rebind

int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, void *dummy)

Test and add a new transport to a rpc_clnt

Parameters

struct rpc_clnt *clnt

pointer to struct rpc_clnt

struct rpc_xprt_switch *xps

pointer to struct rpc_xprt_switch

struct rpc_xprt *xprt

pointer to struct rpc_xprt

void *dummy

unused

int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, void *data)

Parameters

struct rpc_clnt *clnt

struct rpc_clnt to get the new transport

struct rpc_xprt_switch *xps

the rpc_xprt_switch to hold the new transport

struct rpc_xprt *xprt

the rpc_xprt to test

void *data

a struct rpc_add_xprt_test pointer that holds the test function and test function call data

Description

This is an rpc_clnt_add_xprt setup() function which returns 1 so:

1) caller of the test function must dereference the rpc_xprt_switch and the rpc_xprt.

2) test function must call rpc_xprt_switch_add_xprt, usually in the rpc_call_done routine.

Upon success (return of 1), the test function adds the new transport to the rpc_clnt xprt switch

int rpc_clnt_add_xprt(struct rpc_clnt *clnt, struct xprt_create *xprtargs, int (*setup)(struct rpc_clnt*, struct rpc_xprt_switch*, struct rpc_xprt*, void*), void *data)

Add a new transport to a rpc_clnt

Parameters

struct rpc_clnt *clnt

pointer to struct rpc_clnt

struct xprt_create *xprtargs

pointer to struct xprt_create

int (*setup)(struct rpc_clnt *, struct rpc_xprt_switch *, struct rpc_xprt *, void *)

callback to test and/or set up the connection

void *data

pointer to setup function data

Description

Creates a new transport using the parameters set in xprtargs and adds it to clnt. If ping is set, then test that connectivity succeeds before adding the new transport.

Network device support

Driver Support

void dev_add_pack(struct packet_type *pt)

add packet handler

Parameters

struct packet_type *pt

packet type declaration

Add a protocol handler to the networking stack. The passed packet_type is linked into kernel lists and may not be freed until it has been removed from the kernel lists.

This call does not sleep, therefore it cannot guarantee that all CPUs that are in the middle of receiving packets will see the new packet type (until the next received packet).

void __dev_remove_pack(struct packet_type *pt)

remove packet handler

Parameters

struct packet_type *pt

packet type declaration

Remove a protocol handler that was previously added to the kernel protocol handlers by dev_add_pack(). The passed packet_type is removed from the kernel lists and can be freed or reused once this function returns.

The packet type might still be in use by receivers and must not be freed until after all the CPUs have gone through a quiescent state.

void dev_remove_pack(struct packet_type *pt)

remove packet handler

Parameters

struct packet_type *pt

packet type declaration

Remove a protocol handler that was previously added to the kernel protocol handlers by dev_add_pack(). The passed packet_type is removed from the kernel lists and can be freed or reused once this function returns.

This call sleeps to guarantee that no CPU is looking at the packet type after return.

void dev_add_offload(struct packet_offload *po)

register offload handlers

Parameters

struct packet_offload *po

protocol offload declaration

Add protocol offload handlers to the networking stack. The passed packet_offload is linked into kernel lists and may not be freed until it has been removed from the kernel lists.

This call does not sleep, therefore it cannot guarantee that all CPUs that are in the middle of receiving packets will see the new offload handlers (until the next received packet).

void dev_remove_offload(struct packet_offload *po)

remove packet offload handler

Parameters

struct packet_offload *po

packet offload declaration

Remove a packet offload handler that was previously added to the kernel offload handlers by dev_add_offload(). The passed packet_offload is removed from the kernel lists and can be freed or reused once this function returns.

This call sleeps to guarantee that no CPU is looking at the packet type after return.

int dev_get_iflink(const struct net_device *dev)

get ‘iflink’ value of an interface

Parameters

const struct net_device *dev

targeted interface

Indicates the ifindex the interface is linked to. Physical interfaces have the same ‘ifindex’ and ‘iflink’ values.

int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)

Retrieve tunnel egress information.

Parameters

struct net_device *dev

targeted interface

struct sk_buff *skb

The packet.

For better visibility of tunnel traffic OVS needs to retrieve egress tunnel information for a packet. Following API allows user to get this info.

struct net_device *__dev_get_by_name(struct net *net, const char *name)

find a device by its name

Parameters

struct net *net

the applicable net namespace

const char *name

name to find

Find an interface by name. Must be called under RTNL semaphore or dev_base_lock. If the name is found a pointer to the device is returned. If the name is not found then NULL is returned. The reference counters are not incremented so the caller must be careful with locks.

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)

find a device by its name

Parameters

struct net *net

the applicable net namespace

const char *name

name to find

Description

Find an interface by name. If the name is found a pointer to the device is returned. If the name is not found then NULL is returned. The reference counters are not incremented so the caller must be careful with locks. The caller must hold RCU lock.

struct net_device *dev_get_by_name(struct net *net, const char *name)

find a device by its name

Parameters

struct net *net

the applicable net namespace

const char *name

name to find

Find an interface by name. This can be called from any context and does its own locking. The returned handle has the usage count incremented and the caller must use dev_put() to release it when it is no longer needed. NULL is returned if no matching device is found.

struct net_device *__dev_get_by_index(struct net *net, int ifindex)

find a device by its ifindex

Parameters

struct net *net

the applicable net namespace

int ifindex

index of device

Search for an interface by index. Returns NULL if the device is not found or a pointer to the device. The device has not had its reference counter increased so the caller must be careful about locking. The caller must hold either the RTNL semaphore or dev_base_lock.

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)

find a device by its ifindex

Parameters

struct net *net

the applicable net namespace

int ifindex

index of device

Search for an interface by index. Returns NULL if the device is not found or a pointer to the device. The device has not had its reference counter increased so the caller must be careful about locking. The caller must hold RCU lock.

struct net_device *dev_get_by_index(struct net *net, int ifindex)

find a device by its ifindex

Parameters

struct net *net

the applicable net namespace

int ifindex

index of device

Search for an interface by index. Returns NULL if the device is not found or a pointer to the device. The device returned has had a reference added and the pointer is safe until the user calls dev_put to indicate they have finished with it.

struct net_device *dev_get_by_napi_id(unsigned int napi_id)

find a device by napi_id

Parameters

unsigned int napi_id

ID of the NAPI struct

Search for an interface by NAPI ID. Returns NULL if the device is not found or a pointer to the device. The device has not had its reference counter increased so the caller must be careful about locking. The caller must hold RCU lock.

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *ha)

find a device by its hardware address

Parameters

struct net *net

the applicable net namespace

unsigned short type

media type of device

const char *ha

hardware address

Search for an interface by MAC address. Returns NULL if the device is not found or a pointer to the device. The caller must hold RCU or RTNL. The returned device has not had its ref count increased and the caller must therefore be careful about locking.

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)

find any device with given flags

Parameters

struct net *net

the applicable net namespace

unsigned short if_flags

IFF_* values

unsigned short mask

bitmask of bits in if_flags to check

Search for any interface with the given flags. Returns NULL if a device is not found or a pointer to the device. Must be called inside rtnl_lock(), and result refcount is unchanged.

bool dev_valid_name(const char *name)

check if name is okay for network device

Parameters

const char *name

name string

Network device names need to be valid file names to allow sysfs to work. We also disallow any kind of whitespace.

int dev_alloc_name(struct net_device *dev, const char *name)

allocate a name for a device

Parameters

struct net_device *dev

device

const char *name

name format string

Passed a format string, e.g. “lt%d”, it will try and find a suitable id. It scans the list of devices to build up a free map, then chooses the first empty slot. The caller must hold the dev_base or rtnl lock while allocating the name and adding the device in order to avoid duplicates. Limited to bits_per_byte * page size devices (i.e. 32K on most platforms). Returns the number of the unit assigned or a negative errno code.

int dev_set_alias(struct net_device *dev, const char *alias, size_t len)

change ifalias of a device

Parameters

struct net_device *dev

device

const char *alias

name up to IFALIASZ

size_t len

limit of bytes to copy from alias

Set ifalias for a device.

void netdev_features_change(struct net_device *dev)

device changes features

Parameters

struct net_device *dev

device to cause notification

Called to indicate a device has changed features.

void netdev_state_change(struct net_device *dev)

device changes state

Parameters

struct net_device *dev

device to cause notification

Called to indicate a device has changed state. This function calls the notifier chains for netdev_chain and sends a NEWLINK message to the routing socket.

void __netdev_notify_peers(struct net_device *dev)

notify network peers about existence of dev, to be called when rtnl lock is already held.

Parameters

struct net_device *dev

network device

Description

Generate traffic such that interested network peers are aware of dev, such as by generating a gratuitous ARP. This may be used when a device wants to inform the rest of the network about some sort of reconfiguration such as a failover event or virtual machine migration.

void netdev_notify_peers(struct net_device *dev)

notify network peers about existence of dev

Parameters

struct net_device *dev

network device

Description

Generate traffic such that interested network peers are aware of dev, such as by generating a gratuitous ARP. This may be used when a device wants to inform the rest of the network about some sort of reconfiguration such as a failover event or virtual machine migration.

int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)

prepare an interface for use.

Parameters

struct net_device *dev

device to open

struct netlink_ext_ack *extack

netlink extended ack

Takes a device from down to up state. The device’s private open function is invoked and then the multicast lists are loaded. Finally the device is moved into the up state and a NETDEV_UP message is sent to the netdev notifier chain.

Calling this function on an active interface is a nop. On a failure a negative errno code is returned.

void dev_close(struct net_device *dev)

shutdown an interface.

Parameters

struct net_device *dev

device to shutdown

This function moves an active device into down state. A NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device is then deactivated and finally a NETDEV_DOWN is sent to the notifier chain.

void dev_disable_lro(struct net_device *dev)

disable Large Receive Offload on a device

Parameters

struct net_device *dev

device

Disable Large Receive Offload (LRO) on a net device. Must be called under RTNL. This is needed if received packets may be forwarded to another interface.

int register_netdevice_notifier(struct notifier_block *nb)

register a network notifier block

Parameters

struct notifier_block *nb

notifier

Description

Register a notifier to be called when network device events occur. The notifier passed is linked into the kernel structures and must not be reused until it has been unregistered. A negative errno code is returned on a failure.

When registered all registration and up events are replayed to the new notifier to allow device to have a race free view of the network device list.

int unregister_netdevice_notifier(struct notifier_block *nb)

unregister a network notifier block

Parameters

struct notifier_block *nb

notifier

Description

Unregister a notifier previously registered by register_netdevice_notifier(). The notifier is unlinked from the kernel structures and may then be reused. A negative errno code is returned on a failure.

After unregistering, unregister and down device events are synthesized for all devices on the device list to the removed notifier to remove the need for special case cleanup code.

int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)

register a per-netns network notifier block

Parameters

struct net *net

network namespace

struct notifier_block *nb

notifier

Description

Register a notifier to be called when network device events occur. The notifier passed is linked into the kernel structures and must not be reused until it has been unregistered. A negative errno code is returned on a failure.

When registered all registration and up events are replayed to the new notifier to allow device to have a race free view of the network device list.

int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb)

unregister a per-netns network notifier block

Parameters

struct net *net

network namespace

struct notifier_block *nb

notifier

Description

Unregister a notifier previously registered by register_netdevice_notifier_net(). The notifier is unlinked from the kernel structures and may then be reused. A negative errno code is returned on a failure.

After unregistering, unregister and down device events are synthesized for all devices on the device list to the removed notifier to remove the need for special case cleanup code.

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)

call all network notifier blocks

Parameters

unsigned long val

value passed unmodified to notifier function

struct net_device *dev

net_device pointer passed unmodified to notifier function

Call all network notifier blocks. Parameters and return value are as for raw_notifier_call_chain().

int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)

loopback an skb to another netif

Parameters

struct net_device *dev

destination network device

struct sk_buff *skb

buffer to forward

Description

return values:

NET_RX_SUCCESS (no congestion) NET_RX_DROP (packet was dropped, but freed)

dev_forward_skb can be used for injecting an skb from the start_xmit function of one device into the receive queue of another device.

The receiving device may be in another namespace, so we have to clear all information in the skb that could impact namespace isolation.

bool dev_nit_active(struct net_device *dev)

return true if any network interface taps are in use

Parameters

struct net_device *dev

network device to check for the presence of taps

int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)

set actual number of RX queues used

Parameters

struct net_device *dev

Network device

unsigned int rxq

Actual number of RX queues

This must be called either with the rtnl_lock held or before registration of the net device. Returns 0 on success, or a negative error code. If called before registration, it always succeeds.

int netif_set_real_num_queues(struct net_device *dev, unsigned int txq, unsigned int rxq)

set actual number of RX and TX queues used

Parameters

struct net_device *dev

Network device

unsigned int txq

Actual number of TX queues

unsigned int rxq

Actual number of RX queues

Set the real number of both TX and RX queues. Does nothing if the number of queues is already correct.

int netif_get_num_default_rss_queues(void)

default number of RSS queues

Parameters

void

no arguments

Description

This routine should set an upper limit on the number of RSS queues used by default by multiqueue devices.

void netif_device_detach(struct net_device *dev)

mark device as removed

Parameters

struct net_device *dev

network device

Description

Mark device as removed from system and therefore no longer available.

void netif_device_attach(struct net_device *dev)

mark device as attached

Parameters

struct net_device *dev

network device

Description

Mark device as attached to the system and restart if needed.

struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features)

mac layer segmentation handler.

Parameters

struct sk_buff *skb

buffer to segment

netdev_features_t features

features for the output path (see dev->features)

struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path)

Perform segmentation on skb.

Parameters

struct sk_buff *skb

buffer to segment

netdev_features_t features

features for the output path (see dev->features)

bool tx_path

whether it is called in TX path

This function segments the given skb and returns a list of segments.

It may return NULL if the skb requires no segmentation. This is only possible when GSO is used for verifying header integrity.

Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.

int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)

loop back skb

Parameters

struct net *net

network namespace this loopback is happening in

struct sock *sk

sk needed to be a netfilter okfn

struct sk_buff *skb

buffer to transmit

bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id)

check whether an RFS hardware filter may be removed

Parameters

struct net_device *dev

Device on which the filter was set

u16 rxq_index

RX queue index

u32 flow_id

Flow ID passed to ndo_rx_flow_steer()

u16 filter_id

Filter ID returned by ndo_rx_flow_steer()

Description

Drivers that implement ndo_rx_flow_steer() should periodically call this function for each installed filter and remove the filters for which it returns true.

int netif_rx(struct sk_buff *skb)

post buffer to the network code

Parameters

struct sk_buff *skb

buffer to post

This function receives a packet from a device driver and queues it for the upper (protocol) levels to process. It always succeeds. The buffer may be dropped during processing for congestion control or by the protocol layers.

return values: NET_RX_SUCCESS (no congestion) NET_RX_DROP (packet was dropped)

bool netdev_is_rx_handler_busy(struct net_device *dev)

check if receive handler is registered

Parameters

struct net_device *dev

device to check

Check if a receive handler is already registered for a given device. Return true if there is one.

The caller must hold the rtnl_mutex.

int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data)

register receive handler

Parameters

struct net_device *dev

device to register a handler for

rx_handler_func_t *rx_handler

receive handler to register

void *rx_handler_data

data pointer that is used by rx handler

Register a receive handler for a device. This handler will then be called from __netif_receive_skb. A negative errno code is returned on a failure.

The caller must hold the rtnl_mutex.

For a general description of rx_handler, see enum rx_handler_result.

void netdev_rx_handler_unregister(struct net_device *dev)

unregister receive handler

Parameters

struct net_device *dev

device to unregister a handler from

Unregister a receive handler from a device.

The caller must hold the rtnl_mutex.

int netif_receive_skb_core(struct sk_buff *skb)

special purpose version of netif_receive_skb

Parameters

struct sk_buff *skb

buffer to process

More direct receive version of netif_receive_skb(). It should only be used by callers that have a need to skip RPS and Generic XDP. Caller must also take care of handling if (page_is_)pfmemalloc.

This function may only be called from softirq context and interrupts should be enabled.

Return values (usually ignored): NET_RX_SUCCESS: no congestion NET_RX_DROP: packet was dropped

int netif_receive_skb(struct sk_buff *skb)

process receive buffer from network

Parameters

struct sk_buff *skb

buffer to process

netif_receive_skb() is the main receive data processing function. It always succeeds. The buffer may be dropped during processing for congestion control or by the protocol layers.

This function may only be called from softirq context and interrupts should be enabled.

Return values (usually ignored): NET_RX_SUCCESS: no congestion NET_RX_DROP: packet was dropped

void netif_receive_skb_list(struct list_head *head)

process many receive buffers from network

Parameters

struct list_head *head

list of skbs to process.

Since return value of netif_receive_skb() is normally ignored, and wouldn’t be meaningful for a list, this function returns void.

This function may only be called from softirq context and interrupts should be enabled.

void __napi_schedule(struct napi_struct *n)

schedule for receive

Parameters

struct napi_struct *n

entry to schedule

Description

The entry’s receive function will be scheduled to run. Consider using __napi_schedule_irqoff() if hard irqs are masked.

bool napi_schedule_prep(struct napi_struct *n)

check if napi can be scheduled

Parameters

struct napi_struct *n

napi context

Description

Test if NAPI routine is already running, and if not mark it as running. This is used as a condition variable to ensure only one NAPI poll instance runs. We also make sure there is no pending NAPI disable.

void __napi_schedule_irqoff(struct napi_struct *n)

schedule for receive

Parameters

struct napi_struct *n

entry to schedule

Description

Variant of __napi_schedule() assuming hard irqs are masked.

On PREEMPT_RT enabled kernels this maps to __napi_schedule() because the interrupt disabled assumption might not be true due to force-threaded interrupts and spinlock substitution.

void napi_enable(struct napi_struct *n)

enable NAPI scheduling

Parameters

struct napi_struct *n

NAPI context

Description

Resume NAPI from being scheduled on this context. Must be paired with napi_disable.

bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev)

Check if device is linked to an upper device

Parameters

struct net_device *dev

device

struct net_device *upper_dev

upper device to check

Description

Find out if a device is linked to specified upper device and return true in case it is. Note that this checks only immediate upper device, not through a complete stack of devices. The caller must hold the RTNL lock.

bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev)

Check if device is linked to an upper device

Parameters

struct net_device *dev

device

struct net_device *upper_dev

upper device to check

Description

Find out if a device is linked to specified upper device and return true in case it is. Note that this checks the entire upper device chain. The caller must hold rcu lock.

bool netdev_has_any_upper_dev(struct net_device *dev)

Check if device is linked to some device

Parameters

struct net_device *dev

device

Description

Find out if a device is linked to an upper device and return true in case it is. The caller must hold the RTNL lock.

struct net_device *netdev_master_upper_dev_get(struct net_device *dev)

Get master upper device

Parameters

struct net_device *dev

device

Description

Find a master upper device and return pointer to it or NULL in case it’s not there. The caller must hold the RTNL lock.

struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter)

Get the next dev from upper list

Parameters

struct net_device *dev

device

struct list_head **iter

list_head ** of the current position

Description

Gets the next device from the dev’s upper list, starting from iter position. The caller must hold RCU read lock.

void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter)

Get the next ->private from the lower neighbour list

Parameters

struct net_device *dev

device

struct list_head **iter

list_head ** of the current position

Description

Gets the next netdev_adjacent->private from the dev’s lower neighbour list, starting from iter position. The caller must either hold the RTNL lock or use its own locking that guarantees that the neighbour lower list will remain unchanged.

void *netdev_lower_get_next_private_rcu(struct net_device *dev, struct list_head **iter)

Get the next ->private from the lower neighbour list, RCU variant

Parameters

struct net_device *dev

device

struct list_head **iter

list_head ** of the current position

Description

Gets the next netdev_adjacent->private from the dev’s lower neighbour list, starting from iter position. The caller must hold RCU read lock.

void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)

Get the next device from the lower neighbour list

Parameters

struct net_device *dev

device

struct list_head **iter

list_head ** of the current position

Description

Gets the next netdev_adjacent from the dev’s lower neighbour list, starting from iter position. The caller must hold RTNL lock or its own locking that guarantees that the neighbour lower list will remain unchanged.

void *netdev_lower_get_first_private_rcu(struct net_device *dev)

Get the first ->private from the lower neighbour list, RCU variant

Parameters

struct net_device *dev

device

Description

Gets the first netdev_adjacent->private from the dev’s lower neighbour list. The caller must hold RCU read lock.

struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)

Get master upper device

Parameters

struct net_device *dev

device

Description

Find a master upper device and return pointer to it or NULL in case it’s not there. The caller must hold the RCU read lock.

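int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, struct netlink_ext_ack *extack)

Add a link to the upper device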

Parameters

struct net_device *dev

device

struct net_device *upper_dev

new upper device

struct netlink_ext_ack *extack

netlink extended ack

Description

Adds a link to device which is upper to this one. The caller must hold the RTNL lock. On a failure a negative errno code is returned. On success the reference counts are adjusted and the function returns zero.

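int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, void *upper_priv, void *upper_info, struct netlink_ext_ack *extack)

Add a master link to the upper device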

Parameters

struct net_device *dev

device

struct net_device *upper_dev

new upper device

void *upper_priv

upper device private

void *upper_info

upper info to be passed down via notifier

struct netlink_ext_ack *extack

netlink extended ack

Description

Adds a link to device which is upper to this one. In this case, only one master upper device can be linked, although other non-master devices might be linked as well. The caller must hold the RTNL lock. On a failure a negative errno code is returned. On success the reference counts are adjusted and the function returns zero.

void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev)

Removes a link to upper device

Parameters

struct net_device *dev

device

struct net_device *upper_dev

upper device to unlink

Description

Removes a link to device which is upper to this one. The caller must hold the RTNL lock.

void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info)

Dispatch event about slave change

Parameters

struct net_device *dev

device

struct netdev_bonding_info *bonding_info

info to dispatch

Description

Send NETDEV_BONDING_INFO to netdev notifiers with info. The caller must hold the RTNL lock.

struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, bool all_slaves)

Get the xmit slave of master device

Parameters

struct net_device *dev

device

struct sk_buff *skb

The packet

bool all_slaves

assume all the slaves are active

Description

The reference counters are not incremented so the caller must be careful with locks. The caller must hold RCU lock. NULL is returned if no slave is found.

struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, struct sock *sk)

Get the lowest device in chain given device and socket

Parameters

struct net_device *dev

device

struct sock *sk

the socket

Description

NULL is returned if no lower device is found.

void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info)

Dispatch event about lower device state change

Parameters

struct net_device *lower_dev

device

void *lower_state_info

state to dispatch

Description

Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. The caller must hold the RTNL lock.

int dev_set_promiscuity(struct net_device *dev, int inc)

update promiscuity count on a device

Parameters

struct net_device *dev

device

int inc

modifier

Add or remove promiscuity from a device. While the count in the device remains above zero the interface remains promiscuous. Once it hits zero the device reverts back to normal filtering operation. A negative inc value is used to drop promiscuity on the device. Return 0 if successful or a negative errno code on error.

int dev_set_allmulti(struct net_device *dev, int inc)

update allmulti count on a device

Parameters

struct net_device *dev

device

int inc

modifier

Add or remove reception of all multicast frames to a device. While the count in the device remains above zero the interface remains listening to all multicast frames. Once it hits zero the device reverts back to normal filtering operation. A negative inc value is used to drop the counter when releasing a resource needing all multicasts. Return 0 if successful or a negative errno code on error.

unsigned int dev_get_flags(const struct net_device *dev)

get flags reported to userspace

Parameters

const struct net_device *dev

device

Get the combination of flag bits exported through APIs to userspace.

int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack)

change device settings

Parameters

struct net_device *dev

device

unsigned int flags

device state flags

struct netlink_ext_ack *extack

netlink extended ack

Change settings on a device based on state flags. The flags are in the userspace exported format.

void dev_set_group(struct net_device *dev, int new_group)

Change group this device belongs to

Parameters

struct net_device *dev

device

int new_group

group this device should belong to

int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack)

Call NETDEV_PRE_CHANGEADDR.

Parameters

struct net_device *dev

device

const char *addr

new address

struct netlink_ext_ack *extack

netlink extended ack

int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack)

Change Media Access Control Address

Parameters

struct net_device *dev

device

struct sockaddr *sa

new address

struct netlink_ext_ack *extack

netlink extended ack

Change the hardware (MAC) address of the device

int dev_change_carrier(struct net_device *dev, bool new_carrier)

Change device carrier

Parameters

struct net_device *dev

device

bool new_carrier

new value

Change device carrier

int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid)

Get device physical port ID

Parameters

struct net_device *dev

device

struct netdev_phys_item_id *ppid

port ID

Get device physical port ID

int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len)

Get device physical port name

Parameters

struct net_device *dev

device

char *name

port name

size_t len

limit of bytes to copy to name

Get device physical port name

int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse)

Get the device’s port parent identifier

Parameters

struct net_device *dev

network device

struct netdev_phys_item_id *ppid

pointer to a storage for the port’s parent identifier

bool recurse

allow/disallow recursion to lower devices

Get the device’s port parent identifier

bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)

Indicate if two network devices have the same port parent identifier

Parameters

struct net_device *a

first network device

struct net_device *b

second network device

int dev_change_proto_down(struct net_device *dev, bool proto_down)

update protocol port state information

Parameters

struct net_device *dev

device

bool proto_down

new value

This info can be used by switch drivers to set the phys state of the port.

int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)

generic implementation for ndo_change_proto_down that sets carrier according to proto_down.

Parameters

struct net_device *dev

device

bool proto_down

new value

void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, u32 value)

proto down reason

Parameters

struct net_device *dev

device

unsigned long mask

proto down mask

u32 value

proto down value

void netdev_update_features(struct net_device *dev)

recalculate device features

Parameters

struct net_device *dev

the device to check

Recalculate dev->features set and send notifications if it has changed. Should be called after driver or hardware dependent conditions might have changed that influence the features.

void netdev_change_features(struct net_device *dev)

recalculate device features

Parameters

struct net_device *dev

the device to check

Recalculate dev->features set and send notifications even if they have not changed. Should be called instead of netdev_update_features() if also dev->vlan_features might have changed to allow the changes to be propagated to stacked VLAN devices.

void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev)

transfer operstate

Parameters

const struct net_device *rootdev

the root or lower level device to transfer state from

struct net_device *dev

the device to transfer operstate to

Transfer operational state from root to device. This is normally called when a stacking relationship exists between the root device and the device (a leaf device).

int register_netdevice(struct net_device *dev)

register a network device

Parameters

struct net_device *dev

device to register

Take a completed network device structure and add it to the kernel interfaces. A NETDEV_REGISTER message is sent to the netdev notifier chain. 0 is returned on success. A negative errno code is returned on a failure to set up the device, or if the name is a duplicate.

Callers must hold the rtnl semaphore. You may want register_netdev() instead of this.

BUGS: The locking appears insufficient to guarantee two parallel registers will not get the same name.

int init_dummy_netdev(struct net_device *dev)

init a dummy network device for NAPI

Parameters

struct net_device *dev

device to init

This takes a network device structure and initializes the minimum number of fields so it can be used to schedule NAPI polls without registering a full blown interface. This is to be used by drivers that need to tie several hardware interfaces to a single NAPI poll scheduler due to HW limitations.

int register_netdev(struct net_device *dev)

register a network device

Parameters

struct net_device *dev

device to register

Take a completed network device structure and add it to the kernel interfaces. A NETDEV_REGISTER message is sent to the netdev notifier chain. 0 is returned on success. A negative errno code is returned on a failure to set up the device, or if the name is a duplicate.

This is a wrapper around register_netdevice that takes the rtnl semaphore and expands the device name if you passed a format string to alloc_netdev.

struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage)

get network device statistics

Parameters

struct net_device *dev

device to get statistics from

struct rtnl_link_stats64 *storage

place to store stats

Get network statistics from device. Return storage. The device driver may provide its own method by setting dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; otherwise the internal statistics structure is used.

void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats)

get per-cpu network device statistics

Parameters

struct rtnl_link_stats64 *s

place to store stats

const struct pcpu_sw_netstats __percpu *netstats

per-cpu network stats to read from

Read per-cpu network statistics and populate the related fields in s.

void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)

ndo_get_stats64 implementation

Parameters

struct net_device *dev

device to get statistics from

struct rtnl_link_stats64 *s

place to store stats

Populate s from dev->stats and dev->tstats. Can be used as ndo_get_stats64() callback.

struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device*), unsigned int txqs, unsigned int rxqs)

allocate network device

Parameters

int sizeof_priv

size of private data to allocate space for

const char *name

device name format string

unsigned char name_assign_type

origin of device name

void (*setup)(struct net_device *)

callback to initialize device

unsigned int txqs

the number of TX subqueues to allocate

unsigned int rxqs

the number of RX subqueues to allocate

Description

Allocates a struct net_device with private data area for driver use and performs basic initialization. Also allocates subqueue structs for each queue on the device.

void free_netdev(struct net_device *dev)

free network device

Parameters

struct net_device *dev

device

Description

This function does the last stage of destroying an allocated device interface. The reference to the device object is released. If this is the last reference then it will be freed. Must be called in process context.

void synchronize_net(void)

Synchronize with packet receive processing

Parameters

void

no arguments

Description

Wait for packets currently being received to be done. Does not block later packets from starting.

void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)

remove device from the kernel

Parameters

struct net_device *dev

device

struct list_head *head

list

This function shuts down a device interface and removes it from the kernel tables. If head is not NULL, the device is queued to be unregistered later.

Callers must hold the rtnl semaphore. You may want unregister_netdev() instead of this.

void unregister_netdevice_many(struct list_head *head)

unregister many devices

Parameters

struct list_head *head

list of devices

Note

As most callers use a stack allocated list_head,

we force a list_del() to make sure the stack won't be corrupted later.

void unregister_netdev(struct net_device *dev)

remove device from the kernel

Parameters

struct net_device *dev

device

This function shuts down a device interface and removes it from the kernel tables.

This is just a wrapper for unregister_netdevice that takes the rtnl semaphore. In general you want to use this and not unregister_netdevice.

int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, int new_ifindex)

move device to different network namespace

Parameters

struct net_device *dev

device

struct net *net

network namespace

const char *pat

If not NULL name pattern to try if the current device name is already taken in the destination network namespace.

int new_ifindex

If not zero, specifies device index in the target namespace.

This function shuts down a device interface and moves it to a new network namespace. On success 0 is returned, on a failure a negative errno code is returned.

Callers must hold the rtnl semaphore.

netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask)

increment feature set by one

Parameters

netdev_features_t all

current feature set

netdev_features_t one

new feature set

netdev_features_t mask

mask feature set

Computes a new feature set after adding a device with feature set one to the master device with current feature set all. Will not enable anything that is off in mask. Returns the new feature set.

int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len)

create the Ethernet header

Parameters

struct sk_buff *skb

buffer to alter

struct net_device *dev

source device

unsigned short type

Ethernet type field

const void *daddr

destination address (NULL leave destination address)

const void *saddr

source address (NULL use device source address)

unsigned int len

packet length (<= skb->len)

Description

Set the protocol type. For a packet of type ETH_P_802_3/2 we put the length in here instead.

u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len)

determine the length of header for an ethernet frame

Parameters

const struct net_device *dev

pointer to network device

const void *data

pointer to start of frame

u32 len

total length of frame

Description

Make a best effort attempt to pull the length for all of the headers for a given frame in a linear buffer.

__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)

determine the packet’s protocol ID.

Parameters

struct sk_buff *skb

received socket data

struct net_device *dev

receiving network device

Description

The rule here is that we assume 802.3 if the type field is short enough to be a length. This is normal practice and works for any ‘now in use’ protocol.

int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr)

extract hardware address from packet

Parameters

const struct sk_buff *skb

packet to extract header from

unsigned char *haddr

destination buffer

int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type)

fill cache entry from neighbour

Parameters

const struct neighbour *neigh

source neighbour

struct hh_cache *hh

destination cache entry

__be16 type

Ethernet type field

Description

Create an Ethernet header template from the neighbour.

void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr)

update cache entry

Parameters

struct hh_cache *hh

destination cache entry

const struct net_device *dev

network device

const unsigned char *haddr

new hardware address

Description

Called by Address Resolution module to notify changes in address.

__be16 eth_header_parse_protocol(const struct sk_buff *skb)

extract protocol from L2 header

Parameters

const struct sk_buff *skb

packet to extract protocol from

int eth_prepare_mac_addr_change(struct net_device *dev, void *p)

prepare for mac change

Parameters

struct net_device *dev

network device

void *p

socket address

void eth_commit_mac_addr_change(struct net_device *dev, void *p)

commit mac change

Parameters

struct net_device *dev

network device

void *p

socket address

int eth_mac_addr(struct net_device *dev, void *p)

set new Ethernet hardware address

Parameters

struct net_device *dev

network device

void *p

socket address

Description

Change hardware address of device.

This doesn’t change hardware matching, so needs to be overridden for most real devices.

void ether_setup(struct net_device *dev)

setup Ethernet network device

Parameters

struct net_device *dev

network device

Description

Fill in the fields of the device structure with Ethernet-generic values.

struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, unsigned int rxqs)

Allocates and sets up an Ethernet device

Parameters

int sizeof_priv

Size of additional driver-private structure to be allocated for this Ethernet device

unsigned int txqs

The number of TX queues this device has.

unsigned int rxqs

The number of RX queues this device has.

Description

Fill in the fields of the device structure with Ethernet-generic values. Basically does everything except registering the device.

Constructs a new net device, complete with a private data area of size (sizeof_priv). A 32-byte (not bit) alignment is enforced for this private data area.

int platform_get_ethdev_address(struct device *dev, struct net_device *netdev)

Set netdev’s MAC address from a given device

Parameters

struct device *dev

Pointer to the device

struct net_device *netdev

Pointer to netdev to write the address to

Description

Wrapper around eth_platform_get_mac_address() which writes the address directly to netdev->dev_addr.

int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr)

Get the MAC from the firmware node

Parameters

struct fwnode_handle *fwnode

Pointer to the firmware node

char *addr

Address of buffer to store the MAC in

Description

Search the firmware node for the best MAC address to use. ‘mac-address’ is checked first, because that is supposed to contain the “most recent” MAC address. If that isn’t set, then ‘local-mac-address’ is checked next, because that is the default address. If that isn’t set, then the obsolete ‘address’ is checked, just in case we’re using an old device tree.

Note that the ‘address’ property is supposed to contain a virtual address of the register set, but some DTS files have redefined that property to be the MAC address.

All-zero MAC addresses are rejected, because those could be properties that exist in the firmware tables, but were not updated by the firmware. For example, the DTS could define ‘mac-address’ and ‘local-mac-address’, with zero MAC addresses. Some older U-Boots only initialized ‘local-mac-address’. In this case, the real MAC is in ‘local-mac-address’, and ‘mac-address’ exists but is all zeros.

int device_get_mac_address(struct device *dev, char *addr)

Get the MAC for a given device

Parameters

struct device *dev

Pointer to the device

char *addr

Address of buffer to store the MAC in

int device_get_ethdev_address(struct device *dev, struct net_device *netdev)

Set netdev’s MAC address from a given device

Parameters

struct device *dev

Pointer to the device

struct net_device *netdev

Pointer to netdev to write the address to

Description

Wrapper around device_get_mac_address() which writes the address directly to netdev->dev_addr.

void netif_carrier_on(struct net_device *dev)

set carrier

Parameters

struct net_device *dev

network device

Description

Device has detected acquisition of carrier.

void netif_carrier_off(struct net_device *dev)

clear carrier

Parameters

struct net_device *dev

network device

Description

Device has detected loss of carrier.

void netif_carrier_event(struct net_device *dev)

report carrier state event

Parameters

struct net_device *dev

network device

Description

Device has detected a carrier event but the carrier state wasn’t changed. Use in drivers when querying carrier state asynchronously, to avoid missing events (link flaps) if link recovers before it’s queried.

bool is_link_local_ether_addr(const u8 *addr)

Determine if given Ethernet address is link-local

Parameters

const u8 *addr

Pointer to a six-byte array containing the Ethernet address

Description

Return true if address is link local reserved addr (01:80:c2:00:00:0X) per IEEE 802.1Q 8.6.3 Frame filtering.

Please note: addr must be aligned to u16.

bool is_zero_ether_addr(const u8 *addr)

Determine if given Ethernet address is all zeros.

Parameters

const u8 *addr

Pointer to a six-byte array containing the Ethernet address

Description

Return true if the address is all zeroes.

Please note: addr must be aligned to u16.

bool is_multicast_ether_addr(const u8 *addr)

Determine if the Ethernet address is a multicast.

Parameters

const u8 *addr

Pointer to a six-byte array containing the Ethernet address

Description

Return true if the address is a multicast address. By definition the broadcast address is also a multicast address.

bool is_local_ether_addr(const u8 *addr)

Determine if the Ethernet address is locally-assigned one (IEEE 802).

Parameters

const u8 *addr

Pointer to a six-byte array containing the Ethernet address

Description

Return true if the address is a local address.

bool is_broadcast_ether_addr(const u8 *addr)

Determine if the Ethernet address is broadcast

Parameters

const u8 *addr

Pointer to a six-byte array containing the Ethernet address

Description

Return true if the address is the broadcast address.

Please note: addr must be aligned to u16.

bool is_unicast_ether_addr(const u8 *addr)

Determine if the Ethernet address is unicast

Parameters

const u8 *addr

Pointer to a six-byte array containing the Ethernet address

Description

Return true if the address is a unicast address.

bool is_valid_ether_addr(const u8 *addr)

Determine if the given Ethernet address is valid

Parameters

const u8 *addr

Pointer to a six-byte array containing the Ethernet address

Description

Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not a multicast address, and is not FF:FF:FF:FF:FF:FF.

Return true if the address is valid.

Please note: addr must be aligned to u16.

bool eth_proto_is_802_3(__be16 proto)

Determine if a given Ethertype/length is a protocol

Parameters

__be16 proto

Ethertype/length value to be tested

Description

Check that the value from the Ethertype/length field is a valid Ethertype.

Return true if the value is an 802.3 supported Ethertype.

void eth_random_addr(u8 *addr)

Generate software assigned random Ethernet address

Parameters

u8 *addr

Pointer to a six-byte array containing the Ethernet address

Description

Generate a random Ethernet address (MAC) that is not multicast and has the local assigned bit set.

void eth_broadcast_addr(u8 *addr)

Assign broadcast address

Parameters

u8 *addr

Pointer to a six-byte array containing the Ethernet address

Description

Assign the broadcast address to the given address array.

void eth_zero_addr(u8 *addr)

Assign zero address

Parameters

u8 *addr

Pointer to a six-byte array containing the Ethernet address

Description

Assign the zero address to the given address array.

void eth_hw_addr_random(struct net_device *dev)

Generate software assigned random Ethernet and set device flag

Parameters

struct net_device *dev

pointer to net_device structure

Description

Generate a random Ethernet address (MAC) to be used by a net device and set addr_assign_type so the state can be read by sysfs and be used by userspace.

u32 eth_hw_addr_crc(struct netdev_hw_addr *ha)

Calculate CRC from netdev_hw_addr

Parameters

struct netdev_hw_addr *ha

pointer to hardware address

Description

Calculate CRC from a hardware address as basis for filter hashes.

void ether_addr_copy(u8 *dst, const u8 *src)

Copy an Ethernet address

Parameters

u8 *dst

Pointer to a six-byte array Ethernet address destination

const u8 *src

Pointer to a six-byte array Ethernet address source

Description

Please note: dst & src must both be aligned to u16.

void eth_hw_addr_set(struct net_device *dev, const u8 *addr)

Assign Ethernet address to a net_device

Parameters

struct net_device *dev

pointer to net_device structure

const u8 *addr

address to assign

Description

Assign given address to the net_device, addr_assign_type is not changed.

void eth_hw_addr_inherit(struct net_device *dst, struct net_device *src)

Copy dev_addr from another net_device

Parameters

struct net_device *dst

pointer to net_device to copy dev_addr to

struct net_device *src

pointer to net_device to copy dev_addr from

Description

Copy the Ethernet address from one net_device to another along with the address attributes (addr_assign_type).

bool ether_addr_equal(const u8 *addr1, const u8 *addr2)

Compare two Ethernet addresses

Parameters

const u8 *addr1

Pointer to a six-byte array containing the Ethernet address

const u8 *addr2

Pointer to the other six-byte array containing the Ethernet address

Description

Compare two Ethernet addresses, returns true if equal

Please note: addr1 & addr2 must both be aligned to u16.

bool ether_addr_equal_64bits(const u8 addr1[6 + 2], const u8 addr2[6 + 2])

Compare two Ethernet addresses

Parameters

const u8 addr1[6+2]

Pointer to an array of 8 bytes

const u8 addr2[6+2]

Pointer to another array of 8 bytes

Description

Compare two Ethernet addresses, returns true if equal, false otherwise.

The function doesn’t need any conditional branches and possibly uses word memory accesses on CPU allowing cheap unaligned memory reads. arrays = { byte1, byte2, byte3, byte4, byte5, byte6, pad1, pad2 }

Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits.

bool ether_addr_equal_unaligned(const u8 *addr1, const u8 *addr2)

Compare two not u16 aligned Ethernet addresses

Parameters

const u8 *addr1

Pointer to a six-byte array containing the Ethernet address

const u8 *addr2

Pointer to the other six-byte array containing the Ethernet address

Description

Compare two Ethernet addresses, returns true if equal

Please note: Use only when any Ethernet address may not be u16 aligned.

bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2, const u8 *mask)

Compare two Ethernet addresses with a mask

Parameters

const u8 *addr1

Pointer to a six-byte array containing the 1st Ethernet address

const u8 *addr2

Pointer to a six-byte array containing the 2nd Ethernet address

const u8 *mask

Pointer to a six-byte array containing the Ethernet address bitmask

Description

Compare two Ethernet addresses with a mask, returns true if for every bit set in the bitmask the equivalent bits in the ethernet addresses are equal. Using a mask with all bits set is a slower ether_addr_equal.

u64 ether_addr_to_u64(const u8 *addr)

Convert an Ethernet address into a u64 value.

Parameters

const u8 *addr

Pointer to a six-byte array containing the Ethernet address

Description

Return a u64 value of the address

void u64_to_ether_addr(u64 u, u8 *addr)

Convert a u64 to an Ethernet address.

Parameters

u64 u

u64 to convert to an Ethernet MAC address

u8 *addr

Pointer to a six-byte array to contain the Ethernet address

void eth_addr_dec(u8 *addr)

Decrement the given MAC address

Parameters

u8 *addr

Pointer to a six-byte array containing Ethernet address to decrement

void eth_addr_inc(u8 *addr)

Increment the given MAC address.

Parameters

u8 *addr

Pointer to a six-byte array containing Ethernet address to increment.

bool is_etherdev_addr(const struct net_device *dev, const u8 addr[6 + 2])

Tell if given Ethernet address belongs to the device.

Parameters

const struct net_device *dev

Pointer to a device structure

const u8 addr[6 + 2]

Pointer to a six-byte array containing the Ethernet address

Description

Compare passed address with all addresses of the device. Return true if the address is one of the device addresses.

Note that this function calls ether_addr_equal_64bits() so take care of the right padding.

unsigned long compare_ether_header(const void *a, const void *b)

Compare two Ethernet headers

Parameters

const void *a

Pointer to Ethernet header

const void *b

Pointer to Ethernet header

Description

Compare two Ethernet headers, returns 0 if equal. This assumes that the network header (i.e., IP header) is 4-byte aligned OR the platform can handle unaligned access. This is the case for all packets coming into netif_receive_skb or similar entry points.

void eth_hw_addr_gen(struct net_device *dev, const u8 *base_addr, unsigned int id)

Generate and assign Ethernet address to a port

Parameters

struct net_device *dev

pointer to port’s net_device structure

const u8 *base_addr

base Ethernet address

unsigned int id

offset to add to the base address

Description

Generate a MAC address using a base address and an offset and assign it to a net_device. Commonly used by switch drivers which need to compute addresses for all their ports. addr_assign_type is not changed.

int eth_skb_pad(struct sk_buff *skb)

Pad buffer to minimum number of octets for Ethernet frame

Parameters

struct sk_buff *skb

Buffer to pad

Description

An Ethernet frame should have a minimum size of 60 bytes. This function takes short frames and pads them with zeros up to the 60 byte limit.

void napi_schedule(struct napi_struct *n)

schedule NAPI poll

Parameters

struct napi_struct *n

NAPI context

Description

Schedule NAPI poll routine to be called if it is not already running.

void napi_schedule_irqoff(struct napi_struct *n)

schedule NAPI poll

Parameters

struct napi_struct *n

NAPI context

Description

Variant of napi_schedule(), assuming hard irqs are masked.

bool napi_complete(struct napi_struct *n)

NAPI processing complete

Parameters

struct napi_struct *n

NAPI context

Description

Mark NAPI processing as complete. Consider using napi_complete_done() instead. Return false if device should avoid rearming interrupts.

void napi_disable(struct napi_struct *n)

prevent NAPI from scheduling

Parameters

struct napi_struct *n

NAPI context

Description

Stop NAPI from being scheduled on this context. Waits till any outstanding processing completes.

void napi_synchronize(const struct napi_struct *n)

wait until NAPI is not running

Parameters

const struct napi_struct *n

NAPI context

Description

Wait until NAPI is done being scheduled on this context. Waits till any outstanding processing completes but does not disable future activations.

bool napi_if_scheduled_mark_missed(struct napi_struct *n)

if napi is running, set the NAPIF_STATE_MISSED

Parameters

struct napi_struct *n

NAPI context

Description

If napi is running, set the NAPIF_STATE_MISSED, and return true if NAPI is scheduled.

enum netdev_priv_flags

struct net_device priv_flags

Constants

IFF_802_1Q_VLAN

802.1Q VLAN device

IFF_EBRIDGE

Ethernet bridging device

IFF_BONDING

bonding master or slave

IFF_ISATAP

ISATAP interface (RFC4214)

IFF_WAN_HDLC

WAN HDLC device

IFF_XMIT_DST_RELEASE

dev_hard_start_xmit() is allowed to release skb->dst

IFF_DONT_BRIDGE

disallow bridging this ether dev

IFF_DISABLE_NETPOLL

disable netpoll at run-time

IFF_MACVLAN_PORT

device used as macvlan port

IFF_BRIDGE_PORT

device used as bridge port

IFF_OVS_DATAPATH

device used as Open vSwitch datapath port

IFF_TX_SKB_SHARING

The interface supports sharing skbs on transmit

IFF_UNICAST_FLT

Supports unicast filtering

IFF_TEAM_PORT

device used as team port

IFF_SUPP_NOFCS

device supports sending custom FCS

IFF_LIVE_ADDR_CHANGE

device supports hardware address change when it’s running

IFF_MACVLAN

Macvlan device

IFF_XMIT_DST_RELEASE_PERM

IFF_XMIT_DST_RELEASE not taking into account underlying stacked devices

IFF_L3MDEV_MASTER

device is an L3 master device

IFF_NO_QUEUE

device can run without qdisc attached

IFF_OPENVSWITCH

device is an Open vSwitch master

IFF_L3MDEV_SLAVE

device is enslaved to an L3 master device

IFF_TEAM

device is a team device

IFF_RXFH_CONFIGURED

device has had Rx Flow indirection table configured

IFF_PHONY_HEADROOM

the headroom value is controlled by an external entity (i.e. the master device for bridged veth)

IFF_MACSEC

device is a MACsec device

IFF_NO_RX_HANDLER

device doesn’t support the rx_handler hook

IFF_FAILOVER

device is a failover master device

IFF_FAILOVER_SLAVE

device is lower dev of a failover master device

IFF_L3MDEV_RX_HANDLER

only invoke the rx handler of L3 master device

IFF_LIVE_RENAME_OK

rename is allowed while device is up and running

IFF_TX_SKB_NO_LINEAR

device/driver is capable of xmitting frames with skb_headlen(skb) == 0 (data starts from frag0)

Description

These are the struct net_device priv_flags; they are only set internally by drivers and used in the kernel. These flags are invisible to userspace; this means that the order of these flags can change during any kernel release.

You should have a pretty good reason to be extending these flags.

struct net_device

The DEVICE structure.

Definition

struct net_device {
  char name[IFNAMSIZ];
  struct netdev_name_node *name_node;
  struct dev_ifalias      __rcu *ifalias;
  unsigned long           mem_end;
  unsigned long           mem_start;
  unsigned long           base_addr;
  unsigned long           state;
  struct list_head        dev_list;
  struct list_head        napi_list;
  struct list_head        unreg_list;
  struct list_head        close_list;
  struct list_head        ptype_all;
  struct list_head        ptype_specific;
  struct {
    struct list_head upper;
    struct list_head lower;
  } adj_list;
  unsigned int            flags;
  unsigned int            priv_flags;
  const struct net_device_ops *netdev_ops;
  int ifindex;
  unsigned short          gflags;
  unsigned short          hard_header_len;
  unsigned int            mtu;
  unsigned short          needed_headroom;
  unsigned short          needed_tailroom;
  netdev_features_t features;
  netdev_features_t hw_features;
  netdev_features_t wanted_features;
  netdev_features_t vlan_features;
  netdev_features_t hw_enc_features;
  netdev_features_t mpls_features;
  netdev_features_t gso_partial_features;
  unsigned int            min_mtu;
  unsigned int            max_mtu;
  unsigned short          type;
  unsigned char           min_header_len;
  unsigned char           name_assign_type;
  int group;
  struct net_device_stats stats;
  atomic_long_t rx_dropped;
  atomic_long_t tx_dropped;
  atomic_long_t rx_nohandler;
  atomic_t carrier_up_count;
  atomic_t carrier_down_count;
#ifdef CONFIG_WIRELESS_EXT;
  const struct iw_handler_def *wireless_handlers;
  struct iw_public_data   *wireless_data;
#endif;
  const struct ethtool_ops *ethtool_ops;
#ifdef CONFIG_NET_L3_MASTER_DEV;
  const struct l3mdev_ops *l3mdev_ops;
#endif;
#if IS_ENABLED(CONFIG_IPV6);
  const struct ndisc_ops *ndisc_ops;
#endif;
#ifdef CONFIG_XFRM_OFFLOAD;
  const struct xfrmdev_ops *xfrmdev_ops;
#endif;
#if IS_ENABLED(CONFIG_TLS_DEVICE);
  const struct tlsdev_ops *tlsdev_ops;
#endif;
  const struct header_ops *header_ops;
  unsigned char           operstate;
  unsigned char           link_mode;
  unsigned char           if_port;
  unsigned char           dma;
  unsigned char           perm_addr[MAX_ADDR_LEN];
  unsigned char           addr_assign_type;
  unsigned char           addr_len;
  unsigned char           upper_level;
  unsigned char           lower_level;
  unsigned short          neigh_priv_len;
  unsigned short          dev_id;
  unsigned short          dev_port;
  unsigned short          padded;
  spinlock_t addr_list_lock;
  int irq;
  struct netdev_hw_addr_list      uc;
  struct netdev_hw_addr_list      mc;
  struct netdev_hw_addr_list      dev_addrs;
#ifdef CONFIG_SYSFS;
  struct kset             *queues_kset;
#endif;
#ifdef CONFIG_LOCKDEP;
  struct list_head        unlink_list;
#endif;
  unsigned int            promiscuity;
  unsigned int            allmulti;
  bool uc_promisc;
#ifdef CONFIG_LOCKDEP;
  unsigned char           nested_level;
#endif;
#if IS_ENABLED(CONFIG_VLAN_8021Q);
  struct vlan_info __rcu  *vlan_info;
#endif;
#if IS_ENABLED(CONFIG_NET_DSA);
  struct dsa_port         *dsa_ptr;
#endif;
#if IS_ENABLED(CONFIG_TIPC);
  struct tipc_bearer __rcu *tipc_ptr;
#endif;
#if IS_ENABLED(CONFIG_IRDA) || IS_ENABLED(CONFIG_ATALK);
  void *atalk_ptr;
#endif;
  struct in_device __rcu  *ip_ptr;
#if IS_ENABLED(CONFIG_DECNET);
  struct dn_dev __rcu     *dn_ptr;
#endif;
  struct inet6_dev __rcu  *ip6_ptr;
#if IS_ENABLED(CONFIG_AX25);
  void *ax25_ptr;
#endif;
  struct wireless_dev     *ieee80211_ptr;
  struct wpan_dev         *ieee802154_ptr;
#if IS_ENABLED(CONFIG_MPLS_ROUTING);
  struct mpls_dev __rcu   *mpls_ptr;
#endif;
#if IS_ENABLED(CONFIG_MCTP);
  struct mctp_dev __rcu   *mctp_ptr;
#endif;
  unsigned char           *dev_addr;
  struct netdev_rx_queue  *_rx;
  unsigned int            num_rx_queues;
  unsigned int            real_num_rx_queues;
  struct bpf_prog __rcu   *xdp_prog;
  unsigned long           gro_flush_timeout;
  int napi_defer_hard_irqs;
  rx_handler_func_t __rcu *rx_handler;
  void __rcu              *rx_handler_data;
#ifdef CONFIG_NET_CLS_ACT;
  struct mini_Qdisc __rcu *miniq_ingress;
#endif;
  struct netdev_queue __rcu *ingress_queue;
#ifdef CONFIG_NETFILTER_INGRESS;
  struct nf_hook_entries __rcu *nf_hooks_ingress;
#endif;
  unsigned char           broadcast[MAX_ADDR_LEN];
#ifdef CONFIG_RFS_ACCEL;
  struct cpu_rmap         *rx_cpu_rmap;
#endif;
  struct hlist_node       index_hlist;
  struct netdev_queue     *_tx ;
  unsigned int            num_tx_queues;
  unsigned int            real_num_tx_queues;
  struct Qdisc            *qdisc;
  unsigned int            tx_queue_len;
  spinlock_t tx_global_lock;
  struct xdp_dev_bulk_queue __percpu *xdp_bulkq;
#ifdef CONFIG_XPS;
  struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
#endif;
#ifdef CONFIG_NET_CLS_ACT;
  struct mini_Qdisc __rcu *miniq_egress;
#endif;
#ifdef CONFIG_NETFILTER_EGRESS;
  struct nf_hook_entries __rcu *nf_hooks_egress;
#endif;
#ifdef CONFIG_NET_SCHED;
  unsigned long qdisc_hash[1 << ((4) - 1)];
#endif;
  struct timer_list       watchdog_timer;
  int watchdog_timeo;
  u32 proto_down_reason;
  struct list_head        todo_list;
#ifdef CONFIG_PCPU_DEV_REFCNT;
  int __percpu            *pcpu_refcnt;
#else;
  refcount_t dev_refcnt;
#endif;
  struct list_head        link_watch_list;
  enum {
    NETREG_UNINITIALIZED=0,
    NETREG_REGISTERED,
    NETREG_UNREGISTERING,
    NETREG_UNREGISTERED,
    NETREG_RELEASED,
    NETREG_DUMMY,
  } reg_state:8;
  bool dismantle;
  enum {
    RTNL_LINK_INITIALIZED,
    RTNL_LINK_INITIALIZING,
  } rtnl_link_state:16;
  bool needs_free_netdev;
  void (*priv_destructor)(struct net_device *dev);
#ifdef CONFIG_NETPOLL;
  struct netpoll_info __rcu       *npinfo;
#endif;
  possible_net_t nd_net;
  void *ml_priv;
  enum netdev_ml_priv_type        ml_priv_type;
  union {
    struct pcpu_lstats __percpu             *lstats;
    struct pcpu_sw_netstats __percpu        *tstats;
    struct pcpu_dstats __percpu             *dstats;
  };
#if IS_ENABLED(CONFIG_GARP);
  struct garp_port __rcu  *garp_port;
#endif;
#if IS_ENABLED(CONFIG_MRP);
  struct mrp_port __rcu   *mrp_port;
#endif;
  struct device           dev;
  const struct attribute_group *sysfs_groups[4];
  const struct attribute_group *sysfs_rx_queue_group;
  const struct rtnl_link_ops *rtnl_link_ops;
#define GSO_MAX_SIZE            65536;
  unsigned int            gso_max_size;
#define GSO_MAX_SEGS            65535;
  u16 gso_max_segs;
#ifdef CONFIG_DCB;
  const struct dcbnl_rtnl_ops *dcbnl_ops;
#endif;
  s16 num_tc;
  struct netdev_tc_txq    tc_to_txq[TC_MAX_QUEUE];
  u8 prio_tc_map[TC_BITMASK + 1];
#if IS_ENABLED(CONFIG_FCOE);
  unsigned int            fcoe_ddp_xid;
#endif;
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO);
  struct netprio_map __rcu *priomap;
#endif;
  struct phy_device       *phydev;
  struct sfp_bus          *sfp_bus;
  struct lock_class_key   *qdisc_tx_busylock;
  bool proto_down;
  unsigned wol_enabled:1;
  unsigned threaded:1;
  struct list_head        net_notifier_list;
#if IS_ENABLED(CONFIG_MACSEC);
  const struct macsec_ops *macsec_ops;
#endif;
  const struct udp_tunnel_nic_info        *udp_tunnel_nic_info;
  struct udp_tunnel_nic   *udp_tunnel_nic;
  struct bpf_xdp_entity   xdp_state[__MAX_XDP_MODE];
};

Members

name

This is the first field of the “visible” part of this structure (i.e. as seen by users in the “Space.c” file). It is the name of the interface.

name_node

Name hashlist node

ifalias

SNMP alias

mem_end

Shared memory end

mem_start

Shared memory start

base_addr

Device I/O address

state

Generic network queuing layer state, see netdev_state_t

dev_list

The global list of network devices

napi_list

List entry used for polling NAPI devices

unreg_list

List entry when we are unregistering the device; see the function unregister_netdev

close_list

List entry used when we are closing the device

ptype_all

Device-specific packet handlers for all protocols

ptype_specific

Device-specific, protocol-specific packet handlers

adj_list

Directly linked devices, like slaves for bonding

flags

Interface flags (a la BSD)

priv_flags

Like ‘flags’ but invisible to userspace, see if.h for the definitions

netdev_ops

Includes several pointers to callbacks, if one wants to override the ndo_*() functions

ifindex

interface index

gflags

Global flags ( kept as legacy )

hard_header_len

Maximum hardware header length.

mtu

Interface MTU value

needed_headroom

Extra headroom the hardware may need, but not in all cases can this be guaranteed

needed_tailroom

Extra tailroom the hardware may need, but not in all cases can this be guaranteed. Some cases also use LL_MAX_HEADER instead to allocate the skb

interface address info:

features

Currently active device features

hw_features

User-changeable features

wanted_features

User-requested features

vlan_features

Mask of features inheritable by VLAN devices

hw_enc_features

Mask of features inherited by encapsulating devices. This field indicates what encapsulation offloads the hardware is capable of doing, and drivers will need to set them appropriately.

mpls_features

Mask of features inheritable by MPLS

gso_partial_features

value(s) from NETIF_F_GSO*

min_mtu

Interface Minimum MTU value

max_mtu

Interface Maximum MTU value

type

Interface hardware type

min_header_len

Minimum hardware header length

name_assign_type

network interface name assignment type

group

The group the device belongs to

stats

Statistics struct, which was left as a legacy, use rtnl_link_stats64 instead

rx_dropped

Dropped packets by core network, do not use this in drivers

tx_dropped

Dropped packets by core network, do not use this in drivers

rx_nohandler

nohandler dropped packets by core network on inactive devices, do not use this in drivers

carrier_up_count

Number of times the carrier has been up

carrier_down_count

Number of times the carrier has been down

wireless_handlers

List of functions to handle Wireless Extensions, instead of ioctl, see <net/iw_handler.h> for details.

wireless_data

Instance data managed by the core of wireless extensions

ethtool_ops

Management operations

l3mdev_ops

Layer 3 master device operations

ndisc_ops

Includes callbacks for different IPv6 neighbour discovery handling. Necessary for e.g. 6LoWPAN.

xfrmdev_ops

Transformation offload operations

tlsdev_ops

Transport Layer Security offload operations

header_ops

Includes callbacks for creating, parsing, caching, etc. of Layer 2 headers.

operstate

RFC2863 operstate

link_mode

Mapping policy to operstate

if_port

Selectable AUI, TP, …

dma

DMA channel

perm_addr

Permanent hw address

addr_assign_type

Hw address assignment type

addr_len

Hardware address length

upper_level

Maximum depth level of upper devices.

lower_level

Maximum depth level of lower devices.

neigh_priv_len

Used in neigh_alloc()

dev_id

Used to differentiate devices that share the same link layer address

dev_port

Used to differentiate devices that share the same function

padded

How much padding added by alloc_netdev()

addr_list_lock

XXX: need comments on this one

irq

Device IRQ number

uc

unicast mac addresses

mc

multicast mac addresses

dev_addrs

list of device hw addresses

queues_kset

Group of all Kobjects in the Tx and RX queues

unlink_list

As netif_addr_lock() can be called recursively, keep a list of interfaces to be deleted.

FIXME: cleanup struct net_device such that network protocol info moves out.

promiscuity

Number of times the NIC is told to work in promiscuous mode; if it becomes 0 the NIC will exit promiscuous mode

allmulti

Counter, enables or disables allmulticast mode

uc_promisc

Counter that indicates promiscuous mode has been enabled due to the need to listen to additional unicast addresses in a device that does not implement ndo_set_rx_mode()

nested_level

Used as a parameter of spin_lock_nested() of dev->addr_list_lock.

vlan_info

VLAN info

dsa_ptr

dsa specific data

tipc_ptr

TIPC specific data

atalk_ptr

AppleTalk link

ip_ptr

IPv4 specific data

dn_ptr

DECnet specific data

ip6_ptr

IPv6 specific data

ax25_ptr

AX.25 specific data

ieee80211_ptr

IEEE 802.11 specific data, assign before registering

ieee802154_ptr

IEEE 802.15.4 low-rate Wireless Personal Area Network device struct

mpls_ptr

mpls_dev struct pointer

mctp_ptr

MCTP specific data

dev_addr

Hw address (before bcast, because most packets are unicast)

_rx

Array of RX queues

num_rx_queues

Number of RX queues allocated at register_netdev() time

real_num_rx_queues

Number of RX queues currently active in device

xdp_prog

XDP sockets filter program pointer

gro_flush_timeout

timeout for GRO layer in NAPI

napi_defer_hard_irqs

If not zero, provides a counter that allows NIC hard IRQs to be avoided on busy queues.

rx_handler

handler for received packets

rx_handler_data

XXX: need comments on this one

miniq_ingress

ingress/clsact qdisc specific data for ingress processing

ingress_queue

XXX: need comments on this one

nf_hooks_ingress

netfilter hooks executed for ingress packets

broadcast

hw bcast address

rx_cpu_rmap

CPU reverse-mapping for RX completion interrupts, indexed by RX queue number. Assigned by driver. This must only be set if the ndo_rx_flow_steer operation is defined

index_hlist

Device index hash chain

_tx

Array of TX queues

num_tx_queues

Number of TX queues allocated at alloc_netdev_mq() time

real_num_tx_queues

Number of TX queues currently active in device

qdisc

Root qdisc from userspace point of view

tx_queue_len

Max frames per queue allowed

tx_global_lock

XXX: need comments on this one

xdp_bulkq

XDP device bulk queue

xps_maps

XXX: need comments on this one

miniq_egress

clsact qdisc specific data for egress processing

nf_hooks_egress

netfilter hooks executed for egress packets

qdisc_hash

qdisc hash table

watchdog_timer

List of timers

watchdog_timeo

Represents the timeout that is used by the watchdog (see dev_watchdog())

proto_down_reason

reason a netdev interface is held down

todo_list

Delayed register/unregister

pcpu_refcnt

Number of references to this device

dev_refcnt

Number of references to this device

link_watch_list

XXX: need comments on this one

reg_state

Register/unregister state machine

dismantle

Device is going to be freed

rtnl_link_state

This enum represents the phases of creating a new link

needs_free_netdev

Should unregister perform free_netdev?

priv_destructor

Called from unregister

npinfo

XXX: need comments on this one

nd_net

Network namespace this network device is inside

ml_priv

Mid-layer private

ml_priv_type

Mid-layer private type

{unnamed_union}

anonymous

lstats

Loopback statistics

tstats

Tunnel statistics

dstats

Dummy statistics

garp_port

GARP

mrp_port

MRP

dev

Class/net/name entry

sysfs_groups

Space for optional device, statistics and wireless sysfs groups

sysfs_rx_queue_group

Space for optional per-rx queue attributes

rtnl_link_ops

Rtnl_link_ops

gso_max_size

Maximum size of generic segmentation offload

gso_max_segs

Maximum number of segments that can be passed to the NIC for GSO

dcbnl_ops

Data Center Bridging netlink ops

num_tc

Number of traffic classes in the net device

tc_to_txq

XXX: need comments on this one

prio_tc_map

XXX: need comments on this one

fcoe_ddp_xid

Max exchange id for FCoE LRO by ddp

priomap

XXX: need comments on this one

phydev

Physical device may attach itself for hardware timestamping

sfp_bus

attached struct sfp_bus structure.

qdisc_tx_busylock

lockdep class annotating Qdisc->busylock spinlock

proto_down

protocol port state information can be sent to the switch driver and used to set the phys state of the switch port.

wol_enabled

Wake-on-LAN is enabled

threaded

napi threaded mode is enabled

net_notifier_list

List of per-net netdev notifier block that follow this device when it is moved to another network namespace.

macsec_ops

MACsec offloading ops

udp_tunnel_nic_info

static structure describing the UDP tunnel offload capabilities of the device

udp_tunnel_nic

UDP tunnel offload state

xdp_state

stores info on attached XDP BPF programs

Description

Actually, this whole structure is a big mistake. It mixes I/O data with strictly “high-level” data, and it has to know about almost every data structure used in the INET module.

void *netdev_priv(const struct net_device *dev)

access network device private data

Parameters

const struct net_device *dev

network device

Description

Get network device private data

void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct*, int), int weight)

initialize a NAPI context

Parameters

struct net_device *dev

network device

struct napi_struct *napi

NAPI context

int (*poll)(struct napi_struct *, int)

polling function

int weight

default weight

Description

netif_napi_add() must be used to initialize a NAPI context prior to calling any of the other NAPI-related functions.

void netif_tx_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct*, int), int weight)

initialize a NAPI context

Parameters

struct net_device *dev

network device

struct napi_struct *napi

NAPI context

int (*poll)(struct napi_struct *, int)

polling function

int weight

default weight

Description

This variant of netif_napi_add() should be used from drivers using NAPI to exclusively poll a TX queue. This avoids adding it to napi_hash[], which would pollute this hash table.

void __netif_napi_del(struct napi_struct *napi)

remove a NAPI context

Parameters

struct napi_struct *napi

NAPI context

Description

Warning: caller must observe RCU grace period before freeing memory containing napi. Drivers might want to call this helper to combine all the needed RCU grace periods into a single one.

void netif_napi_del(struct napi_struct *napi)

remove a NAPI context

Parameters

struct napi_struct *napi

NAPI context

netif_napi_del() removes a NAPI context from the network device NAPI list

void netif_start_queue(struct net_device *dev)

allow transmit

Parameters

struct net_device *dev

network device

Allow upper layers to call the device hard_start_xmit routine.

void netif_wake_queue(struct net_device *dev)

restart transmit

Parameters

struct net_device *dev

network device

Allow upper layers to call the device hard_start_xmit routine. Used for flow control when transmit resources are available.

void netif_stop_queue(struct net_device *dev)

stop transmitted packets

Parameters

struct net_device *dev

network device

Stop upper layers calling the device hard_start_xmit routine. Used for flow control when transmit resources are unavailable.

bool netif_queue_stopped(const struct net_device *dev)

test if transmit queue is flowblocked

Parameters

const struct net_device *dev

network device

Test if transmit queue on device is currently unable to send.

void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue, unsigned int min_limit)

set dql minimum limit

Parameters

struct netdev_queue *dev_queue

pointer to transmit queue

unsigned int min_limit

dql minimum limit

Description

Forces xmit_more() to return true until the minimum threshold defined by min_limit is reached (or until the tx queue is empty). Warning: to be used with care, misuse will impact the latency.

void netdev_txq_bql_enqueue_prefetchw(struct netdev_queue *dev_queue)

prefetch bql data for write

Parameters

struct netdev_queue *dev_queue

pointer to transmit queue

Description

BQL enabled drivers might use this helper in their ndo_start_xmit(), to give appropriate hint to the CPU.

void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_queue)

prefetch bql data for write

Parameters

struct netdev_queue *dev_queue

pointer to transmit queue

Description

BQL enabled drivers might use this helper in their TX completion path, to give appropriate hint to the CPU.

void netdev_sent_queue(struct net_device *dev, unsigned int bytes)

report the number of bytes queued to hardware

Parameters

struct net_device *dev

network device

unsigned int bytes

number of bytes queued to the hardware device queue

Report the number of bytes queued for sending/completion to the network device hardware queue. bytes should be a good approximation and should exactly match netdev_completed_queue() bytes

void netdev_completed_queue(struct net_device *dev, unsigned int pkts, unsigned int bytes)

report bytes and packets completed by device

Parameters

struct net_device *dev

network device

unsigned int pkts

actual number of packets sent over the medium

unsigned int bytes

actual number of bytes sent over the medium

Report the number of bytes and packets transmitted by the network device hardware queue over the physical medium, bytes must exactly match the bytes amount passed to netdev_sent_queue()

void netdev_reset_queue(struct net_device *dev_queue)

reset the packets and bytes count of a network device

Parameters

struct net_device *dev_queue

network device

Reset the bytes and packet count of a network device and clear the software flow control OFF bit for this network device

u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index)

check if selected tx queue exceeds device queues

Parameters

struct net_device *dev

network device

u16 queue_index

given tx queue index

Returns 0 if given tx queue index >= number of device tx queues, otherwise returns the originally passed tx queue index.

bool netif_running(const struct net_device *dev)

test if up

Parameters

const struct net_device *dev

network device

Test if the device has been brought up.

void netif_start_subqueue(struct net_device *dev, u16 queue_index)

allow sending packets on subqueue

Parameters

struct net_device *dev

network device

u16 queue_index

sub queue index

Description

Start individual transmit queue of a device with multiple transmit queues.

void netif_stop_subqueue(struct net_device *dev, u16 queue_index)

stop sending packets on subqueue

Parameters

struct net_device *dev

network device

u16 queue_index

sub queue index

Description

Stop individual transmit queue of a device with multiple transmit queues.

bool __netif_subqueue_stopped(const struct net_device *dev, u16 queue_index)

test status of subqueue

Parameters

const struct net_device *dev

network device

u16 queue_index

sub queue index

Description

Check individual transmit queue of a device with multiple transmit queues.

bool netif_subqueue_stopped(const struct net_device *dev, struct sk_buff *skb)

test status of subqueue

Parameters

const struct net_device *dev

network device

struct sk_buff *skb

sub queue buffer pointer

Description

Check individual transmit queue of a device with multiple transmit queues.

void netif_wake_subqueue(struct net_device *dev, u16 queue_index)

allow sending packets on subqueue

Parameters

struct net_device *dev

network device

u16 queue_index

sub queue index

Description

Resume individual transmit queue of a device with multiple transmit queues.

bool netif_attr_test_mask(unsigned long j, const unsigned long *mask, unsigned int nr_bits)

Test a CPU or Rx queue set in a mask

Parameters

unsigned long j

CPU/Rx queue index

const unsigned long *mask

bitmask of all cpus/rx queues

unsigned int nr_bits

number of bits in the bitmask

Description

Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues.

bool netif_attr_test_online(unsigned long j, const unsigned long *online_mask, unsigned int nr_bits)

Test for online CPU/Rx queue

Parameters

unsigned long j

CPU/Rx queue index

const unsigned long *online_mask

bitmask for CPUs/Rx queues that are online

unsigned int nr_bits

number of bits in the bitmask

Description

Returns true if a CPU/Rx queue is online.

unsigned int netif_attrmask_next(int n, const unsigned long *srcp, unsigned int nr_bits)

get the next CPU/Rx queue in a cpu/Rx queues mask

Parameters

int n

CPU/Rx queue index

const unsigned long *srcp

the cpumask/Rx queue mask pointer

unsigned int nr_bits

number of bits in the bitmask

Description

Returns >= nr_bits if no further CPUs/Rx queues set.

int netif_attrmask_next_and(int n, const unsigned long *src1p, const unsigned long *src2p, unsigned int nr_bits)

get the next CPU/Rx queue in *src1p & *src2p

Parameters

int n

CPU/Rx queue index

const unsigned long *src1p

the first CPUs/Rx queues mask pointer

const unsigned long *src2p

the second CPUs/Rx queues mask pointer

unsigned int nr_bits

number of bits in the bitmask

Description

Returns >= nr_bits if no further CPUs/Rx queues set in both.

bool netif_is_multiqueue(const struct net_device *dev)

test if device has multiple transmit queues

Parameters

const struct net_device *dev

network device

Description

Check if device has multiple transmit queues

void dev_put(struct net_device *dev)

release reference to device

Parameters

struct net_device *dev

network device

Description

Release reference to device to allow it to be freed.

void dev_hold(struct net_device *dev)

get reference to device

Parameters

struct net_device *dev

network device

Description

Hold reference to device to keep it from being freed.

bool netif_carrier_ok(const struct net_device *dev)

test if carrier present

Parameters

const struct net_device *dev

network device

Description

Check if carrier is present on device

void netif_dormant_on(struct net_device *dev)

mark device as dormant.

Parameters

struct net_device *dev

network device

Description

Mark device as dormant (as per RFC2863).

The dormant state indicates that the relevant interface is not actually in a condition to pass packets (i.e., it is not ‘up’) but is in a “pending” state, waiting for some external event. For “on-demand” interfaces, this new state identifies the situation where the interface is waiting for events to place it in the up state.

void netif_dormant_off(struct net_device *dev)

set device as not dormant.

Parameters

struct net_device *dev

network device

Description

Device is not in dormant state.

bool netif_dormant(const struct net_device *dev)

test if device is dormant

Parameters

const struct net_device *dev

network device

Description

Check if device is dormant.

void netif_testing_on(struct net_device *dev)

mark device as under test.

Parameters

struct net_device *dev

network device

Description

Mark device as under test (as per RFC2863).

The testing state indicates that some test(s) must be performed on the interface. After completion of the test, the interface state will change to up, dormant, or down, as appropriate.

void netif_testing_off(struct net_device *dev)

set device as not under test.

Parameters

struct net_device *dev

network device

Description

Device is not in testing state.

bool netif_testing(const struct net_device *dev)

test if device is under test

Parameters

const struct net_device *dev

network device

Description

Check if device is under test

bool netif_oper_up(const struct net_device *dev)

test if device is operational

Parameters

const struct net_device *dev

network device

Description

Check if carrier is operational

bool netif_device_present(const struct net_device *dev)

is device available or removed

Parameters

const struct net_device *dev

network device

Description

Check if device has not been removed from system.

void netif_tx_lock(struct net_device *dev)

grab network device transmit lock

Parameters

struct net_device *dev

network device

Description

Get network device transmit lock

int __dev_uc_sync(struct net_device *dev, int (*sync)(struct net_device*, const unsigned char*), int (*unsync)(struct net_device*, const unsigned char*))

Synchronize device’s unicast list

Parameters

struct net_device *dev

device to sync

int (*sync)(struct net_device *, const unsigned char *)

function to call if address should be added

int (*unsync)(struct net_device *, const unsigned char *)

function to call if address should be removed

Add newly added addresses to the interface, and release addresses that have been deleted.

void __dev_uc_unsync(struct net_device *dev, int (*unsync)(struct net_device*, const unsigned char*))

Remove synchronized addresses from device

Parameters

struct net_device *dev

device to sync

int (*unsync)(struct net_device *, const unsigned char *)

function to call if address should be removed

Remove all addresses that were added to the device by dev_uc_sync().

int __dev_mc_sync(struct net_device *dev, int (*sync)(struct net_device*, const unsigned char*), int (*unsync)(struct net_device*, const unsigned char*))

Synchronize device’s multicast list

Parameters

struct net_device *dev

device to sync

int (*sync)(struct net_device *, const unsigned char *)

function to call if address should be added

int (*unsync)(struct net_device *, const unsigned char *)

function to call if address should be removed

Add newly added addresses to the interface, and release addresses that have been deleted.

void __dev_mc_unsync(struct net_device *dev, int (*unsync)(struct net_device*, const unsigned char*))

Remove synchronized addresses from device

Parameters

struct net_device *dev

device to sync

int (*unsync)(struct net_device *, const unsigned char *)

function to call if address should be removed

Remove all addresses that were added to the device by dev_mc_sync().

PHY Support

void phy_print_status(struct phy_device *phydev)

Convenience function to print out the current phy status

Parameters

struct phy_device *phydev

the phy_device struct

int phy_restart_aneg(struct phy_device *phydev)

restart auto-negotiation

Parameters

struct phy_device *phydev

target phy_device struct

Description

Restart the autonegotiation on phydev. Returns >= 0 on success or negative errno on error.

int phy_aneg_done(struct phy_device *phydev)

return auto-negotiation status

Parameters

struct phy_device *phydev

target phy_device struct

Description

Return the auto-negotiation status from this phydev. Returns > 0 on success or < 0 on error. 0 means that auto-negotiation is still pending.

int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd)

generic PHY MII ioctl interface

Parameters

struct phy_device *phydev

the phy_device struct

struct ifreq *ifr

struct ifreq for socket ioctl’s

int cmd

ioctl cmd to execute

Description

Note that this function is currently incompatible with the PHYCONTROL layer. It changes registers without regard to current state. Use at own risk.

int phy_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)

generic ndo_eth_ioctl implementation

Parameters

struct net_device *dev

the net_device struct

struct ifreq *ifr

struct ifreq for socket ioctl’s

int cmd

ioctl cmd to execute

int phy_do_ioctl_running(struct net_device *dev, struct ifreq *ifr, int cmd)

generic ndo_eth_ioctl implementation but test first

Parameters

struct net_device *dev

the net_device struct

struct ifreq *ifr

struct ifreq for socket ioctl’s

int cmd

ioctl cmd to execute

Description

Same as phy_do_ioctl, but ensures that net_device is running before handling the ioctl.

void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies)

Trigger the state machine to run soon

Parameters

struct phy_device *phydev

the phy_device struct

unsigned long jiffies

Run the state machine after these jiffies

void phy_trigger_machine(struct phy_device *phydev)

Trigger the state machine to run now

Parameters

struct phy_device *phydev

the phy_device struct

int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data)

Get the statistic counter names

Parameters

struct phy_device *phydev

the phy_device struct

u8 *data

Where to put the strings

int phy_ethtool_get_sset_count(struct phy_device *phydev)

Get the number of statistic counters

Parameters

struct phy_device *phydev

the phy_device struct

int phy_ethtool_get_stats(struct phy_device *phydev, struct ethtool_stats *stats, u64 *data)

Get the statistic counters

Parameters

struct phy_device *phydev

the phy_device struct

struct ethtool_stats *stats

What counters to get

u64 *data

Where to store the counters

int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack)

Start a cable test

Parameters

struct phy_device *phydev

the phy_device struct

struct netlink_ext_ack *extack

extack for reporting useful error messages

int phy_start_cable_test_tdr(struct phy_device *phydev, struct netlink_ext_ack *extack, const struct phy_tdr_config *config)

Start a raw TDR cable test

Parameters

struct phy_device *phydev

the phy_device struct

struct netlink_ext_ack *extack

extack for reporting useful error messages

const struct phy_tdr_config *config

Configuration of the test to run

int phy_start_aneg(struct phy_device *phydev)

start auto-negotiation for this PHY device

Parameters

struct phy_device *phydev

the phy_device struct

Description

Sanitizes the settings (if we’re not autonegotiating them), and then calls the driver’s config_aneg function. If the PHYCONTROL Layer is operating, we change the state to reflect the beginning of Auto-negotiation or forcing.

int phy_speed_down(struct phy_device *phydev, bool sync)

set speed to lowest speed supported by both link partners

Parameters

struct phy_device *phydev

the phy_device struct

bool sync

perform action synchronously

Description

Typically used to save energy when waiting for a WoL packet

WARNING: Setting sync to false may cause the system being unable to suspend in case the PHY generates an interrupt when finishing the autonegotiation. This interrupt may wake up the system immediately after suspend. Therefore use sync = false only if you’re sure it’s safe with the respective network chip.

int phy_speed_up(struct phy_device *phydev)

(re)set advertised speeds to all supported speeds

Parameters

struct phy_device *phydev

the phy_device struct

Description

Used to revert the effect of phy_speed_down

void phy_start_machine(struct phy_device *phydev)

start PHY state machine tracking

Parameters

struct phy_device *phydev

the phy_device struct

Description

The PHY infrastructure can run a state machine which tracks whether the PHY is starting up, negotiating, etc. This function starts the delayed workqueue which tracks the state of the PHY. If you want to maintain your own state machine, do not call this function.

void phy_error(struct phy_device *phydev)

enter HALTED state for this PHY device

Parameters

struct phy_device *phydev

target phy_device struct

Description

Moves the PHY to the HALTED state in response to a read or write error, and tells the controller the link is down. Must not be called from interrupt context, or while the phydev->lock is held.

void phy_request_interrupt(struct phy_device *phydev)

request and enable interrupt for a PHY device

Parameters

struct phy_device *phydev

target phy_device struct

Description

Request and enable the interrupt for the given PHY.

If this fails, then we set irq to PHY_POLL. This should only be called with a valid IRQ number.

void phy_free_interrupt(struct phy_device *phydev)

disable and free interrupt for a PHY device

Parameters

struct phy_device *phydev

target phy_device struct

Description

Disable and free the interrupt for the given PHY.

This should only be called with a valid IRQ number.

void phy_stop(struct phy_device *phydev)

Bring down the PHY link, and stop checking the status

Parameters

struct phy_device *phydev

target phy_device struct

void phy_start(struct phy_device *phydev)

start or restart a PHY device

Parameters

struct phy_device *phydev

target phy_device struct

Description

Indicates the attached device’s readiness to handle PHY-related work. Used during startup to start the PHY, and after a call to phy_stop() to resume operation. Also used to indicate the MDIO bus has cleared an error condition.

void phy_mac_interrupt(struct phy_device *phydev)

MAC says the link has changed

Parameters

struct phy_device *phydev

phy_device struct with changed link

Description

The MAC layer is able to indicate there has been a change in the PHY link status. Trigger the state machine and work a work queue.

int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable)

init and check the EEE feature

Parameters

struct phy_device *phydev

target phy_device struct

bool clk_stop_enable

PHY may stop the clock during LPI

Description

it checks if the Energy-Efficient Ethernet (EEE) is supported by looking at the MMD registers 3.20 and 7.60/61 and it programs the MMD register 3.0 setting the “Clock stop enable” bit if required.

int phy_get_eee_err(struct phy_device *phydev)

report the EEE wake error count

Parameters

struct phy_device *phydev

target phy_device struct

Description

Reports the number of times the PHY failed to complete its normal wake sequence.

int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_eee *data)

get EEE supported and status

Parameters

struct phy_device *phydev

target phy_device struct

struct ethtool_eee *data

ethtool_eee data

Description

Reports the Supported/Advertisement/LP Advertisement capabilities.

int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data)

set EEE supported and status

Parameters

struct phy_device *phydev

target phy_device struct

struct ethtool_eee *data

ethtool_eee data

Description

Programs the Advertisement EEE register.

int phy_ethtool_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol)

Configure Wake On LAN

Parameters

struct phy_device *phydev

target phy_device struct

struct ethtool_wolinfo *wol

Configuration requested

void phy_ethtool_get_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol)

Get the current Wake On LAN configuration

Parameters

struct phy_device *phydev

target phy_device struct

struct ethtool_wolinfo *wol

Store the current configuration here

int phy_ethtool_nway_reset(struct net_device *ndev)

Restart auto negotiation

Parameters

struct net_device *ndev

Network device to restart autoneg for

int phy_config_interrupt(struct phy_device *phydev, bool interrupts)

configure the PHY device for the requested interrupts

Parameters

struct phy_device *phydev

the phy_device struct

bool interrupts

interrupt flags to configure for this phydev

Description

Returns 0 on success or < 0 on error.

const struct phy_setting *phy_find_valid(int speed, int duplex, unsigned long *supported)

find a PHY setting that matches the requested parameters

Parameters

int speed

desired speed

int duplex

desired duplex

unsigned long *supported

mask of supported link modes

Description

Locate a supported phy setting that is, in priority order:

- an exact match for the specified speed and duplex mode
- a match for the specified speed, or slower speed
- the slowest supported speed

Returns the matched phy_setting entry, or NULL if no supported phy settings were found.

unsigned int phy_supported_speeds(struct phy_device *phy, unsigned int *speeds, unsigned int size)

return all speeds currently supported by a phy device

Parameters

struct phy_device *phy

The phy device to return supported speeds of.

unsigned int *speeds

buffer to store supported speeds in.

unsigned int size

size of speeds buffer.

Description

Returns the number of supported speeds, and fills the speeds buffer with the supported speeds. If speeds buffer is too small to contain all currently supported speeds, will return as many speeds as can fit.

bool phy_check_valid(int speed, int duplex, unsigned long *features)

check if there is a valid PHY setting which matches speed, duplex, and feature mask

Parameters

int speed

speed to match

int duplex

duplex to match

unsigned long *features

A mask of the valid settings

Description

Returns true if there is a valid setting, false otherwise.

void phy_sanitize_settings(struct phy_device *phydev)

make sure the PHY is set to supported speed and duplex

Parameters

struct phy_device *phydev

the target phy_device struct

Description

Make sure the PHY is set to supported speeds and duplexes. Drop down by one in this order: 1000/FULL, 1000/HALF, 100/FULL, 100/HALF, 10/FULL, 10/HALF.

check link status and set state accordingly

Parameters

struct phy_device *phydev

the phy_device struct

Description

Check for link and whether autoneg was triggered / is running and set state accordingly

int _phy_start_aneg(struct phy_device *phydev)

start auto-negotiation for this PHY device

Parameters

struct phy_device *phydev

the phy_device struct

Description

Sanitizes the settings (if we’re not autonegotiating them), and then calls the driver’s config_aneg function. If the PHYCONTROL Layer is operating, we change the state to reflect the beginning of Auto-negotiation or forcing.

void phy_stop_machine(struct phy_device *phydev)

stop the PHY state machine tracking

Parameters

struct phy_device *phydev

target phy_device struct

Description

Stops the state machine delayed workqueue, sets the state to UP (unless it wasn’t up yet). This function must be called BEFORE phy_detach.

int phy_disable_interrupts(struct phy_device *phydev)

Disable the PHY interrupts from the PHY side

Parameters

struct phy_device *phydev

target phy_device struct

irqreturn_t phy_interrupt(int irq, void *phy_dat)

PHY interrupt handler

Parameters

int irq

interrupt line

void *phy_dat

phy_device pointer

Description

Handle PHY interrupt

int phy_enable_interrupts(struct phy_device *phydev)

Enable the interrupts from the PHY side

Parameters

struct phy_device *phydev

target phy_device struct

void phy_state_machine(struct work_struct *work)

Handle the state machine

Parameters

struct work_struct *work

work_struct that describes the work to be done

const char *phy_speed_to_str(int speed)

Return a string representing the PHY link speed

Parameters

int speed

Speed of the link

const char *phy_duplex_to_str(unsigned int duplex)

Return string describing the duplex

Parameters

unsigned int duplex

Duplex setting to describe

const struct phy_setting *phy_lookup_setting(int speed, int duplex, const unsigned long *mask, bool exact)

lookup a PHY setting

Parameters

int speed

speed to match

int duplex

duplex to match

const unsigned long *mask

allowed link modes

bool exact

an exact match is required

Description

Search the settings array for a setting that matches the speed and duplex, and which is supported.

If exact is set, either an exact match or NULL for no match will be returned.

If exact is unset, an exact match, the fastest supported setting at or below the specified speed, the slowest supported setting, or if they all fail, NULL will be returned.

int phy_set_max_speed(struct phy_device *phydev, u32 max_speed)

Set the maximum speed the PHY should support

Parameters

struct phy_device *phydev

The phy_device struct

u32 max_speed

Maximum speed

Description

The PHY might be more capable than the MAC. For example, a Fast Ethernet MAC may be connected to a 1G PHY. This function allows the MAC to indicate its maximum speed, and so limit what the PHY will advertise.

void phy_resolve_aneg_pause(struct phy_device *phydev)

Determine pause autoneg results

Parameters

struct phy_device *phydev

The phy_device struct

Description

Once autoneg has completed the local pause settings can be resolved. Determine if pause and asymmetric pause should be used by the MAC.

void phy_resolve_aneg_linkmode(struct phy_device *phydev)

resolve the advertisements into PHY settings

Parameters

struct phy_device *phydev

The phy_device struct

Description

Resolve our and the link partner advertisements into their corresponding speed and duplex. If full duplex was negotiated, extract the pause mode from the link partner mask.

void phy_check_downshift(struct phy_device *phydev)

check whether downshift occurred

Parameters

struct phy_device *phydev

The phy_device struct

Description

Check whether a downshift to a lower speed occurred. If this should be the case warn the user. Prerequisite for detecting downshift is that PHY driver implements the read_status callback and sets phydev->speed to the actual link speed.

int __phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum)

Convenience function for reading a register from an MMD on a given PHY.

Parameters

struct phy_device *phydev

The phy_device struct

int devad

The MMD to read from (0..31)

u32 regnum

The register on the MMD to read (0..65535)

Description

Same rules as for __phy_read();

int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum)

Convenience function for reading a register from an MMD on a given PHY.

Parameters

struct phy_device *phydev

The phy_device struct

int devad

The MMD to read from

u32 regnum

The register on the MMD to read

Description

Same rules as for phy_read();

int __phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val)

Convenience function for writing a register on an MMD on a given PHY.

Parameters

struct phy_device *phydev

The phy_device struct

int devad

The MMD to write to

u32 regnum

The register on the MMD to write

u16 val

value to write to regnum

Description

Same rules as for __phy_write();

int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val)

Convenience function for writing a register on an MMD on a given PHY.

Parameters

struct phy_device *phydev

The phy_device struct

int devad

The MMD to write to

u32 regnum

The register on the MMD to write

u16 val

value to write to regnum

Description

Same rules as for phy_write();

int phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, u16 set)

Function for modifying a PHY register

Parameters

struct phy_device *phydev

the phy_device struct

u32 regnum

register number to modify

u16 mask

bit mask of bits to clear

u16 set

new value of bits set in mask to write to regnum

NOTE

MUST NOT be called from interrupt context, because the bus read/write functions may wait for an interrupt to conclude the operation.

Description

Returns negative errno, 0 if there was no change, and 1 in case of change

int __phy_modify(struct phy_device *phydev, u32 regnum, u16 mask, u16 set)

Convenience function for modifying a PHY register

Parameters

struct phy_device *phydev

the phy_device struct

u32 regnum

register number to modify

u16 mask

bit mask of bits to clear

u16 set

new value of bits set in mask to write to regnum

NOTE

MUST NOT be called from interrupt context, because the bus read/write functions may wait for an interrupt to conclude the operation.

int phy_modify(struct phy_device *phydev, u32 regnum, u16 mask, u16 set)

Convenience function for modifying a given PHY register

Parameters

struct phy_device *phydev

the phy_device struct

u32 regnum

register number to modify

u16 mask

bit mask of bits to clear

u16 set

new value of bits set in mask to write to regnum

NOTE

MUST NOT be called from interrupt context, because the bus read/write functions may wait for an interrupt to conclude the operation.

int __phy_modify_mmd_changed(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set)

Function for modifying a register on MMD

Parameters

struct phy_device *phydev

the phy_device struct

int devad

the MMD containing register to modify

u32 regnum

register number to modify

u16 mask

bit mask of bits to clear

u16 set

new value of bits set in mask to write to regnum

Description

Unlocked helper function which allows a MMD register to be modified as new register value = (old register value & ~mask) | set

Returns negative errno, 0 if there was no change, and 1 in case of change

int phy_modify_mmd_changed(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set)

Function for modifying a register on MMD

Parameters

struct phy_device *phydev

the phy_device struct

int devad

the MMD containing register to modify

u32 regnum

register number to modify

u16 mask

bit mask of bits to clear

u16 set

new value of bits set in mask to write to regnum

NOTE

MUST NOT be called from interrupt context, because the bus read/write functions may wait for an interrupt to conclude the operation.

Description

Returns negative errno, 0 if there was no change, and 1 in case of change

int __phy_modify_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set)

Convenience function for modifying a register on MMD

Parameters

struct phy_device *phydev

the phy_device struct

int devad

the MMD containing register to modify

u32 regnum

register number to modify

u16 mask

bit mask of bits to clear

u16 set

new value of bits set in mask to write to regnum

NOTE

MUST NOT be called from interrupt context, because the bus read/write functions may wait for an interrupt to conclude the operation.

int phy_modify_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set)

Convenience function for modifying a register on MMD

Parameters

struct phy_device *phydev

the phy_device struct

int devad

the MMD containing register to modify

u32 regnum

register number to modify

u16 mask

bit mask of bits to clear

u16 set

new value of bits set in mask to write to regnum

NOTE

MUST NOT be called from interrupt context, because the bus read/write functions may wait for an interrupt to conclude the operation.

int phy_save_page(struct phy_device *phydev)

take the bus lock and save the current page

Parameters

struct phy_device *phydev

a pointer to a struct phy_device

Description

Take the MDIO bus lock, and return the current page number. On error, returns a negative errno. phy_restore_page() must always be called after this, irrespective of success or failure of this call.

int phy_select_page(struct phy_device *phydev, int page)

take the bus lock, save the current page, and set a page

Parameters

struct phy_device *phydev

a pointer to a struct phy_device

int page

desired page

Description

Take the MDIO bus lock to protect against concurrent access, save the current PHY page, and set the current page. On error, returns a negative errno, otherwise returns the previous page number. phy_restore_page() must always be called after this, irrespective of success or failure of this call.

int phy_restore_page(struct phy_device *phydev, int oldpage, int ret)

restore the page register and release the bus lock

Parameters

struct phy_device *phydev

a pointer to a struct phy_device

int oldpage

the old page, return value from phy_save_page() or phy_select_page()

int ret

operation’s return code

Description

Release the MDIO bus lock, restoring oldpage if it is a valid page. This function propagates the earliest error code from the group of operations.

Return

oldpage if it was a negative value, otherwise ret if it was a negative errno value, otherwise phy_write_page()’s negative value if it were in error, otherwise ret.

int phy_read_paged(struct phy_device *phydev, int page, u32 regnum)

Convenience function for reading a paged register

Parameters

struct phy_device *phydev

a pointer to a struct phy_device

int page

the page for the phy

u32 regnum

register number

Description

Same rules as for phy_read().

int phy_write_paged(struct phy_device *phydev, int page, u32 regnum, u16 val)

Convenience function for writing a paged register

Parameters

struct phy_device *phydev

a pointer to a struct phy_device

int page

the page for the phy

u32 regnum

register number

u16 val

value to write

Description

Same rules as for phy_write().

int phy_modify_paged_changed(struct phy_device *phydev, int page, u32 regnum, u16 mask, u16 set)

Function for modifying a paged register

Parameters

struct phy_device *phydev

a pointer to a struct phy_device

int page

the page for the phy

u32 regnum

register number

u16 mask

bit mask of bits to clear

u16 set

bit mask of bits to set

Description

Returns negative errno, 0 if there was no change, and 1 in case of change

int phy_modify_paged(struct phy_device *phydev, int page, u32 regnum, u16 mask, u16 set)

Convenience function for modifying a paged register

Parameters

struct phy_device *phydev

a pointer to a struct phy_device

int page

the page for the phy

u32 regnum

register number

u16 mask

bit mask of bits to clear

u16 set

bit mask of bits to set

Description

Same rules as for phy_read() and phy_write().

int genphy_c45_pma_resume(struct phy_device *phydev)

wakes up the PMA module

Parameters

struct phy_device *phydev

target phy_device struct

int genphy_c45_pma_suspend(struct phy_device *phydev)

suspends the PMA module

Parameters

struct phy_device *phydev

target phy_device struct

int genphy_c45_pma_setup_forced(struct phy_device *phydev)

configures a forced speed

Parameters

struct phy_device *phydev

target phy_device struct

int genphy_c45_an_config_aneg(struct phy_device *phydev)

configure advertisement registers

Parameters

struct phy_device *phydev

target phy_device struct

Description

Configure advertisement registers based on modes set in phydev->advertising

Returns negative errno code on failure, 0 if advertisement didn’t change, or 1 if advertised modes changed.

int genphy_c45_an_disable_aneg(struct phy_device *phydev)

disable auto-negotiation

Parameters

struct phy_device *phydev

target phy_device struct

Description

Disable auto-negotiation in the Clause 45 PHY. The link parameters are controlled through the PMA/PMD MMD registers.

Returns zero on success, negative errno code on failure.

int genphy_c45_restart_aneg(struct phy_device *phydev)

Enable and restart auto-negotiation

Parameters

struct phy_device *phydev

target phy_device struct

Description

This assumes that the auto-negotiation MMD is present.

Enable and restart auto-negotiation.

int genphy_c45_check_and_restart_aneg(struct phy_device *phydev, bool restart)

Enable and restart auto-negotiation

Parameters

struct phy_device *phydev

target phy_device struct

bool restart

whether aneg restart is requested

Description

This assumes that the auto-negotiation MMD is present.

Check, and restart auto-negotiation if needed.

int genphy_c45_aneg_done(struct phy_device *phydev)

return auto-negotiation complete status

Parameters

struct phy_device *phydev

target phy_device struct

Description

This assumes that the auto-negotiation MMD is present.

Reads the status register from the auto-negotiation MMD, returning: a positive value if auto-negotiation is complete; a negative errno code on error; zero otherwise.

int genphy_c45_read_link(struct phy_device *phydev)

read the overall link status from the MMDs

Parameters

struct phy_device *phydev

target phy_device struct

Description

Read the link status from the specified MMDs, and if they all indicate that the link is up, set phydev->link to 1. If an error is encountered, a negative errno will be returned, otherwise zero.

int genphy_c45_read_lpa(struct phy_device *phydev)

read the link partner advertisement and pause

Parameters

struct phy_device *phydev

target phy_device struct

Description

Read the Clause 45 defined base (7.19) and 10G (7.33) status registers, filling in the link partner advertisement, pause and asym_pause members in phydev. This assumes that the auto-negotiation MMD is present, and the backplane bit (7.48.0) is clear. Clause 45 PHY drivers are expected to fill in the remainder of the link partner advert from vendor registers.

int genphy_c45_read_pma(struct phy_device *phydev)

read link speed etc from PMA

Parameters

struct phy_device *phydev

target phy_device struct

int genphy_c45_read_mdix(struct phy_device *phydev)

read mdix status from PMA

Parameters

struct phy_device *phydev

target phy_device struct

int genphy_c45_pma_read_abilities(struct phy_device *phydev)

read supported link modes from PMA

Parameters

struct phy_device *phydev

target phy_device struct

Description

Read the supported link modes from the PMA Status 2 (1.8) register. If bit 1.8.9 is set, the list of supported modes is built using the values in the PMA Extended Abilities (1.11) register, indicating 1000BASET and 10G related modes. If bit 1.11.14 is set, then the list is also extended with the modes in the 2.5G/5G PMA Extended register (1.21), indicating if 2.5GBASET and 5GBASET are supported.

int genphy_c45_read_status(struct phy_device *phydev)

read PHY status

Parameters

struct phy_device *phydev

target phy_device struct

Description

Reads status from PHY and sets phy_device members accordingly.

int genphy_c45_config_aneg(struct phy_device *phydev)

restart auto-negotiation or forced setup

Parameters

struct phy_device *phydev

target phy_device struct

Description

If auto-negotiation is enabled, we configure the advertising, and then restart auto-negotiation. If it is not enabled, then we force a configuration.

int genphy_c45_fast_retrain(struct phy_device *phydev, bool enable)

configure fast retrain registers

Parameters

struct phy_device *phydev

target phy_device struct

bool enable

enable fast retrain or not

Description

If fast-retrain is enabled, we configure PHY as advertising fast retrain capable and THP Bypass Request, then enable fast retrain. If it is not enabled, we configure fast retrain disabled.

enum phy_interface_t

Interface Mode definitions

Constants

PHY_INTERFACE_MODE_NA

Not Applicable - don’t touch

PHY_INTERFACE_MODE_INTERNAL

No interface, MAC and PHY combined

PHY_INTERFACE_MODE_MII

Media-independent interface

PHY_INTERFACE_MODE_GMII

Gigabit media-independent interface

PHY_INTERFACE_MODE_SGMII

Serial gigabit media-independent interface

PHY_INTERFACE_MODE_TBI

Ten Bit Interface

PHY_INTERFACE_MODE_REVMII

Reverse Media Independent Interface

PHY_INTERFACE_MODE_RMII

Reduced Media Independent Interface

PHY_INTERFACE_MODE_REVRMII

Reduced Media Independent Interface in PHY role

PHY_INTERFACE_MODE_RGMII

Reduced gigabit media-independent interface

PHY_INTERFACE_MODE_RGMII_ID

RGMII with Internal RX+TX delay

PHY_INTERFACE_MODE_RGMII_RXID

RGMII with Internal RX delay

PHY_INTERFACE_MODE_RGMII_TXID

RGMII with Internal TX delay

PHY_INTERFACE_MODE_RTBI

Reduced TBI

PHY_INTERFACE_MODE_SMII

Serial MII

PHY_INTERFACE_MODE_XGMII

10 gigabit media-independent interface

PHY_INTERFACE_MODE_XLGMII

40 gigabit media-independent interface

PHY_INTERFACE_MODE_MOCA

Multimedia over Coax

PHY_INTERFACE_MODE_QSGMII

Quad SGMII

PHY_INTERFACE_MODE_TRGMII

Turbo RGMII

PHY_INTERFACE_MODE_100BASEX

100 BaseX

PHY_INTERFACE_MODE_1000BASEX

1000 BaseX

PHY_INTERFACE_MODE_2500BASEX

2500 BaseX

PHY_INTERFACE_MODE_5GBASER

5G BaseR

PHY_INTERFACE_MODE_RXAUI

Reduced XAUI

PHY_INTERFACE_MODE_XAUI

10 Gigabit Attachment Unit Interface

PHY_INTERFACE_MODE_10GBASER

10G BaseR

PHY_INTERFACE_MODE_25GBASER

25G BaseR

PHY_INTERFACE_MODE_USXGMII

Universal Serial 10GE MII

PHY_INTERFACE_MODE_10GKR

10GBASE-KR - with Clause 73 AN

PHY_INTERFACE_MODE_MAX

Book keeping

Description

Describes the interface between the MAC and PHY.

const char *phy_modes(phy_interface_t interface)

map phy_interface_t enum to device tree binding of phy-mode

Parameters

phy_interface_t interface

enum phy_interface_t value

Description

maps enum phy_interface_t defined in this file into the device tree binding of ‘phy-mode’, so that Ethernet device driver can get PHY interface from device tree.

struct mdio_bus_stats

Statistics counters for MDIO busses

Definition

struct mdio_bus_stats {
  u64_stats_t transfers;
  u64_stats_t errors;
  u64_stats_t writes;
  u64_stats_t reads;
  struct u64_stats_sync syncp;
};

Members

transfers

Total number of transfers, i.e. writes + reads

errors

Number of MDIO transfers that returned an error

writes

Number of write transfers

reads

Number of read transfers

syncp

Synchronisation for incrementing statistics

struct phy_package_shared

Shared information in PHY packages

Definition

struct phy_package_shared {
  int addr;
  refcount_t refcnt;
  unsigned long flags;
  size_t priv_size;
  void *priv;
};

Members

addr

Common PHY address used to combine PHYs in one package

refcnt

Number of PHYs connected to this shared data

flags

Initialization of PHY package

priv_size

Size of the shared private data priv

priv

Driver private data shared across a PHY package

Description

Represents a shared structure between different phydev’s in the same package, for example a quad PHY. See phy_package_join() and phy_package_leave().

struct mii_bus

Represents an MDIO bus

Definition

struct mii_bus {
  struct module *owner;
  const char *name;
  char id[MII_BUS_ID_SIZE];
  void *priv;
  int (*read)(struct mii_bus *bus, int addr, int regnum);
  int (*write)(struct mii_bus *bus, int addr, int regnum, u16 val);
  int (*reset)(struct mii_bus *bus);
  struct mdio_bus_stats stats[PHY_MAX_ADDR];
  struct mutex mdio_lock;
  struct device *parent;
  enum {
    MDIOBUS_ALLOCATED = 1,
    MDIOBUS_REGISTERED,
    MDIOBUS_UNREGISTERED,
    MDIOBUS_RELEASED,
  } state;
  struct device dev;
  struct mdio_device *mdio_map[PHY_MAX_ADDR];
  u32 phy_mask;
  u32 phy_ignore_ta_mask;
  int irq[PHY_MAX_ADDR];
  int reset_delay_us;
  int reset_post_delay_us;
  struct gpio_desc *reset_gpiod;
  enum {
    MDIOBUS_NO_CAP = 0,
    MDIOBUS_C22,
    MDIOBUS_C45,
    MDIOBUS_C22_C45,
  } probe_capabilities;
  struct mutex shared_lock;
  struct phy_package_shared *shared[PHY_MAX_ADDR];
};

Members

owner

Who owns this device

name

User friendly name for this MDIO device, or driver name

id

Unique identifier for this bus, typically from bus hierarchy

priv

Driver private data

read

Perform a read transfer on the bus

write

Perform a write transfer on the bus

reset

Perform a reset of the bus

stats

Statistic counters per device on the bus

mdio_lock

A lock to ensure that only one thing can read/write the MDIO bus at a time

parent

Parent device of this bus

state

State of bus structure

dev

Kernel device representation

mdio_map

list of all MDIO devices on bus

phy_mask

PHY addresses to be ignored when probing

phy_ignore_ta_mask

PHY addresses to ignore the TA/read failure

irq

An array of interrupts, each PHY’s interrupt at the index matching its address

reset_delay_us

GPIO reset pulse width in microseconds

reset_post_delay_us

GPIO reset deassert delay in microseconds

reset_gpiod

Reset GPIO descriptor pointer

probe_capabilities

bus capabilities, used for probing

shared_lock

protect access to the shared element

shared

shared state across different PHYs

Description

The Bus class for PHYs. Devices which provide access to PHYs should register using this structure

struct mii_bus *mdiobus_alloc(void)

Allocate an MDIO bus structure

Parameters

void

no arguments

Description

The internal state of the MDIO bus will be set to MDIOBUS_ALLOCATED, ready for the driver to register the bus.

enum phy_state

PHY state machine states:

Constants

PHY_DOWN

PHY device and driver are not ready for anything. probe should be called if and only if the PHY is in this state, given that the PHY device exists. - PHY driver probe function will set the state to PHY_READY

PHY_READY

PHY is ready to send and receive packets, but the controller is not. By default, PHYs which do not implement probe will be set to this state by phy_probe(). - start will set the state to UP

PHY_HALTED

PHY is up, but no polling or interrupts are done. Or PHY is in an error state. - phy_start moves to PHY_UP

PHY_UP

The PHY and attached device are ready to do work. Interrupts should be started here. - timer moves to PHY_NOLINK or PHY_RUNNING

PHY_RUNNING

PHY is currently up, running, and possibly sending and/or receiving packets - irq or timer will set PHY_NOLINK if link goes down - phy_stop moves to PHY_HALTED

PHY_NOLINK

PHY is up, but not currently plugged in. - irq or timer will set PHY_RUNNING if link comes back - phy_stop moves to PHY_HALTED

PHY_CABLETEST

PHY is performing a cable test. Packet reception/sending is not expected to work, carrier will be indicated as down. PHY will be polled once per second, or on interrupt, for its current state. Once complete, move to UP to restart the PHY. - phy_stop aborts the running test and moves to PHY_HALTED

struct phy_c45_device_ids

802.3-c45 Device Identifiers

Definition

struct phy_c45_device_ids {
  u32 devices_in_package;
  u32 mmds_present;
  u32 device_ids[MDIO_MMD_NUM];
};

Members

devices_in_package

IEEE 802.3 devices in package register value.

mmds_present

bit vector of MMDs present.

device_ids

The device identifier for each present device.

struct phy_device

An instance of a PHY

Definition

struct phy_device {
  struct mdio_device mdio;
  struct phy_driver *drv;
  u32 phy_id;
  struct phy_c45_device_ids c45_ids;
  unsigned is_c45:1;
  unsigned is_internal:1;
  unsigned is_pseudo_fixed_link:1;
  unsigned is_gigabit_capable:1;
  unsigned has_fixups:1;
  unsigned suspended:1;
  unsigned suspended_by_mdio_bus:1;
  unsigned sysfs_links:1;
  unsigned loopback_enabled:1;
  unsigned downshifted_rate:1;
  unsigned is_on_sfp_module:1;
  unsigned mac_managed_pm:1;
  unsigned autoneg:1;
  unsigned link:1;
  unsigned autoneg_complete:1;
  unsigned interrupts:1;
  enum phy_state state;
  u32 dev_flags;
  phy_interface_t interface;
  int speed;
  int duplex;
  int port;
  int pause;
  int asym_pause;
  u8 master_slave_get;
  u8 master_slave_set;
  u8 master_slave_state;
  unsigned long supported[BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS)];
  unsigned long advertising[BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS)];
  unsigned long lp_advertising[BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS)];
  unsigned long adv_old[BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS)];
  u32 eee_broken_modes;
#ifdef CONFIG_LED_TRIGGER_PHY
  struct phy_led_trigger *phy_led_triggers;
  unsigned int phy_num_led_triggers;
  struct phy_led_trigger *last_triggered;
  struct phy_led_trigger *led_link_trigger;
#endif
  int irq;
  void *priv;
  struct phy_package_shared *shared;
  struct sk_buff *skb;
  void *ehdr;
  struct nlattr *nest;
  struct delayed_work state_queue;
  struct mutex lock;
  bool sfp_bus_attached;
  struct sfp_bus *sfp_bus;
  struct phylink *phylink;
  struct net_device *attached_dev;
  struct mii_timestamper *mii_ts;
  u8 mdix;
  u8 mdix_ctrl;
  void (*phy_link_change)(struct phy_device *phydev, bool up);
  void (*adjust_link)(struct net_device *dev);
#if IS_ENABLED(CONFIG_MACSEC)
  const struct macsec_ops *macsec_ops;
#endif
};

Members

mdio

MDIO bus this PHY is on

drv

Pointer to the driver for this PHY instance

phy_id

UID for this device found during discovery

c45_ids

802.3-c45 Device Identifiers if is_c45.

is_c45

Set to true if this PHY uses clause 45 addressing.

is_internal

Set to true if this PHY is internal to a MAC.

is_pseudo_fixed_link

Set to true if this PHY is an Ethernet switch, etc.

is_gigabit_capable

Set to true if PHY supports 1000Mbps

has_fixups

Set to true if this PHY has fixups/quirks.

suspended

Set to true if this PHY has been suspended successfully.

suspended_by_mdio_bus

Set to true if this PHY was suspended by MDIO bus.

sysfs_links

Internal boolean tracking sysfs symbolic links setup/removal.

loopback_enabled

Set true if this PHY has been loopbacked successfully.

downshifted_rate

Set true if link speed has been downshifted.

is_on_sfp_module

Set true if PHY is located on an SFP module.

mac_managed_pm

Set true if MAC driver takes care of suspending/resuming PHY

autoneg

Flag autoneg being used

link

Current link state

autoneg_complete

Flag auto negotiation of the link has completed

interrupts

Flag interrupts have been enabled

state

State of the PHY for management purposes

dev_flags

Device-specific flags used by the PHY driver. Bits [15:0] are free to use by the PHY driver to communicate driver specific behavior. Bits [23:16] are currently reserved for future use. Bits [31:24] are reserved for defining generic PHY driver behavior.

interface

enum phy_interface_t value

speed

Current link speed

duplex

Current duplex

port

Current port

pause

Current pause

asym_pause

Current asymmetric pause

master_slave_get

Current master/slave advertisement

master_slave_set

User requested master/slave configuration

master_slave_state

Current master/slave configuration

supported

Combined MAC/PHY supported linkmodes

advertising

Currently advertised linkmodes

lp_advertising

Current link partner advertised linkmodes

adv_old

Saved advertised while power saving for WoL

eee_broken_modes

Energy efficient ethernet modes which should be prohibited

phy_led_triggers

Array of LED triggers

phy_num_led_triggers

Number of triggers in phy_led_triggers

last_triggered

last LED trigger for link speed

led_link_trigger

LED trigger for link up/down

irq

IRQ number of the PHY’s interrupt (-1 if none)

priv

Pointer to driver private data

shared

Pointer to private data shared by phys in one package

skb

Netlink message for cable diagnostics

ehdr

Netlink header for cable diagnostics

nest

Netlink nest used for cable diagnostics

state_queue

Work queue for state machine

lock

Mutex for serializing access to the PHY

sfp_bus_attached

Flag indicating whether the SFP bus has been attached

sfp_bus

SFP bus attached to this PHY’s fiber port

phylink

Pointer to phylink instance for this PHY

attached_dev

The attached enet driver’s device instance ptr

mii_ts

Pointer to time stamper callbacks

mdix

Current crossover

mdix_ctrl

User setting of crossover

phy_link_change

Callback for phylink for notification of link change

adjust_link

Callback for the enet controller to respond to changes in the link state.

macsec_ops

MACsec offloading ops.

Description

interrupts currently only supports enabled or disabled, but could be changed in the future to support enabling and disabling specific interrupts

Contains some infrastructure for polling and interrupt handling, as well as handling shifts in PHY hardware state

struct phy_tdr_config

Configuration of a TDR raw test

Definition

struct phy_tdr_config {
  u32 first;
  u32 last;
  u32 step;
  s8 pair;
};

Members

first

Distance for first data collection point

last

Distance for last data collection point

step

Step between data collection points

pair

Bitmap of cable pairs to collect data for

Description

A structure containing possible configuration parameters for a TDR cable test. The driver does not need to implement all the parameters, but should report what is actually used. All distances are in centimeters.

struct phy_driver

Driver structure for a particular PHY type

Definition

struct phy_driver {
  struct mdio_driver_common mdiodrv;
  u32 phy_id;
  char *name;
  u32 phy_id_mask;
  const unsigned long * const features;
  u32 flags;
  const void *driver_data;
  int (*soft_reset)(struct phy_device *phydev);
  int (*config_init)(struct phy_device *phydev);
  int (*probe)(struct phy_device *phydev);
  int (*get_features)(struct phy_device *phydev);
  int (*suspend)(struct phy_device *phydev);
  int (*resume)(struct phy_device *phydev);
  int (*config_aneg)(struct phy_device *phydev);
  int (*aneg_done)(struct phy_device *phydev);
  int (*read_status)(struct phy_device *phydev);
  int (*config_intr)(struct phy_device *phydev);
  irqreturn_t (*handle_interrupt)(struct phy_device *phydev);
  void (*remove)(struct phy_device *phydev);
  int (*match_phy_device)(struct phy_device *phydev);
  int (*set_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol);
  void (*get_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol);
  void (*link_change_notify)(struct phy_device *dev);
  int (*read_mmd)(struct phy_device *dev, int devnum, u16 regnum);
  int (*write_mmd)(struct phy_device *dev, int devnum, u16 regnum, u16 val);
  int (*read_page)(struct phy_device *dev);
  int (*write_page)(struct phy_device *dev, int page);
  int (*module_info)(struct phy_device *dev, struct ethtool_modinfo *modinfo);
  int (*module_eeprom)(struct phy_device *dev, struct ethtool_eeprom *ee, u8 *data);
  int (*cable_test_start)(struct phy_device *dev);
  int (*cable_test_tdr_start)(struct phy_device *dev, const struct phy_tdr_config *config);
  int (*cable_test_get_status)(struct phy_device *dev, bool *finished);
  int (*get_sset_count)(struct phy_device *dev);
  void (*get_strings)(struct phy_device *dev, u8 *data);
  void (*get_stats)(struct phy_device *dev, struct ethtool_stats *stats, u64 *data);
  int (*get_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, void *data);
  int (*set_tunable)(struct phy_device *dev,struct ethtool_tunable *tuna, const void *data);
  int (*set_loopback)(struct phy_device *dev, bool enable);
  int (*get_sqi)(struct phy_device *dev);
  int (*get_sqi_max)(struct phy_device *dev);
};

Members

mdiodrv

Data common to all MDIO devices

phy_id

The result of reading the UID registers of this PHY type, and ANDing them with the phy_id_mask. This driver only works for PHYs with IDs which match this field

name

The friendly name of this PHY type

phy_id_mask

Defines the important bits of the phy_id

features

A mandatory list of features (speed, duplex, etc) supported by this PHY

flags

A bitfield defining certain other features this PHY supports (like interrupts)

driver_data

Static driver data

soft_reset

Called to issue a PHY software reset

config_init

Called to initialize the PHY, including after a reset

probe

Called during discovery. Used to set up device-specific structures, if any

get_features

Probe the hardware to determine what abilities it has. Should only set phydev->supported.

suspend

Suspend the hardware, saving state if needed

resume

Resume the hardware, restoring state if needed

config_aneg

Configures the advertisement and resets autonegotiation if phydev->autoneg is on, forces the speed to the current settings in phydev if phydev->autoneg is off

aneg_done

Determines the auto negotiation result

read_status

Determines the negotiated speed and duplex

config_intr

Enables or disables interrupts. It should also clear any pending interrupts prior to enabling the IRQs and after disabling them.

handle_interrupt

Override default interrupt handling

remove

Clears up any memory if needed

match_phy_device

Returns true if this is a suitable driver for the given phydev. If NULL, matching is based on phy_id and phy_id_mask.

set_wol

Some devices (e.g. qnap TS-119P II) require PHY register changes to enable Wake on LAN, so set_wol is provided to be called in the ethernet driver’s set_wol function.

get_wol

See set_wol, but for checking whether Wake on LAN is enabled.

link_change_notify

Called to inform a PHY device driver when the core is about to change the link state. This callback is supposed to be used as fixup hook for drivers that need to take action when the link state changes. Drivers are by no means allowed to mess with the PHY device structure in their implementations.

read_mmd

PHY specific driver override for reading a MMD register. This function is optional for PHY specific drivers. When not provided, the default MMD read function will be used by phy_read_mmd(), which will use either a direct read for Clause 45 PHYs or an indirect read for Clause 22 PHYs. devnum is the MMD device number within the PHY device, regnum is the register within the selected MMD device.

write_mmd

PHY specific driver override for writing a MMD register. This function is optional for PHY specific drivers. When not provided, the default MMD write function will be used by phy_write_mmd(), which will use either a direct write for Clause 45 PHYs, or an indirect write for Clause 22 PHYs. devnum is the MMD device number within the PHY device, regnum is the register within the selected MMD device. val is the value to be written.

read_page

Return the current PHY register page number

write_page

Set the current PHY register page number

module_info

Get the size and type of the eeprom contained within a plug-in module

module_eeprom

Get the eeprom information from the plug-in module

cable_test_start

Start a cable test

cable_test_tdr_start

Start a raw TDR cable test

cable_test_get_status

Once per second, or on interrupt, request the status of the test.

get_sset_count

Number of statistic counters

get_strings

Names of the statistic counters

get_stats

Return the statistic counter values

get_tunable

Return the value of a tunable

set_tunable

Set the value of a tunable

set_loopback

Set the loopback mode of the PHY

get_sqi

Get the signal quality indication

get_sqi_max

Get the maximum signal quality indication

Description

All functions are optional. If config_aneg or read_status are not implemented, the phy core uses the genphy versions. Note that none of these functions should be called from interrupt time. The goal is for the bus read/write functions to be able to block when the bus transaction is happening, and be freed up by an interrupt (The MPC85xx has this ability, though it is not currently supported in the driver).

bool phy_is_started(struct phy_device *phydev)

Convenience function to check whether PHY is started

Parameters

struct phy_device *phydev

The phy_device struct

int phy_read(struct phy_device *phydev, u32 regnum)

Convenience function for reading a given PHY register

Parameters

struct phy_device *phydev

the phy_device struct

u32 regnum

register number to read

NOTE

MUST NOT be called from interrupt context, because the bus read/write functions may wait for an interrupt to conclude the operation.

int __phy_read(struct phy_device *phydev, u32 regnum)

convenience function for reading a given PHY register

Parameters

struct phy_device *phydev

the phy_device struct

u32 regnum

register number to read

Description

The caller must have taken the MDIO bus lock.

int phy_write(struct phy_device *phydev, u32 regnum, u16 val)

Convenience function for writing a given PHY register

Parameters

struct phy_device *phydev

the phy_device struct

u32 regnum

register number to write

u16 val

value to write to regnum

NOTE

MUST NOT be called from interrupt context, because the bus read/write functions may wait for an interrupt to conclude the operation.

int __phy_write(struct phy_device *phydev, u32 regnum, u16 val)

Convenience function for writing a given PHY register

Parameters

struct phy_device *phydev

the phy_device struct

u32 regnum

register number to write

u16 val

value to write to regnum

Description

The caller must have taken the MDIO bus lock.

int __phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, u16 set)

Convenience function for modifying a PHY register

Parameters

struct phy_device *phydev

a pointer to a struct phy_device

u32 regnum

register number

u16 mask

bit mask of bits to clear

u16 set

bit mask of bits to set

Description

Unlocked helper function which allows a PHY register to be modified as new register value = (old register value & ~mask) | set

Returns negative errno, 0 if there was no change, and 1 in case of change

phy_read_mmd_poll_timeout

phy_read_mmd_poll_timeout (phydev, devaddr, regnum, val, cond, sleep_us, timeout_us, sleep_before_read)

Periodically poll a PHY register until a condition is met or a timeout occurs

Parameters

phydev

The phy_device struct

devaddr

The MMD to read from

regnum

The register on the MMD to read

val

Variable to read the register into

cond

Break condition (usually involving val)

sleep_us

Maximum time to sleep between reads in us (0 tight-loops). Should be less than ~20ms since usleep_range is used (see delays - Information on the various kernel delay / sleep mechanisms).

timeout_us

Timeout in us, 0 means never timeout

sleep_before_read

if it is true, sleep sleep_us before read. Returns 0 on success and -ETIMEDOUT upon a timeout. In either case, the last read value at args is stored in val. Must not be called from atomic context if sleep_us or timeout_us are used.

int __phy_set_bits(struct phy_device *phydev, u32 regnum, u16 val)

Convenience function for setting bits in a PHY register

Parameters

struct phy_device *phydev

the phy_device struct

u32 regnum

register number to write

u16 val

bits to set

Description

The caller must have taken the MDIO bus lock.

int __phy_clear_bits(struct phy_device *phydev, u32 regnum, u16 val)

Convenience function for clearing bits in a PHY register

Parameters

struct phy_device *phydev

the phy_device struct

u32 regnum

register number to write

u16 val

bits to clear

Description

The caller must have taken the MDIO bus lock.

int phy_set_bits(struct phy_device *phydev, u32 regnum, u16 val)

Convenience function for setting bits in a PHY register

Parameters

struct phy_device *phydev

the phy_device struct

u32 regnum

register number to write

u16 val

bits to set

int phy_clear_bits(struct phy_device *phydev, u32 regnum, u16 val)

Convenience function for clearing bits in a PHY register

Parameters

struct phy_device *phydev

the phy_device struct

u32 regnum

register number to write

u16 val

bits to clear

int __phy_set_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val)

Convenience function for setting bits in a register on MMD

Parameters

struct phy_device *phydev

the phy_device struct

int devad

the MMD containing register to modify

u32 regnum

register number to modify

u16 val

bits to set

Description

The caller must have taken the MDIO bus lock.

int __phy_clear_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val)

Convenience function for clearing bits in a register on MMD

Parameters

struct phy_device *phydev

the phy_device struct

int devad

the MMD containing register to modify

u32 regnum

register number to modify

u16 val

bits to clear

Description

The caller must have taken the MDIO bus lock.

int phy_set_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val)

Convenience function for setting bits in a register on MMD

Parameters

struct phy_device *phydev

the phy_device struct

int devad

the MMD containing register to modify

u32 regnum

register number to modify

u16 val

bits to set

int phy_clear_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val)

Convenience function for clearing bits in a register on MMD

Parameters

struct phy_device *phydev

the phy_device struct

int devad

the MMD containing register to modify

u32 regnum

register number to modify

u16 val

bits to clear

bool phy_interrupt_is_valid(struct phy_device *phydev)

Convenience function for testing a given PHY irq

Parameters

struct phy_device *phydev

the phy_device struct

NOTE

must be kept in sync with addition/removal of PHY_POLL and PHY_MAC_INTERRUPT

bool phy_polling_mode(struct phy_device *phydev)

Convenience function for testing whether polling is used to detect PHY status changes

Parameters

struct phy_device *phydev

the phy_device struct

bool phy_has_hwtstamp(struct phy_device *phydev)

Tests whether a PHY supports time stamp configuration.

Parameters

struct phy_device *phydev

the phy_device struct

bool phy_has_rxtstamp(struct phy_device *phydev)

Tests whether a PHY supports receive time stamping.

Parameters

struct phy_device *phydev

the phy_device struct

bool phy_has_tsinfo(struct phy_device *phydev)

Tests whether a PHY reports time stamping and/or PTP hardware clock capabilities.

Parameters

struct phy_device *phydev

the phy_device struct

bool phy_has_txtstamp(struct phy_device *phydev)

Tests whether a PHY supports transmit time stamping.

Parameters

struct phy_device *phydev

the phy_device struct

bool phy_is_internal(struct phy_device *phydev)

Convenience function for testing if a PHY is internal

Parameters

struct phy_device *phydev

the phy_device struct

bool phy_on_sfp(struct phy_device *phydev)

Convenience function for testing if a PHY is on an SFP module

Parameters

struct phy_device *phydev

the phy_device struct

bool phy_interface_mode_is_rgmii(phy_interface_t mode)

Convenience function for testing if a PHY interface mode is RGMII (all variants)

Parameters

phy_interface_t mode

the phy_interface_t enum

bool phy_interface_mode_is_8023z(phy_interface_t mode)

does the PHY interface mode use 802.3z negotiation

Parameters

phy_interface_t mode

one of enum phy_interface_t

Description

Returns true if the PHY interface mode uses the 16-bit negotiation word as defined in 802.3z. (See 802.3-2015 37.2.1 Config_Reg encoding)

bool phy_interface_is_rgmii(struct phy_device *phydev)

Convenience function for testing if a PHY interface is RGMII (all variants)

Parameters

struct phy_device *phydev

the phy_device struct

bool phy_is_pseudo_fixed_link(struct phy_device *phydev)

Convenience function for testing if this PHY is the CPU port facing side of an Ethernet switch, or similar.

Parameters

struct phy_device *phydev

the phy_device struct

phy_module_driver

phy_module_driver (__phy_drivers, __count)

Helper macro for registering PHY drivers

Parameters

__phy_drivers

array of PHY drivers to register

__count

Number of members in the array

Description

Helper macro for PHY drivers which do not do anything special in module init/exit. Each module may only use this macro once, and calling it replaces module_init() and module_exit().

int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask, int (*run)(struct phy_device*))

creates a new phy_fixup and adds it to the list

Parameters

const char *bus_id

A string which matches phydev->mdio.dev.bus_id (or PHY_ANY_ID)

u32 phy_uid

Used to match against phydev->phy_id (the UID of the PHY). It can also be PHY_ANY_UID

u32 phy_uid_mask

Applied to phydev->phy_id and fixup->phy_uid before comparison

int (*run)(struct phy_device *)

The actual code to be run when a matching PHY is found

int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask)

remove a phy_fixup from the list

Parameters

const char *bus_id

A string that matches fixup->bus_id (or PHY_ANY_ID) in phy_fixup_list

u32 phy_uid

A PHY ID that matches fixup->phy_id (or PHY_ANY_UID) in phy_fixup_list

u32 phy_uid_mask

Applied to phy_uid and fixup->phy_uid before comparison

struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45)

reads the specified PHY device and returns its phy_device struct

Parameters

struct mii_bus *bus

the target MII bus

int addr

PHY address on the MII bus

bool is_c45

If true the PHY uses the 802.3 clause 45 protocol

Description

Probe for a PHY at addr on bus.

When probing for a clause 22 PHY, then read the ID registers. If we find a valid ID, allocate and return a struct phy_device.

When probing for a clause 45 PHY, read the “devices in package” registers. If the “devices in package” appears valid, read the ID registers for each MMD, allocate and return a struct phy_device.

Returns an allocated struct phy_device on success, -ENODEV if there is no PHY present, or -EIO on bus access error.

int phy_device_register(struct phy_device *phydev)

Register the phy device on the MDIO bus

Parameters

struct phy_device *phydev

phy_device structure to be added to the MDIO bus

void phy_device_remove(struct phy_device *phydev)

Remove a previously registered phy device from the MDIO bus

Parameters

struct phy_device *phydev

phy_device structure to remove

Description

This doesn’t free the phy_device itself, it merely reverses the effects of phy_device_register(). Use phy_device_free() to free the device after calling this function.

int phy_get_c45_ids(struct phy_device *phydev)

Read 802.3-c45 IDs for phy device.

Parameters

struct phy_device *phydev

phy_device structure to read 802.3-c45 IDs

Description

Returns zero on success, -EIO on bus access error, or -ENODEV if the “devices in package” is invalid.

struct phy_device *phy_find_first(struct mii_bus *bus)

finds the first PHY device on the bus

Parameters

struct mii_bus *bus

the target MII bus

int phy_connect_direct(struct net_device *dev, struct phy_device *phydev, void (*handler)(struct net_device*), phy_interface_t interface)

connect an ethernet device to a specific phy_device

Parameters

struct net_device *dev

the network device to connect

struct phy_device *phydev

the pointer to the phy device

void (*handler)(struct net_device *)

callback function for state change notifications

phy_interface_t interface

PHY device’s interface

struct phy_device *phy_connect(struct net_device *dev, const char *bus_id, void (*handler)(struct net_device*), phy_interface_t interface)

connect an ethernet device to a PHY device

Parameters

struct net_device *dev

the network device to connect

const char *bus_id

the id string of the PHY device to connect

void (*handler)(struct net_device *)

callback function for state change notifications

phy_interface_t interface

PHY device’s interface

Description

Convenience function for connecting ethernet devices to PHY devices. The default behavior is for the PHY infrastructure to handle everything, and only notify the connected driver when the link status changes. If you don’t want, or can’t use the provided functionality, you may choose to call only the subset of functions which provide the desired functionality.

void phy_disconnect(struct phy_device *phydev)

disable interrupts, stop state machine, and detach a PHY device

Parameters

struct phy_device *phydev

target phy_device struct

void phy_sfp_attach(void *upstream, struct sfp_bus *bus)

attach the SFP bus to the PHY upstream network device

Parameters

void *upstream

pointer to the phy device

struct sfp_bus *bus

sfp bus representing cage being attached

Description

This is used to fill in the sfp_upstream_ops .attach member.

void phy_sfp_detach(void *upstream, struct sfp_bus *bus)

detach the SFP bus from the PHY upstream network device

Parameters

void *upstream

pointer to the phy device

struct sfp_bus *bus

sfp bus representing cage being attached

Description

This is used to fill in the sfp_upstream_ops .detach member.

int phy_sfp_probe(struct phy_device *phydev, const struct sfp_upstream_ops *ops)

probe for a SFP cage attached to this PHY device

Parameters

struct phy_device *phydev

Pointer to phy_device

const struct sfp_upstream_ops *ops

SFP’s upstream operations

int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface)

attach a network device to a given PHY device pointer

Parameters

struct net_device *dev

network device to attach

struct phy_device *phydev

Pointer to phy_device to attach

u32 flags

PHY device’s dev_flags

phy_interface_t interface

PHY device’s interface

Description

Called by drivers to attach to a particular PHY device. The phy_device is found, and properly hooked up to the phy_driver. If no driver is attached, then a generic driver is used. The phy_device is given a ptr to the attaching device, and given a callback for link status change. The phy_device is returned to the attaching driver. This function takes a reference on the phy device.

struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, phy_interface_t interface)

attach a network device to a particular PHY device

Parameters

struct net_device *dev

network device to attach

const char *bus_id

Bus ID of PHY device to attach

phy_interface_t interface

PHY device’s interface

Description

Same as phy_attach_direct() except that a PHY bus_id string is passed instead of a pointer to a struct phy_device.

int phy_package_join(struct phy_device *phydev, int addr, size_t priv_size)

join a common PHY group

Parameters

struct phy_device *phydev

target phy_device struct

int addr

cookie and PHY address for global register access

size_t priv_size

if non-zero allocate this amount of bytes for private data

Description

This joins a PHY group and provides a shared storage for all phydevs in this group. This is intended to be used for packages which contain more than one PHY, for example a quad PHY transceiver.

The addr parameter serves as a cookie which has to have the same value for all members of one group and as a PHY address to access generic registers of a PHY package. Usually, one of the PHY addresses of the different PHYs in the package provides access to these global registers. The address which is given here, will be used in the phy_package_read() and phy_package_write() convenience functions. If your PHY doesn’t have global registers you can just pick any of the PHY addresses.

This will set the shared pointer of the phydev to the shared storage. If this is the first call for this cookie, the shared storage will be allocated. If priv_size is non-zero, the given amount of bytes are allocated for the priv member.

Returns < 0 on error, 0 on success. Esp. calling phy_package_join() with the same cookie but a different priv_size is an error.

void phy_package_leave(struct phy_device *phydev)

leave a common PHY group

Parameters

struct phy_device *phydev

target phy_device struct

Description

This leaves a PHY group created by phy_package_join(). If this phydev was the last user of the shared data between the group, this data is freed. Resets the phydev->shared pointer to NULL.

int devm_phy_package_join(struct device *dev, struct phy_device *phydev, int addr, size_t priv_size)

resource managed phy_package_join()

Parameters

struct device *dev

device that is registering this PHY package

struct phy_device *phydev

target phy_device struct

int addr

cookie and PHY address for global register access

size_t priv_size

if non-zero allocate this amount of bytes for private data

Description

Managed phy_package_join(). For shared storage fetched by this function, phy_package_leave() is automatically called on driver detach. See phy_package_join() for more information.

void phy_detach(struct phy_device *phydev)

detach a PHY device from its network device

Parameters

struct phy_device *phydev

target phy_device struct

Description

This detaches the phy device from its network device and the phy driver, and drops the reference count taken in phy_attach_direct().

int phy_reset_after_clk_enable(struct phy_device *phydev)

perform a PHY reset if needed

Parameters

struct phy_device *phydev

target phy_device struct

Description

Some PHYs are known to need a reset after their refclk was enabled. This function evaluates the flags and performs the reset if it’s needed. Returns < 0 on error, 0 if the phy wasn’t reset and 1 if the phy was reset.

int genphy_config_eee_advert(struct phy_device *phydev)

disable unwanted eee mode advertisement

Parameters

struct phy_device *phydev

target phy_device struct

Description

Writes MDIO_AN_EEE_ADV after disabling unsupported energy efficient ethernet modes. Returns 0 if the PHY’s advertisement hasn’t changed, and 1 if it has changed.

int genphy_setup_forced(struct phy_device *phydev)

configures/forces speed/duplex from phydev

Parameters

struct phy_device *phydev

target phy_device struct

Description

Configures MII_BMCR to force speed/duplex to the values in phydev. Assumes that the values are valid. Please see phy_sanitize_settings().

int genphy_restart_aneg(struct phy_device *phydev)

Enable and Restart Autonegotiation

Parameters

struct phy_device *phydev

target phy_device struct

int genphy_check_and_restart_aneg(struct phy_device *phydev, bool restart)

Enable and restart auto-negotiation

Parameters

struct phy_device *phydev

target phy_device struct

bool restart

whether aneg restart is requested

Description

Check, and restart auto-negotiation if needed.

int __genphy_config_aneg(struct phy_device *phydev, bool changed)

restart auto-negotiation or write BMCR

Parameters

struct phy_device *phydev

target phy_device struct

bool changed

whether autoneg is requested

Description

If auto-negotiation is enabled, we configure the advertising, and then restart auto-negotiation. If it is not enabled, then we write the BMCR.

int genphy_c37_config_aneg(struct phy_device *phydev)

restart auto-negotiation or write BMCR

Parameters

struct phy_device *phydev

target phy_device struct

Description

If auto-negotiation is enabled, we configure the advertising, and then restart auto-negotiation. If it is not enabled, then we write the BMCR. This function is intended for use with Clause 37 1000Base-X mode.

int genphy_aneg_done(struct phy_device *phydev)

return auto-negotiation status

Parameters

struct phy_device *phydev

target phy_device struct

Description

Reads the status register and returns 0 either if auto-negotiation is incomplete, or if there was an error. Returns BMSR_ANEGCOMPLETE if auto-negotiation is done.

int genphy_update_link(struct phy_device *phydev)

update link status in phydev

Parameters

struct phy_device *phydev

target phy_device struct

Description

Update the value in phydev->link to reflect the current link value. In order to do this, we need to read the status register twice, keeping the second value.

int genphy_read_status_fixed(struct phy_device *phydev)

read the link parameters for !aneg mode

Parameters

struct phy_device *phydev

target phy_device struct

Description

Read the current duplex and speed state for a PHY operating with autonegotiation disabled.

int genphy_read_status(struct phy_device *phydev)

check the link status and update current link state

Parameters

struct phy_device *phydev

target phy_device struct

Description

Check the link, then figure out the current state by comparing what we advertise with what the link partner advertises. Start by checking the gigabit possibilities, then move on to 10/100.

int genphy_c37_read_status(struct phy_device *phydev)

check the link status and update current link state

Parameters

struct phy_device *phydev

target phy_device struct

Description

Check the link, then figure out the current state by comparing what we advertise with what the link partner advertises. This function is for Clause 37 1000Base-X mode.

int genphy_soft_reset(struct phy_device *phydev)

software reset the PHY via BMCR_RESET bit

Parameters

struct phy_device *phydev

target phy_device struct

Description

Perform a software PHY reset using the standard BMCR_RESET bit and poll for the reset bit to be cleared.

Return

0 on success, < 0 on failure

int genphy_read_abilities(struct phy_device *phydev)

read PHY abilities from Clause 22 registers

Parameters

struct phy_device *phydev

target phy_device struct

Description

Reads the PHY’s abilities and populates phydev->supported accordingly.

Return

0 on success, < 0 on failure

void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode)

Remove a supported link mode

Parameters

struct phy_device *phydev

phy_device structure to remove link mode from

u32 link_mode

Link mode to be removed

Description

Some MACs don’t support all link modes which the PHY does. e.g. a 1G MAC often does not support 1000Half. Add a helper to remove a link mode.

void phy_advertise_supported(struct phy_device *phydev)

Advertise all supported modes

Parameters

struct phy_device *phydev

target phy_device struct

Description

Called to advertise all supported modes, doesn’t touch pause mode advertising.

void phy_support_sym_pause(struct phy_device *phydev)

Enable support of symmetrical pause

Parameters

struct phy_device *phydev

target phy_device struct

Description

Called by the MAC to indicate it supports symmetrical Pause, but not asym pause.

void phy_support_asym_pause(struct phy_device *phydev)

Enable support of asym pause

Parameters

struct phy_device *phydev

target phy_device struct

Description

Called by the MAC to indicate it supports Asym Pause.

void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx, bool autoneg)

Configure symmetric Pause

Parameters

struct phy_device *phydev

target phy_device struct

bool rx

Receiver Pause is supported

bool tx

Transmit Pause is supported

bool autoneg

Auto neg should be used

Description

Configure advertised Pause support depending on if receiver pause and pause auto neg is supported. Generally called from the set_pauseparam .ndo.

void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx)

Configure Pause and Asym Pause

Parameters

struct phy_device *phydev

target phy_device struct

bool rx

Receiver Pause is supported

bool tx

Transmit Pause is supported

Description

Configure advertised Pause support depending on if transmit and receiver pause is supported. If there has been a change in advertising, trigger a new autoneg. Generally called from the set_pauseparam .ndo.

bool phy_validate_pause(struct phy_device *phydev, struct ethtool_pauseparam *pp)

Test if the PHY/MAC support the pause configuration

Parameters

struct phy_device *phydev

phy_device struct

struct ethtool_pauseparam *pp

requested pause configuration

Description

Test if the PHY/MAC combination supports the Pause configuration the user is requesting. Returns true if it is supported, false otherwise.

void phy_get_pause(struct phy_device *phydev, bool *tx_pause, bool *rx_pause)

resolve negotiated pause modes

Parameters

struct phy_device *phydev

phy_device struct

bool *tx_pause

pointer to bool to indicate whether transmit pause should be enabled.

bool *rx_pause

pointer to bool to indicate whether receive pause should be enabled.

Description

Resolve and return the flow control modes according to the negotiation result. This includes checking that we are operating in full duplex mode. See linkmode_resolve_pause() for further details.

s32 phy_get_internal_delay(struct phy_device *phydev, struct device *dev, const int *delay_values, int size, bool is_rx)

returns the index of the internal delay

Parameters

struct phy_device *phydev

phy_device struct

struct device *dev

pointer to the device’s device struct

const int *delay_values

array of delays the PHY supports

int size

the size of the delay array

bool is_rx

boolean to indicate to get the rx internal delay

Description

Returns the index within the array of internal delay passed in. If the device property is not present, then the interface type is checked: if the interface defines use of internal delay, then 1 is returned; otherwise 0 is returned. The array must be in ascending order. If the PHY does not have an ascending order array, then size = 0 and the value of the delay property is returned. Returns -EINVAL if the delay is invalid or cannot be found.

struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode)

Given a fwnode, find the mdio_device

Parameters

struct fwnode_handle *fwnode

pointer to the mdio_device’s fwnode

Description

If successful, returns a pointer to the mdio_device with the embedded struct device refcount incremented by one, or NULL on failure. The caller should call put_device() on the mdio_device after its use.

struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode)

For provided phy_fwnode, find phy_device.

Parameters

struct fwnode_handle *phy_fwnode

Pointer to the phy’s fwnode.

Description

If successful, returns a pointer to the phy_device with the embedded struct device refcount incremented by one, or NULL on failure.

struct phy_device *device_phy_find_device(struct device *dev)

For the given device, get the phy_device

Parameters

struct device *dev

Pointer to the given device

Description

Refer to the return conditions of fwnode_phy_find_device().

struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode)

Get the phy_node using the named reference.

Parameters

struct fwnode_handle *fwnode

Pointer to fwnode from which phy_node has to be obtained.

Description

Refer to the return conditions of fwnode_find_reference(). For ACPI, only “phy-handle” is supported. Legacy DT properties “phy” and “phy-device” are not supported in ACPI. DT supports all three named references to the phy node.

int phy_driver_register(struct phy_driver *new_driver, struct module *owner)

register a phy_driver with the PHY layer

Parameters

struct phy_driver *new_driver

new phy_driver to register

struct module *owner

module owning this PHY

int get_phy_c45_ids(struct mii_bus *bus, int addr, struct phy_c45_device_ids *c45_ids)

reads the specified addr for its 802.3-c45 IDs.

Parameters

struct mii_bus *bus

the target MII bus

int addr

PHY address on the MII bus

struct phy_c45_device_ids *c45_ids

where to store the c45 ID information.

Description

Read the PHY “devices in package”. If this appears to be valid, read the PHY identifiers for each device. Return the “devices in package” and identifiers in c45_ids.

Returns zero on success, -EIO on bus access error, or -ENODEV if the “devices in package” is invalid.

int get_phy_c22_id(struct mii_bus *bus, int addr, u32 *phy_id)

reads the specified addr for its clause 22 ID.

Parameters

struct mii_bus *bus

the target MII bus

int addr

PHY address on the MII bus

u32 *phy_id

where to store the ID retrieved.

Description

Read the 802.3 clause 22 PHY ID from the PHY at addr on the bus, placing it in phy_id. Return zero on successful read and the ID is valid, -EIO on bus access error, or -ENODEV if no device responds or invalid ID.

void phy_prepare_link(struct phy_device *phydev, void (*handler)(struct net_device *))

prepares the PHY layer to monitor link status

Parameters

struct phy_device *phydev

target phy_device struct

void (*handler)(struct net_device *)

callback function for link status change notifications

Description

Tells the PHY infrastructure to handle the gory details on monitoring link status (whether through polling or an interrupt), and to call back to the connected device driver when the link status changes. If you want to monitor your own link state, don’t call this function.

int phy_poll_reset(struct phy_device *phydev)

Safely wait until a PHY reset has properly completed

Parameters

struct phy_device *phydev

The PHY device to poll

Description

According to IEEE 802.3, Section 2, Subsection 22.2.4.1.1, as published in 2008, a PHY reset may take up to 0.5 seconds. The MII BMCR register must be polled until the BMCR_RESET bit clears.

Furthermore, any attempts to write to PHY registers may have no effect or even generate MDIO bus errors until this is complete.

Some PHYs (such as the Marvell 88E1111) don’t entirely conform to the standard and do not fully reset after the BMCR_RESET bit is set, and may even REQUIRE a soft-reset to properly restart autonegotiation. In an effort to support such broken PHYs, this function is separate from the standard phy_init_hw() which will zero all the other bits in the BMCR and reapply all driver-specific and board-specific fixups.

int genphy_config_advert(struct phy_device *phydev)

sanitize and advertise auto-negotiation parameters

Parameters

struct phy_device *phydev

target phy_device struct

Description

Writes MII_ADVERTISE with the appropriate values, after sanitizing the values to make sure we only advertise what is supported. Returns < 0 on error, 0 if the PHY’s advertisement hasn’t changed, and > 0 if it has changed.

int genphy_c37_config_advert(struct phy_device *phydev)

sanitize and advertise auto-negotiation parameters

Parameters

struct phy_device *phydev

target phy_device struct

Description

Writes MII_ADVERTISE with the appropriate values, after sanitizing the values to make sure we only advertise what is supported. Returns < 0 on error, 0 if the PHY’s advertisement hasn’t changed, and > 0 if it has changed. This function is intended for Clause 37 1000Base-X mode.

int phy_probe(struct device *dev)

probe and init a PHY device

Parameters

struct device *dev

device to probe and init

Description

Take care of setting up the phy_device structure and set the state to READY (the driver’s init function should set it to STARTING if needed).

struct mii_bus *mdiobus_alloc_size(size_t size)

allocate a mii_bus structure

Parameters

size_t size

extra amount of memory to allocate for private storage. If non-zero, then bus->priv points to that memory.

Description

called by a bus driver to allocate an mii_bus structure to fill in.

struct mii_bus *mdio_find_bus(const char *mdio_name)

Given the name of a mdiobus, find the mii_bus.

Parameters

const char *mdio_name

The name of a mdiobus.

Description

Returns a reference to the mii_bus, or NULL if none found. The embedded struct device will have its reference count incremented, and this reference must be dropped with put_device() once the bus is finished with.

struct mii_bus *of_mdio_find_bus(struct device_node *mdio_bus_np)

Given an mii_bus node, find the mii_bus.

Parameters

struct device_node *mdio_bus_np

Pointer to the device node of the mii_bus.

Description

Returns a reference to the mii_bus, or NULL if none found. The embedded struct device will have its reference count incremented, and this must be put once the bus is finished with.

Because the association of a device_node and mii_bus is made via of_mdiobus_register(), the mii_bus cannot be found before it is registered with of_mdiobus_register().

int __mdiobus_register(struct mii_bus *bus, struct module *owner)

bring up all the PHYs on a given bus and attach them to the bus

Parameters

struct mii_bus *bus

target mii_bus

struct module *owner

module containing bus accessor functions

Description

Called by a bus driver to bring up all the PHYs on a given bus, and attach them to the bus. Drivers should use mdiobus_register() rather than __mdiobus_register() unless they need to pass a specific owner module. MDIO devices which are not PHYs will not be brought up by this function. They are expected to be explicitly listed in DT and instantiated by of_mdiobus_register().

Returns 0 on success or < 0 on error.

void mdiobus_free(struct mii_bus *bus)

free a struct mii_bus

Parameters

struct mii_bus *bus

mii_bus to free

Description

This function releases the reference to the underlying device object in the mii_bus. If this is the last reference, the mii_bus will be freed.

struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr)

scan a bus for MDIO devices.

Parameters

struct mii_bus *bus

mii_bus to scan

int addr

address on bus to scan

Description

This function scans the MDIO bus, looking for devices which can be identified using a vendor/product ID in registers 2 and 3. Not all MDIO devices have such registers, but PHY devices typically do. Hence this function assumes anything found is a PHY, or can be treated as a PHY. Other MDIO devices, such as switches, will probably not be found during the scan.

int __mdiobus_read(struct mii_bus *bus, int addr, u32 regnum)

Unlocked version of the mdiobus_read function

Parameters

struct mii_bus *bus

the mii_bus struct

int addr

the phy address

u32 regnum

register number to read

Description

Read a MDIO bus register. Caller must hold the mdio bus lock.

NOTE

MUST NOT be called from interrupt context.

int __mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val)

Unlocked version of the mdiobus_write function

Parameters

struct mii_bus *bus

the mii_bus struct

int addr

the phy address

u32 regnum

register number to write

u16 val

value to write to regnum

Description

Write a MDIO bus register. Caller must hold the mdio bus lock.

NOTE

MUST NOT be called from interrupt context.

int __mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum, u16 mask, u16 set)

Unlocked version of the mdiobus_modify function

Parameters

struct mii_bus *bus

the mii_bus struct

int addr

the phy address

u32 regnum

register number to modify

u16 mask

bit mask of bits to clear

u16 set

bit mask of bits to set

Description

Read, modify, and if any change, write the register value back to the device. Any error returns a negative number.

NOTE

MUST NOT be called from interrupt context.

int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum)

Nested version of the mdiobus_read function

Parameters

struct mii_bus *bus

the mii_bus struct

int addr

the phy address

u32 regnum

register number to read

Description

In case of nested MDIO bus access avoid lockdep false positives by using mutex_lock_nested().

NOTE

MUST NOT be called from interrupt context, because the bus read/write functions may wait for an interrupt to conclude the operation.

int mdiobus_read(struct mii_bus *bus, int addr, u32 regnum)

Convenience function for reading a given MII mgmt register

Parameters

struct mii_bus *bus

the mii_bus struct

int addr

the phy address

u32 regnum

register number to read

NOTE

MUST NOT be called from interrupt context, because the bus read/write functions may wait for an interrupt to conclude the operation.

int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val)

Nested version of the mdiobus_write function

Parameters

struct mii_bus *bus

the mii_bus struct

int addr

the phy address

u32 regnum

register number to write

u16 val

value to write to regnum

Description

In case of nested MDIO bus access avoid lockdep false positives by using mutex_lock_nested().

NOTE

MUST NOT be called from interrupt context, because the bus read/write functions may wait for an interrupt to conclude the operation.

int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val)

Convenience function for writing a given MII mgmt register

Parameters

struct mii_bus *bus

the mii_bus struct

int addr

the phy address

u32 regnum

register number to write

u16 val

value to write to regnum

NOTE

MUST NOT be called from interrupt context, because the bus read/write functions may wait for an interrupt to conclude the operation.

int mdiobus_modify(struct mii_bus *bus, int addr, u32 regnum, u16 mask, u16 set)

Convenience function for modifying a given mdio device register

Parameters

struct mii_bus *bus

the mii_bus struct

int addr

the phy address

u32 regnum

register number to modify

u16 mask

bit mask of bits to clear

u16 set

bit mask of bits to set

int mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum, u16 mask, u16 set)

Convenience function for modifying a given mdio device register and returning if it changed

Parameters

struct mii_bus *bus

the mii_bus struct

int addr

the phy address

u32 regnum

register number to modify

u16 mask

bit mask of bits to clear

u16 set

bit mask of bits to set

void mdiobus_release(struct device *d)

mii_bus device release callback

Parameters

struct device *d

the target struct device that contains the mii_bus

Description

called when the last reference to an mii_bus is dropped, to free the underlying memory.

int mdiobus_create_device(struct mii_bus *bus, struct mdio_board_info *bi)

create a full MDIO device given a mdio_board_info structure

Parameters

struct mii_bus *bus

MDIO bus to create the devices on

struct mdio_board_info *bi

mdio_board_info structure describing the devices

Description

Returns 0 on success or < 0 on error.

int mdio_bus_match(struct device *dev, struct device_driver *drv)

determine if given MDIO driver supports the given MDIO device

Parameters

struct device *dev

target MDIO device

struct device_driver *drv

given MDIO driver

Description

Given a MDIO device, and a MDIO driver, return 1 if the driver supports the device. Otherwise, return 0. This may require calling the device’s own match function, since different classes of MDIO devices have different match criteria.

SFP support

struct sfp_bus

internal representation of a sfp bus

Definition

struct sfp_bus {
};

Members

struct sfp_eeprom_id

raw SFP module identification information

Definition

struct sfp_eeprom_id {
  struct sfp_eeprom_base base;
  struct sfp_eeprom_ext ext;
};

Members

base

base SFP module identification structure

ext

extended SFP module identification structure

Description

See the SFF-8472 specification and related documents for the definition of these structure members. This can be obtained from https://www.snia.org/technology-communities/sff/specifications

struct sfp_upstream_ops

upstream operations structure

Definition

struct sfp_upstream_ops {
  void (*attach)(void *priv, struct sfp_bus *bus);
  void (*detach)(void *priv, struct sfp_bus *bus);
  int (*module_insert)(void *priv, const struct sfp_eeprom_id *id);
  void (*module_remove)(void *priv);
  int (*module_start)(void *priv);
  void (*module_stop)(void *priv);
  void (*link_down)(void *priv);
  void (*link_up)(void *priv);
  int (*connect_phy)(void *priv, struct phy_device *);
  void (*disconnect_phy)(void *priv);
};

Members

attach

called when the sfp socket driver is bound to the upstream (mandatory).

detach

called when the sfp socket driver is unbound from the upstream (mandatory).

module_insert

called after a module has been detected to determine whether the module is supported for the upstream device.

module_remove

called after the module has been removed.

module_start

called after the PHY probe step

module_stop

called before the PHY is removed

link_down

called when the link is non-operational for whatever reason.

link_up

called when the link is operational.

connect_phy

called when an I2C accessible PHY has been detected on the module.

disconnect_phy

called when a module with an I2C accessible PHY has been removed.

int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support)

Parse the EEPROM base ID, setting the port type

Parameters

struct sfp_bus *bus

a pointer to the struct sfp_bus structure for the sfp module

const struct sfp_eeprom_id *id

a pointer to the module’s struct sfp_eeprom_id

unsigned long *support

optional pointer to an array of unsigned long for the ethtool support mask

Description

Parse the EEPROM identification given in id, and return one of PORT_TP, PORT_FIBRE or PORT_OTHER. If support is non-NULL, also set the ethtool ETHTOOL_LINK_MODE_xxx_BIT corresponding with the connector type.

If the port type is not known, returns PORT_OTHER.

bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id)

indicate whether the module may have a PHY

Parameters

struct sfp_bus *bus

a pointer to the struct sfp_bus structure for the sfp module

const struct sfp_eeprom_id *id

a pointer to the module’s struct sfp_eeprom_id

Description

Parse the EEPROM identification given in id, and return whether this module may have a PHY.

void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support)

Parse the eeprom id for supported link modes

Parameters

struct sfp_bus *bus

a pointer to the struct sfp_bus structure for the sfp module

const struct sfp_eeprom_id *id

a pointer to the module’s struct sfp_eeprom_id

unsigned long *support

pointer to an array of unsigned long for the ethtool support mask

Description

Parse the EEPROM identification information and derive the supported ethtool link modes for the module.

phy_interface_t sfp_select_interface(struct sfp_bus *bus, unsigned long *link_modes)

Select appropriate phy_interface_t mode

Parameters

struct sfp_bus *bus

a pointer to the struct sfp_bus structure for the sfp module

unsigned long *link_modes

ethtool link modes mask

Description

Derive the phy_interface_t mode for the SFP module from the link modes mask.

void sfp_bus_put(struct sfp_bus *bus)

put a reference on the struct sfp_bus

Parameters

struct sfp_bus *bus

the struct sfp_bus found via sfp_bus_find_fwnode()

Description

Put a reference on the struct sfp_bus and free the underlying structure if this was the last reference.

int sfp_get_module_info(struct sfp_bus *bus, struct ethtool_modinfo *modinfo)

Get the ethtool_modinfo for a SFP module

Parameters

struct sfp_bus *bus

a pointer to the struct sfp_bus structure for the sfp module

struct ethtool_modinfo *modinfo

a struct ethtool_modinfo

Description

Fill in the type and eeprom_len parameters in modinfo for a module on the sfp bus specified by bus.

Returns 0 on success or a negative errno number.

int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee, u8 *data)

Read the SFP module EEPROM

Parameters

struct sfp_bus *bus

a pointer to the struct sfp_bus structure for the sfp module

struct ethtool_eeprom *ee

a struct ethtool_eeprom

u8 *data

buffer to contain the EEPROM data (must be at least ee->len bytes)

Description

Read the EEPROM as specified by the supplied ee. See the documentation for struct ethtool_eeprom for the region to be read.

Returns 0 on success or a negative errno number.

int sfp_get_module_eeprom_by_page(struct sfp_bus *bus, const struct ethtool_module_eeprom *page, struct netlink_ext_ack *extack)

Read a page from the SFP module EEPROM

Parameters

struct sfp_bus *bus

a pointer to the struct sfp_bus structure for the sfp module

const struct ethtool_module_eeprom *page

a struct ethtool_module_eeprom

struct netlink_ext_ack *extack

extack for reporting problems

Description

Read an EEPROM page as specified by the supplied page. See the documentation for struct ethtool_module_eeprom for the page to be read.

Returns 0 on success or a negative errno number. More error information might be provided via extack.

void sfp_upstream_start(struct sfp_bus *bus)

Inform the SFP that the network device is up

Parameters

struct sfp_bus *bus

a pointer to the struct sfp_bus structure for the sfp module

Description

Inform the SFP socket that the network device is now up, so that the module can be enabled by allowing TX_DISABLE to be deasserted. This should be called from the network device driver’s struct net_device_ops ndo_open() method.

void sfp_upstream_stop(struct sfp_bus *bus)

Inform the SFP that the network device is down

Parameters

struct sfp_bus *bus

a pointer to the struct sfp_bus structure for the sfp module

Description

Inform the SFP socket that the network device is now down, so that the module can be disabled by asserting TX_DISABLE, disabling the laser in optical modules. This should be called from the network device driver’s struct net_device_ops ndo_stop() method.

struct sfp_bus *sfp_bus_find_fwnode(struct fwnode_handle *fwnode)

parse and locate the SFP bus from fwnode

Parameters

struct fwnode_handle *fwnode

firmware node for the parent device (MAC or PHY)

Description

Parse the parent device’s firmware node for a SFP bus, and locate the sfp_bus structure, incrementing its reference count. This must be put via sfp_bus_put() when done.

Return

  • on success, a pointer to the sfp_bus structure,

  • NULL if no SFP is specified,

  • on failure, an error pointer value:

      ◦ corresponding to the errors detailed for fwnode_property_get_reference_args().

      ◦ -ENOMEM if we failed to allocate the bus.

      ◦ an error from the upstream’s connect_phy() method.

int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, const struct sfp_upstream_ops *ops)

parse and register the neighbouring device

Parameters

struct sfp_bus *bus

the struct sfp_bus found via sfp_bus_find_fwnode()

void *upstream

the upstream private data

const struct sfp_upstream_ops *ops

the upstream’s struct sfp_upstream_ops

Description

Add upstream driver for the SFP bus, and if the bus is complete, register the SFP bus using sfp_register_upstream(). This takes a reference on the bus, so it is safe to put the bus after this call.

Return

  • 0 on success, or if no SFP is specified,

  • on failure, a negative errno number, for example an error from the upstream’s connect_phy() method.

void sfp_bus_del_upstream(struct sfp_bus *bus)

Delete a sfp bus

Parameters

struct sfp_bus *bus

a pointer to the struct sfp_bus structure for the sfp module

Description

Delete a previously registered upstream connection for the SFP module. bus should have been added by sfp_bus_add_upstream().