view usr/src/uts/common/rpc/rpc_rdma.h @ 13988:81670e8b6dd9

3620 Corruption of the `xprt-ready' queue in svc_xprt_qdelete() Reviewed by: Boris Protopopov <Boris.Protopopov@nexenta.com> Reviewed by: Gordon Ross <gordon.ross@nexenta.com> Reviewed by: Jeffry Molanus <Jeffry.Molanus@nexenta.com> Approved by: Richard Lowe <richlowe@richlowe.net>
author Marcel Telka <Marcel.Telka@nexenta.com>
date Fri, 15 Mar 2013 16:26:41 -0400
parents e64e5d843075
children
line wrap: on
line source

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2007, The Ohio State University. All rights reserved.
 *
 * Portions of this source code is developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements to contributions from developors:
 *   Ranjit Noronha: noronha@cse.ohio-state.edu
 *   Lei Chai      : chail@cse.ohio-state.edu
 *   Weikuan Yu    : yuw@cse.ohio-state.edu
 *
 */

#ifndef	_RPC_RPC_RDMA_H
#define	_RPC_RPC_RDMA_H

#include <rpc/rpc.h>
#include <rpc/rpc_sztypes.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>

#ifdef __cplusplus
extern "C" {
#endif

#define	RPCRDMA_VERS	1	/* Version of the RPC over RDMA protocol */
#define	RDMATF_VERS	1	/* Version of the API used by RPC for RDMA */
#define	RDMATF_VERS_1	1	/* Current version of RDMATF */

/*
 * The size of an RPC call or reply message
 */
#define	RPC_MSG_SZ	1024

/*
 * RDMA chunk size
 */
#define	RDMA_MINCHUNK	1024

/*
 * Storage for a chunk list
 */
#define	RPC_CL_SZ  1024

/*
 * Chunk size
 */
#define	MINCHUNK  1024

/*
 * Size of receive buffer
 */
#define	RPC_BUF_SIZE	2048

#define	NOWAIT	0	/* don't wait for operation of complete */
#define	WAIT	1	/* wait and ensure that operation is complete */

/*
 * RDMA xdr buffer control and other control flags. Add new flags here,
 * set them in private structure for xdr over RDMA in xdr_rdma.c
 */
#define	XDR_RDMA_CHUNK			0x1
#define	XDR_RDMA_WLIST_REG		0x2
#define	XDR_RDMA_RLIST_REG		0x4

#define	LONG_REPLY_LEN	65536
#define	WCL_BUF_LEN	32768
#define	RCL_BUF_LEN	32768


#define	RDMA_BUFS_RQST	34	/* Num bufs requested by client */
#define	RDMA_BUFS_GRANT	32	/* Num bufs granted by server */

struct xdr_ops *xdrrdma_xops(void);

/*
 * Credit Control Structures.
 */
typedef enum rdma_cc_type {
	RDMA_CC_CLNT,	/* CONN is for a client */
	RDMA_CC_SRV	/* CONN is for a server */
} rdma_cc_type_t;

/*
 * Client side credit control data structure.
 */
typedef struct rdma_clnt_cred_ctrl {
	uint32_t	clnt_cc_granted_ops;
	uint32_t	clnt_cc_in_flight_ops;
	kcondvar_t	clnt_cc_cv;
} rdma_clnt_cred_ctrl_t;

/*
 * Server side credit control data structure.
 */
typedef struct rdma_srv_cred_ctrl {
	uint32_t	srv_cc_buffers_granted;
	uint32_t	srv_cc_cur_buffers_used;
	uint32_t	srv_cc_posted;
	uint32_t	srv_cc_max_buf_size;	/* to be determined by CCP */
	uint32_t	srv_cc_cur_buf_size;	/* to be determined by CCP */
} rdma_srv_cred_ctrl_t;

typedef enum {
    RPCCALL_WLIST,
    RPCCALL_WCHUNK,
    RPCCALL_NOWRITE
}rpccall_write_t;

typedef enum {
	CLIST_REG_SOURCE = 1,
	CLIST_REG_DST
} clist_dstsrc;

/*
 * Return codes from RDMA operations
 */
typedef enum {

	RDMA_SUCCESS = 0,	/* successful operation */

	RDMA_INVAL = 1,		/* invalid parameter */
	RDMA_TIMEDOUT = 2,	/* operation timed out */
	RDMA_INTR = 3,		/* operation interrupted */
	RDMA_NORESOURCE = 4,	/* insufficient resource */
	/*
	 * connection errors
	 */
	RDMA_REJECT = 5,	/* connection req rejected */
	RDMA_NOLISTENER = 6,	/* no listener on server */
	RDMA_UNREACHABLE = 7,	/* host unreachable */
	RDMA_CONNLOST = 8,	/* connection lost */

	RDMA_XPRTFAILED = 9,	/* RDMA transport failed */
	RDMA_PROTECTERR = 10,	/* memory protection error */
	RDMA_OVERRUN = 11,	/* transport overrun */
	RDMA_RECVQEMPTY = 12,	/* incoming pkt dropped, recv q empty */
	RDMA_PROTFAILED = 13,	/* RDMA protocol failed */
	RDMA_NOTSUPP = 14,	/* requested feature not supported */
	RDMA_REMOTERR = 15,	/* error at remote end */
	/*
	 * RDMATF errors
	 */
	RDMA_BADVERS = 16,	/* mismatch RDMATF versions */
	RDMA_REG_EXIST = 17,	/* RDMATF registration already exists */
	RDMA_HCA_ATTACH = 18,
	RDMA_HCA_DETACH = 19,

	/*
	 * fallback error
	 */
	RDMA_FAILED = 20	/* generic error */
} rdma_stat;

/*
 * Memory region context. This is an RDMA provider generated
 * handle for a registered arbitrary size contiguous virtual
 * memory. The RDMA Interface Adapter needs this for local or
 * remote memory access.
 *
 * The mrc_rmr field holds the remote memory region context
 * which is sent over-the-wire to provide the remote host
 * with RDMA access to the memory region.
 */
struct mrc {
	uint32_t	mrc_rmr;	/* Remote MR context, sent OTW */
	union {
		struct mr {
			uint32_t	lmr; 	/* Local MR context */
			uint64_t	linfo;	/* Local memory info */
		} mr;
	} lhdl;
};

#define	mrc_lmr		lhdl.mr.lmr
#define	mrc_linfo	lhdl.mr.linfo

/*
 * Memory management for the RDMA buffers
 */
/*
 * RDMA buffer types
 */
typedef enum {
	SEND_BUFFER,	/* buf for send msg */
	SEND_DESCRIPTOR, /* buf used for send msg descriptor in plugins only */
	RECV_BUFFER,	/* buf for recv msg */
	RECV_DESCRIPTOR, /* buf used for recv msg descriptor in plugins only */
	RDMA_LONG_BUFFER /* chunk buf used in RDMATF only and not in plugins */
} rdma_btype;

/*
 * RDMA buffer information
 */
typedef struct rdma_buf {
	rdma_btype	type;	/* buffer type */
	uint_t		len;	/* length of buffer */
	caddr_t		addr;	/* buffer address */
	struct mrc	handle;	/* buffer registration handle */
	caddr_t		rb_private;
} rdma_buf_t;


/*
 * The XDR offset value is used by the XDR
 * routine to identify the position in the
 * RPC message where the opaque object would
 * normally occur. Neither the data content
 * of the chunk, nor its size field are included
 * in the RPC message.  The XDR offset is calculated
 * as if the chunks were present.
 *
 * The remaining fields identify the chunk of data
 * on the sender.  The c_memhandle identifies a
 * registered RDMA memory region and the c_addr
 * and c_len fields identify the chunk within it.
 */
struct clist {
	uint32		c_xdroff;	/* XDR offset */
	uint32		c_len;		/* Length */
	clist_dstsrc	c_regtype;	/* type of registration */
	struct mrc	c_smemhandle;	/* src memory handle */
	uint64 		c_ssynchandle;	/* src sync handle */
	union {
		uint64		c_saddr;	/* src address */
		caddr_t 	c_saddr3;
	} w;
	struct mrc	c_dmemhandle;	/* dst memory handle */
	uint64		c_dsynchandle;	/* dst sync handle */
	union {
		uint64	c_daddr;	/* dst address */
		caddr_t	c_daddr3;
	} u;
	struct as	*c_adspc;	/* address space for saddr/daddr */
	rdma_buf_t	rb_longbuf;	/* used for long requests/replies */
	struct clist	*c_next;	/* Next chunk */
};

typedef struct clist clist;

/*
 * max 4M wlist xfer size
 * This is defined because the rfs3_tsize service requires
 * svc_req struct (which we don't have that in krecv).
 */
#define	MAX_SVC_XFER_SIZE (4*1024*1024)

enum rdma_proc {
	RDMA_MSG	= 0,	/* chunk list and RPC msg follow */
	RDMA_NOMSG	= 1,	/* only chunk list follows */
	RDMA_MSGP	= 2,	/* chunk list and RPC msg with padding follow */
	RDMA_DONE	= 3	/* signal completion of chunk transfer */
};

/*
 * Listener information for a service
 */
struct rdma_svc_data {
	queue_t		q;	/* queue_t to place incoming pkts */
	int		active;	/* If active, after registeration startup */
	rdma_stat	err_code;	/* Error code from plugin layer */
	int32_t		svcid;		/* RDMA based service identifier */
};

/*
 * Per RDMA plugin module information.
 * Will be populated by each plugin
 * module during its initialization.
 */
typedef struct rdma_mod {
	char 		*rdma_api;		/* "kvipl", "ibtf", etc */
	uint_t 		rdma_version;		/* RDMATF API version */
	int		rdma_count;		/* # of devices */
	struct rdmaops 	*rdma_ops;		/* rdma op vector for api */
} rdma_mod_t;

/*
 * Registry of RDMA plugins
 */
typedef struct rdma_registry {
	rdma_mod_t	*r_mod;		/* plugin mod info */
	uint32_t	r_mod_state;
	struct rdma_registry *r_next;	/* next registered RDMA plugin */
} rdma_registry_t;

/*
 * RDMA MODULE state flags (r_mod_state).
 */
#define	RDMA_MOD_ACTIVE		1
#define	RDMA_MOD_INACTIVE	0

/*
 * RDMA transport information
 */
typedef struct rdma_info {
	uint_t	addrlen;	/* address length */
	uint_t  mts;		/* max transfer size */
	uint_t  mtu;		/* native mtu size of unlerlying network */
} rdma_info_t;

typedef enum {
	C_IDLE		= 0x00000001,
	C_CONN_PEND	= 0x00000002,
	C_CONNECTED	= 0x00000004,
	C_ERROR_CONN	= 0x00000008,
	C_DISCONN_PEND	= 0x00000010,
	C_REMOTE_DOWN	= 0x00000020
} conn_c_state;

/* c_flags */
#define	C_CLOSE_NOTNEEDED	0x00000001	/* just free the channel */
#define	C_CLOSE_PENDING		0x00000002	/* a close in progress */

/*
 * RDMA Connection information
 */
typedef struct conn {
	rdma_mod_t	*c_rdmamod;	/* RDMA transport info for conn */
	char 		*c_netid;	/* tcp or tcp6 token */
	struct netbuf	c_raddr;	/* remote address */
	struct netbuf	c_laddr;	/* local address */
	struct netbuf	c_addrmask;	/* Address Mask */
	int		c_ref;		/* no. of clients of connection */
	struct conn	*c_next;	/* next in list of connections */
	struct conn	*c_prev;	/* prev in list of connections */
	caddr_t		c_private;	/* transport specific stuff */
	conn_c_state	c_state;	/* state of connection */
	int		c_flags;	/* flags for connection management */
	rdma_cc_type_t	c_cc_type;	/* client or server, for credit cntrl */
	union {
		rdma_clnt_cred_ctrl_t	c_clnt_cc;
		rdma_srv_cred_ctrl_t	c_srv_cc;
	} rdma_conn_cred_ctrl_u;
	kmutex_t	c_lock;		/* protect c_state and c_ref fields */
	kcondvar_t	c_cv;		/* to signal when pending is done */
	timeout_id_t	c_timeout;	/* timeout id for untimeout() */
	time_t		c_last_used;	/* last time any activity on the conn */
} CONN;


/*
 * Data transferred from plugin interrupt to svc_queuereq()
 */
typedef struct rdma_recv_data {
	CONN		*conn;
	int		status;
	rdma_buf_t	rpcmsg;
} rdma_recv_data_t;

/* structure used to pass information for READ over rdma write */
typedef enum {
	RCI_WRITE_UIO_CHUNK = 1,
	RCI_WRITE_ADDR_CHUNK = 2,
	RCI_REPLY_CHUNK = 3
} rci_type_t;

typedef struct {
	rci_type_t rci_type;
	union {
		struct uio *rci_uiop;
		caddr_t    rci_addr;
	} rci_a;
	uint32	rci_len;
	struct clist	**rci_clpp; /* point to write chunk list in readargs */
} rdma_chunkinfo_t;

typedef struct {
	uint_t rcil_len;
	uint_t rcil_len_alt;
} rdma_chunkinfo_lengths_t;

typedef struct {
	struct	clist	*rwci_wlist;
	CONN		*rwci_conn;
} rdma_wlist_conn_info_t;

/*
 * Operations vector for RDMA transports.
 */
typedef struct rdmaops {
	/* Network */
	rdma_stat	(*rdma_reachable)(int addr_type, struct netbuf *,
						void **handle);
	/* Connection */
	rdma_stat	(*rdma_get_conn)(struct netbuf *, struct netbuf *,
					int addr_type, void *, CONN **);
	rdma_stat	(*rdma_rel_conn)(CONN *);
	/* Server side listner start and stop routines */
	void		(*rdma_svc_listen)(struct rdma_svc_data *);
	void		(*rdma_svc_stop)(struct rdma_svc_data *);
	/* Memory */
	rdma_stat	(*rdma_regmem)(CONN *, caddr_t, caddr_t,
			    uint_t, struct mrc *);
	rdma_stat	(*rdma_deregmem)(CONN *, caddr_t, struct mrc);
	rdma_stat	(*rdma_regmemsync)(CONN *, caddr_t, caddr_t, uint_t,
				struct mrc *, void **, void *);
	rdma_stat	(*rdma_deregmemsync)(CONN *, caddr_t, struct mrc,
			    void *, void *);
	rdma_stat	(*rdma_syncmem)(CONN *, void *, caddr_t, int, int);
	/* Buffer */
	rdma_stat	(*rdma_buf_alloc)(CONN *, rdma_buf_t *);
	void		(*rdma_buf_free)(CONN *, rdma_buf_t *);
	/* Transfer */
	rdma_stat	(*rdma_send)(CONN *, clist *, uint32_t);
	rdma_stat	(*rdma_send_resp)(CONN *, clist *, uint32_t);
	rdma_stat	(*rdma_clnt_recvbuf)(CONN *, clist *, uint32_t);
	rdma_stat	(*rdma_clnt_recvbuf_remove)(CONN *, uint32_t);
	rdma_stat	(*rdma_svc_recvbuf)(CONN *, clist *);
	rdma_stat	(*rdma_recv)(CONN *, clist **, uint32_t);
	/* RDMA */
	rdma_stat	(*rdma_read)(CONN *, clist *, int);
	rdma_stat	(*rdma_write)(CONN *, clist *, int);
	/* INFO */
	rdma_stat	(*rdma_getinfo)(rdma_info_t *info);
} rdmaops_t;

typedef struct rdma_svc_wait {
	kmutex_t svc_lock;
	kcondvar_t svc_cv;
	rdma_stat svc_stat;
} rdma_svc_wait_t;

extern rdma_svc_wait_t rdma_wait;

/*
 * RDMA operations.
 */
#define	RDMA_REACHABLE(rdma_ops, addr_type, addr, handle)	\
	(*(rdma_ops)->rdma_reachable)(addr_type, addr, handle)

#define	RDMA_GET_CONN(rdma_ops, saddr, daddr, addr_type, handle, conn)	\
	(*(rdma_ops)->rdma_get_conn)(saddr, daddr, addr_type, handle, conn)

#define	RDMA_REL_CONN(conn)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_rel_conn)(conn)

#define	RDMA_REGMEM(conn, adsp, buff, len, handle)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, adsp,	\
		buff, len, handle)

#define	RDMA_DEREGMEM(conn, buff, handle)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_deregmem)(conn, buff, handle)

#define	RDMA_REGMEMSYNC(conn, adsp, buff, len, handle, synchandle, lrc)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, adsp, buff, \
	len, handle, synchandle, lrc)

#define	RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle, lrc)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff,	\
	handle, synchandle, lrc)

#define	RDMA_SYNCMEM(conn, handle, buff, len, direction)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_syncmem)(conn, handle, \
	    buff, len, direction)

#define	RDMA_BUF_ALLOC(conn, rbuf)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_buf_alloc)(conn, rbuf)

#define	RDMA_BUF_FREE(conn, rbuf)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_buf_free)(conn, rbuf)

#define	RDMA_SEND(conn, sendlist, xid)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_send)(conn, sendlist, xid)

#define	RDMA_SEND_RESP(conn, sendlist, xid)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_send_resp)(conn, sendlist, xid)

#define	RDMA_CLNT_RECVBUF(conn, cl, xid)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf)(conn, cl, xid)

#define	RDMA_CLNT_RECVBUF_REMOVE(conn, xid)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf_remove)(conn, xid)

#define	RDMA_SVC_RECVBUF(conn, cl)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_svc_recvbuf)(conn, cl)

#define	RDMA_RECV(conn, recvlist, xid)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_recv)(conn, recvlist, xid)

#define	RDMA_READ(conn, cl, wait)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_read)(conn, cl, wait)

#define	RDMA_WRITE(conn, cl, wait)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_write)(conn, cl, wait)

#define	RDMA_GETINFO(rdma_mod, info)	\
	(*(rdma_mod)->rdma_ops->rdma_getinfo)(info)

#ifdef _KERNEL
extern rdma_registry_t	*rdma_mod_head;
extern krwlock_t rdma_lock;		/* protects rdma_mod_head list */
extern int rdma_modloaded;		/* flag for loading RDMA plugins */
extern int rdma_dev_available;		/* rdma device is loaded or not */
extern kmutex_t rdma_modload_lock;	/* protects rdma_modloaded flag */
extern uint_t rdma_minchunk;
extern ldi_ident_t rpcmod_li; 		/* needed by layed driver framework */

/*
 * General RDMA routines
 */
extern struct clist *clist_alloc(void);
extern void clist_add(struct clist **, uint32_t, int,
	struct mrc *, caddr_t, struct mrc *, caddr_t);
extern void clist_free(struct clist *);
extern uint32_t clist_len(struct clist *);
extern void clist_zero_len(struct clist *);
extern rdma_stat clist_register(CONN *conn, struct clist *cl, clist_dstsrc);
extern rdma_stat clist_deregister(CONN *conn, struct clist *cl);
extern rdma_stat clist_syncmem(CONN *conn, struct clist *cl, clist_dstsrc);
extern rdma_stat rdma_clnt_postrecv(CONN *conn, uint32_t xid);
extern rdma_stat rdma_clnt_postrecv_remove(CONN *conn, uint32_t xid);
extern rdma_stat rdma_svc_postrecv(CONN *conn);
extern rdma_stat rdma_register_mod(rdma_mod_t *mod);
extern rdma_stat rdma_unregister_mod(rdma_mod_t *mod);
extern rdma_stat rdma_buf_alloc(CONN *, rdma_buf_t *);
extern void rdma_buf_free(CONN *, rdma_buf_t *);
extern int rdma_modload();
extern bool_t   rdma_get_wchunk(struct svc_req *, iovec_t *, struct clist *);
extern rdma_stat rdma_kwait(void);
extern int rdma_setup_read_chunks(struct clist *, uint32_t, int *);

/*
 * RDMA XDR
 */
extern void xdrrdma_create(XDR *, caddr_t, uint_t, int, struct clist *,
	enum xdr_op, CONN *);
extern void xdrrdma_destroy(XDR *);

extern uint_t xdrrdma_getpos(XDR *);
extern bool_t xdrrdma_setpos(XDR *, uint_t);
extern bool_t xdr_clist(XDR *, clist *);
extern bool_t xdr_do_clist(XDR *, clist **);
extern uint_t xdr_getbufsize(XDR *);
extern unsigned int xdrrdma_sizeof(xdrproc_t, void *, int, uint_t *, uint_t *);
extern unsigned int xdrrdma_authsize(AUTH *, struct cred *, int);

extern void xdrrdma_store_wlist(XDR *, struct clist *);
extern struct clist *xdrrdma_wclist(XDR *);
extern bool_t xdr_decode_reply_wchunk(XDR *, struct clist **);
extern bool_t xdr_decode_wlist(XDR *xdrs, struct clist **, bool_t *);
extern bool_t xdr_decode_wlist_svc(XDR *xdrs, struct clist **, bool_t *,
	uint32_t *, CONN *);
extern bool_t xdr_encode_rlist_svc(XDR *, clist *);
extern bool_t xdr_encode_wlist(XDR *, clist *);
extern bool_t xdr_encode_reply_wchunk(XDR *, struct clist *,
		uint32_t seg_array_len);
bool_t xdrrdma_getrdmablk(XDR *, struct clist **, uint_t *,
	CONN **conn, const uint_t);
bool_t xdrrdma_read_from_client(struct clist *, CONN **, uint_t);
bool_t xdrrdma_send_read_data(XDR *, uint_t, struct clist *);
bool_t xdrrdma_free_clist(CONN *, struct clist *);
#endif /* _KERNEL */

#ifdef __cplusplus
}
#endif

#endif	/* _RPC_RPC_RDMA_H */