Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 28 Oct 2014 17:39:23 +0800
From:      James Pan <jiaming.pan@yahoo.com>
To:        Jason Bacon <jwbacon@tds.net>
Cc:        freebsd-infiniband@freebsd.org
Subject:   Re: Infiniband loopback fails to connect
Message-ID:  <C50A06D8-B6F3-4B8D-A567-D11F90B08F85@yahoo.com>
In-Reply-To: <544E4F8E.9000904@tds.net>
References:  <E93F1DC5-FEBE-4E52-BDEF-20641A593F64@yahoo.com> <544E4F8E.9000904@tds.net>

next in thread | previous in thread | raw e-mail | index | archive | help
Jason,

Thank you for your reply. The HCA I am using is a Mellanox MT26428 and my OS
is FreeBSD 10.0; the details are below.
Please let me know if you need more information. Thanks!

root@rd1:~ # ibstat=20
CA 'mlx4_0'
	CA type: MT26428
	Number of ports: 2
	Firmware version: 2.8.0
	Hardware version: b0
	Node GUID: 0x0002c9030007f6e2
	System image GUID: 0x0002c9030007f6e5
	Port 1:
		State: Down
		Physical state: Polling
		Rate: 10
		Base lid: 0
		LMC: 0
		SM lid: 0
		Capability mask: 0x02510868
		Port GUID: 0x0002c9030007f6e3
	Port 2:
		State: Down
		Physical state: Polling
		Rate: 10
		Base lid: 0
		LMC: 0
		SM lid: 0
		Capability mask: 0x02510868
		Port GUID: 0x0002c9030007f6e4
root@rd1:~ # uname -a
FreeBSD rd1 10.0-RELEASE FreeBSD 10.0-RELEASE #23 fba8f85(master)-dirty: =
Fri Oct 24 04:08:09 CST 2014     root@rd1:/usr/obj/usr/src/sys/GENERIC  =
amd64
root@rd1:~ #=20

Best regards,

James Pan

> 在 2014年10月27日，下午9:58，Jason Bacon <jwbacon@tds.net> 写道：
>=20
>=20
> Jim,
>=20
> Thanks much for your efforts on this.  I'm sure the Mellanox =
developers will look into it as they're actively working on an overhaul =
of the code right now.
>=20
> In the meantime, you might want to post more info, like what HCA =
you're using.
>=20
> Cheers,
>=20
>    Jason
>=20
> On 10/27/14 6:39 AM, James Pan via freebsd-infiniband wrote:
>> Hi,
>> I've configured InfiniBand on two FreeBSD machines (rd1 and rd2); the IB interface appears as ib0 on both machines.
>> I assigned 10.9.0.1 to ib0 on rd1 and 10.9.0.2 to ib0 on rd2.
>> Then I downloaded some sample code from http://thegeekinthecorner.wordpress.com and modified it a little; the modified code is included below.
>>=20
>> The problem I ran into is:
>> If I run the server on rd2 and then run the client from rd1, the program works as expected.
>> If I run both the server and the client on the same host, the client fails to connect to the server:
>>=20
>> root@rd2:~/the-geek-in-the-corner/01_basic-client-server # ./server
>> listening on port 21277.
>> root@rd2:~/the-geek-in-the-corner/01_basic-client-server # ./client =
10.9.0.2 21277
>> event 1, status -60
>> on_event: unknown event.
>>=20
>> It looks like rdma_resolve_addr() has failed.
>>=20
>> I did some debugging on the driver and the cause seems to be:
>> In the InfiniBand driver, addr_resolve() (in /usr/src/sys/ofed/drivers/infiniband/core/addr.c)
>> depends on arpresolve() to resolve the address, but unfortunately the ifp passed to
>> arpresolve() is lo0, because the ifp is taken from the routing table, and the routing table on rd2 is:
>>=20
>> root@rd2:~/the-geek-in-the-corner/01_basic-client-server # netstat =
-rn
>> Routing tables
>>=20
>> Internet:
>> Destination        Gateway            Flags    Refs      Use  Netif =
Expire
>> default            192.168.1.1        UGS         0        0    ix0
>> 10.9.0.0/24        link#5             U           0        1    ib0
>> 10.9.0.2           link#5             UHS         0        0    lo0 =
<-----------------
>> 127.0.0.1          link#4             UH          0        0    lo0
>>=20
>> As the destination IP address is 10.9.0.2, lo0 is found and passed to arpresolve().
>> lo0 doesn't have an address, so it fails.
>>=20
>> I made some changes to the driver to pass the correct ifp to addr_resolve(); this time addr_resolve() succeeded, but rdma_resolve_route() failed.
>>=20
>> Could anyone who has experience with InfiniBand programming help take a look?
>> Your help is much appreciated, thanks a lot!
>>=20
>>=20
>>=20
>> ------------------------------ the sample code =
--------------------------------
>>=20
>> root@rd2:~/the-geek-in-the-corner/01_basic-client-server # cat =
server.c
>> #include <stdio.h>
>> #include <stdlib.h>
>> #include <string.h>
>> #include <unistd.h>
>> #include <rdma/rdma_cma.h>
>>=20
>> #define TEST_NZ(x) do { if ( (x)) die("error: " #x " failed (returned =
non-zero)." ); } while (0)
>> #define TEST_Z(x)  do { if (!(x)) die("error: " #x " failed (returned =
zero/null)."); } while (0)
>>=20
>> const int BUFFER_SIZE =3D 1024;
>>=20
>> struct context {
>>   struct ibv_context *ctx;
>>   struct ibv_pd *pd;
>>   struct ibv_cq *cq;
>>   struct ibv_comp_channel *comp_channel;
>>=20
>>   pthread_t cq_poller_thread;
>> };
>>=20
>> struct connection {
>>   struct ibv_qp *qp;
>>=20
>>   struct ibv_mr *recv_mr;
>>   struct ibv_mr *send_mr;
>>=20
>>   char *recv_region;
>>   char *send_region;
>> };
>>=20
>> static void die(const char *reason);
>>=20
>> static void build_context(struct ibv_context *verbs);
>> static void build_qp_attr(struct ibv_qp_init_attr *qp_attr);
>> static void * poll_cq(void *);
>> static void post_receives(struct connection *conn);
>> static void register_memory(struct connection *conn);
>>=20
>> static void on_completion(struct ibv_wc *wc);
>> static int on_connect_request(struct rdma_cm_id *id);
>> static int on_connection(void *context);
>> static int on_disconnect(struct rdma_cm_id *id);
>> static int on_event(struct rdma_cm_event *event);
>>=20
>> static struct context *s_ctx =3D NULL;
>>=20
>> int main(int argc, char **argv)
>> {
>>   struct sockaddr_in addr;
>>   struct rdma_cm_event *event =3D NULL;
>>   struct rdma_cm_id *listener =3D NULL;
>>   struct rdma_event_channel *ec =3D NULL;
>>   uint16_t port =3D 0;
>>=20
>>   memset(&addr, 0, sizeof(addr));
>>   addr.sin_family =3D AF_INET;
>>   addr.sin_len =3D sizeof addr;
>>=20
>>   TEST_Z(ec =3D rdma_create_event_channel());
>>   TEST_NZ(rdma_create_id(ec, &listener, NULL, RDMA_PS_TCP));
>>   return 0;
>> }
>>=20
>> int on_disconnect(struct rdma_cm_id *id)
>> {
>>   struct connection *conn =3D (struct connection *)id->context;
>>=20
>>   printf("peer disconnected.\n");
>>=20
>>   rdma_destroy_qp(id);
>>=20
>>   ibv_dereg_mr(conn->send_mr);
>>   ibv_dereg_mr(conn->recv_mr);
>>=20
>>   free(conn->send_region);
>>   free(conn->recv_region);
>>=20
>>   free(conn);
>>=20
>>   rdma_destroy_id(id);
>>=20
>>   return 0;
>> }
>>=20
>> int on_event(struct rdma_cm_event *event)
>> {
>>   int r =3D 0;
>>=20
>>   if (event->event =3D=3D RDMA_CM_EVENT_CONNECT_REQUEST)
>>     r =3D on_connect_request(event->id);
>>   else if (event->event =3D=3D RDMA_CM_EVENT_ESTABLISHED)
>>     r =3D on_connection(event->id->context);
>>   else if (event->event =3D=3D RDMA_CM_EVENT_DISCONNECTED)
>>     r =3D on_disconnect(event->id);
>>   else
>>     die("on_event: unknown event.");
>>=20
>>   return r;
>> }
>>=20
>>=20
>> root@rd2:~/the-geek-in-the-corner/01_basic-client-server # cat =
client.c
>> #include <netdb.h>
>> #include <stdio.h>
>> #include <stdlib.h>
>> #include <string.h>
>> #include <unistd.h>
>> #include <rdma/rdma_cma.h>
>>=20
>> #define TEST_NZ(x) do { if ( (x)) die("error: " #x " failed (returned =
non-zero)." ); } while (0)
>> #define TEST_Z(x)  do { if (!(x)) die("error: " #x " failed (returned =
zero/null)."); } while (0)
>>=20
>> const int BUFFER_SIZE =3D 1024;
>> const int TIMEOUT_IN_MS =3D 500; /* ms */
>>=20
>> struct context {
>>   struct ibv_context *ctx;
>>   struct ibv_pd *pd;
>>   struct ibv_cq *cq;
>>   struct ibv_comp_channel *comp_channel;
>>=20
>>   pthread_t cq_poller_thread;
>> };
>>=20
>> struct connection {
>>   struct rdma_cm_id *id;
>>   struct ibv_qp *qp;
>>=20
>>   struct ibv_mr *recv_mr;
>>   struct ibv_mr *send_mr;
>>=20
>>   char *recv_region;
>>   char *send_region;
>>=20
>>   int num_completions;
>> };
>>=20
>> static void die(const char *reason);
>>=20
>> static void build_context(struct ibv_context *verbs);
>> static void build_qp_attr(struct ibv_qp_init_attr *qp_attr);
>> static void * poll_cq(void *);
>> static void post_receives(struct connection *conn);
>> static void register_memory(struct connection *conn);
>>=20
>> static int on_addr_resolved(struct rdma_cm_id *id);
>> static void on_completion(struct ibv_wc *wc);
>> static int on_connection(void *context);
>> static int on_disconnect(struct rdma_cm_id *id);
>> static int on_event(struct rdma_cm_event *event);
>> static int on_route_resolved(struct rdma_cm_id *id);
>>=20
>> static struct context *s_ctx =3D NULL;
>>=20
>> int main(int argc, char **argv)
>> {
>>   struct addrinfo *addr;
>>   struct rdma_cm_event *event =3D NULL;
>>   struct rdma_cm_id *conn=3D NULL;
>>   struct rdma_event_channel *ec =3D NULL;
>>=20
>>   if (argc !=3D 3)
>>     die("usage: client <server-address> <server-port>");
>>=20
>>   TEST_NZ(getaddrinfo(argv[1], argv[2], NULL, &addr));
>>=20
>>   TEST_Z(ec =3D rdma_create_event_channel());
>>   TEST_NZ(rdma_create_id(ec, &conn, NULL, RDMA_PS_TCP));
>>   TEST_NZ(rdma_resolve_addr(conn, NULL, addr->ai_addr, =
TIMEOUT_IN_MS));
>>=20
>>   freeaddrinfo(addr);
>>=20
>>   while (rdma_get_cm_event(ec, &event) =3D=3D 0) {
>>     struct rdma_cm_event event_copy;
>>=20
>>     memcpy(&event_copy, event, sizeof(*event));
>>     rdma_ack_cm_event(event);
>>=20
>>     if (on_event(&event_copy))
>>       break;
>>   }
>>   free(conn->recv_region);
>>=20
>>   free(conn);
>>=20
>>   rdma_destroy_id(id);
>>=20
>>   return 1; /* exit event loop */
>> }
>>=20
>> int on_event(struct rdma_cm_event *event)
>> {
>>   int r =3D 0;
>>=20
>>   if (event->event =3D=3D RDMA_CM_EVENT_ADDR_RESOLVED)
>>     r =3D on_addr_resolved(event->id);
>>   else if (event->event =3D=3D RDMA_CM_EVENT_ROUTE_RESOLVED)
>>     r =3D on_route_resolved(event->id);
>>   else if (event->event =3D=3D RDMA_CM_EVENT_ESTABLISHED)
>>     r =3D on_connection(event->id->context);
>>   else if (event->event =3D=3D RDMA_CM_EVENT_DISCONNECTED)
>>     r =3D on_disconnect(event->id);
>>   else {
>>     printf("event %d, status %d\n", event->event, event->status);
>>     die("on_event: unknown event.");
>>   }
>>=20
>>   return r;
>> }
>>=20
>> int on_route_resolved(struct rdma_cm_id *id)
>> {
>>   struct rdma_conn_param cm_params;
>>=20
>>   printf("route resolved.\n");
>>=20
>>   memset(&cm_params, 0, sizeof(cm_params));
>>   TEST_NZ(rdma_connect(id, &cm_params));
>>=20
>>   return 0;
>> }
>>=20
>>  _______________________________________________
>> freebsd-infiniband@freebsd.org mailing list
>> http://lists.freebsd.org/mailman/listinfo/freebsd-infiniband
>> To unsubscribe, send any mail to =
"freebsd-infiniband-unsubscribe@freebsd.org"
>=20
>=20
> --=20
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>  Jason W. Bacon
>  jwbacon@tds.net
>=20
>  Circumstances don't make a man:
>  They reveal him.
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>=20




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?C50A06D8-B6F3-4B8D-A567-D11F90B08F85>