Date: Tue, 28 Oct 2014 17:39:23 +0800 From: James Pan <jiaming.pan@yahoo.com> To: Jason Bacon <jwbacon@tds.net> Cc: freebsd-infiniband@freebsd.org Subject: Re: Infiniband loopback fails to connect Message-ID: <C50A06D8-B6F3-4B8D-A567-D11F90B08F85@yahoo.com> In-Reply-To: <544E4F8E.9000904@tds.net> References: <E93F1DC5-FEBE-4E52-BDEF-20641A593F64@yahoo.com> <544E4F8E.9000904@tds.net>
next in thread | previous in thread | raw e-mail | index | archive | help
Jason, Thank you for your reply. The HCA I am using is a Mellanox MT26428 and my OS is FreeBSD 10.0; the details are below. Please let me know if you need more information. Thanks! root@rd1:~ # ibstat CA 'mlx4_0' CA type: MT26428 Number of ports: 2 Firmware version: 2.8.0 Hardware version: b0 Node GUID: 0x0002c9030007f6e2 System image GUID: 0x0002c9030007f6e5 Port 1: State: Down Physical state: Polling Rate: 10 Base lid: 0 LMC: 0 SM lid: 0 Capability mask: 0x02510868 Port GUID: 0x0002c9030007f6e3 Port 2: State: Down Physical state: Polling Rate: 10 Base lid: 0 LMC: 0 SM lid: 0 Capability mask: 0x02510868 Port GUID: 0x0002c9030007f6e4 root@rd1:~ # uname -a FreeBSD rd1 10.0-RELEASE FreeBSD 10.0-RELEASE #23 fba8f85(master)-dirty: Fri Oct 24 04:08:09 CST 2014 root@rd1:/usr/obj/usr/src/sys/GENERIC amd64 root@rd1:~ # Best regards, James Pan > 在 2014年10月27日，下午9:58，Jason Bacon <jwbacon@tds.net> 写道： > > > Jim, > > Thanks much for your efforts on this. I'm sure the Mellanox developers will look into it as they're actively working on an overhaul of the code right now. > > In the meantime, you might want to post more info, like what HCA you're using. > > Cheers, > > Jason > > On 10/27/14 6:39 AM, James Pan via freebsd-infiniband wrote: >> Hi, >> I've configured InfiniBand on two FreeBSD machines (rd1 and rd2); the IB interface appears as ib0 on both machines. >> I assigned 10.9.0.1 to ib0 on rd1 and 10.9.0.2 to ib0 on rd2. >> Then I downloaded some sample code from http://thegeekinthecorner.wordpress.com and modified it a little; the code is attached. >> >> The problem I met is: >> if I run the server on rd2 and then run the client from rd1, the program works as expected.
>> if I run both the server and the client on the same host, the client will fail to connect to the server: >> >> root@rd2:~/the-geek-in-the-corner/01_basic-client-server # ./server >> listening on port 21277. >> root@rd2:~/the-geek-in-the-corner/01_basic-client-server # ./client 10.9.0.2 21277 >> event 1, status -60 >> on_event: unknown event. >> >> It looks like rdma_resolve_addr() has failed. >> >> I did some debugging on the driver and the cause seems to be: >> In the InfiniBand driver, addr_resolve() (in file /usr/src/sys/ofed/drivers/infiniband/core/addr.c) >> depends on arpresolve() to resolve the address, but unfortunately the ifp passed to >> arpresolve() is lo0, because the ifp is taken from the route table, and the route table on rd2 is: >> >> root@rd2:~/the-geek-in-the-corner/01_basic-client-server # netstat -rn >> Routing tables >> >> Internet: >> Destination Gateway Flags Refs Use Netif Expire >> default 192.168.1.1 UGS 0 0 ix0 >> 10.9.0.0/24 link#5 U 0 1 ib0 >> 10.9.0.2 link#5 UHS 0 0 lo0 <----------------- >> 127.0.0.1 link#4 UH 0 0 lo0 >> >> As the destination IP address is 10.9.0.2, lo0 is found and passed to arpresolve(); >> lo0 doesn't have an address, so it fails. >> >> I made some changes to the driver to pass the correct ifp to addr_resolve(); this time addr_resolve() passed but rdma_resolve_route() failed. >> >> Could anyone who has experience with InfiniBand programming help take a look? >> Your help is much appreciated, thanks a lot! >> >> >> >> ------------------------------ the sample code -------------------------------- >> >> root@rd2:~/the-geek-in-the-corner/01_basic-client-server # cat server.c >> #include <stdio.h> >> #include <stdlib.h> >> #include <string.h> >> #include <unistd.h> >> #include <rdma/rdma_cma.h> >>=20 >> #define TEST_NZ(x) do { if ( (x)) die("error: " #x " failed (returned = non-zero)." 
); } while (0) >> #define TEST_Z(x) do { if (!(x)) die("error: " #x " failed (returned = zero/null)."); } while (0) >>=20 >> const int BUFFER_SIZE =3D 1024; >>=20 >> struct context { >> struct ibv_context *ctx; >> struct ibv_pd *pd; >> struct ibv_cq *cq; >> struct ibv_comp_channel *comp_channel; >>=20 >> pthread_t cq_poller_thread; >> }; >>=20 >> struct connection { >> struct ibv_qp *qp; >>=20 >> struct ibv_mr *recv_mr; >> struct ibv_mr *send_mr; >>=20 >> char *recv_region; >> char *send_region; >> }; >>=20 >> static void die(const char *reason); >>=20 >> static void build_context(struct ibv_context *verbs); >> static void build_qp_attr(struct ibv_qp_init_attr *qp_attr); >> static void * poll_cq(void *); >> static void post_receives(struct connection *conn); >> static void register_memory(struct connection *conn); >>=20 >> static void on_completion(struct ibv_wc *wc); >> static int on_connect_request(struct rdma_cm_id *id); >> static int on_connection(void *context); >> static int on_disconnect(struct rdma_cm_id *id); >> static int on_event(struct rdma_cm_event *event); >>=20 >> static struct context *s_ctx =3D NULL; >>=20 >> int main(int argc, char **argv) >> { >> struct sockaddr_in addr; >> struct rdma_cm_event *event =3D NULL; >> struct rdma_cm_id *listener =3D NULL; >> struct rdma_event_channel *ec =3D NULL; >> uint16_t port =3D 0; >>=20 >> memset(&addr, 0, sizeof(addr)); >> addr.sin_family =3D AF_INET; >> addr.sin_len =3D sizeof addr; >>=20 >> TEST_Z(ec =3D rdma_create_event_channel()); >> TEST_NZ(rdma_create_id(ec, &listener, NULL, RDMA_PS_TCP)); >> return 0; >> } >>=20 >> int on_disconnect(struct rdma_cm_id *id) >> { >> struct connection *conn =3D (struct connection *)id->context; >>=20 >> printf("peer disconnected.\n"); >>=20 >> rdma_destroy_qp(id); >>=20 >> ibv_dereg_mr(conn->send_mr); >> ibv_dereg_mr(conn->recv_mr); >>=20 >> free(conn->send_region); >> free(conn->recv_region); >>=20 >> free(conn); >>=20 >> rdma_destroy_id(id); >>=20 >> return 0; >> } 
>>=20 >> int on_event(struct rdma_cm_event *event) >> { >> int r =3D 0; >>=20 >> if (event->event =3D=3D RDMA_CM_EVENT_CONNECT_REQUEST) >> r =3D on_connect_request(event->id); >> else if (event->event =3D=3D RDMA_CM_EVENT_ESTABLISHED) >> r =3D on_connection(event->id->context); >> else if (event->event =3D=3D RDMA_CM_EVENT_DISCONNECTED) >> r =3D on_disconnect(event->id); >> else >> die("on_event: unknown event."); >>=20 >> return r; >> } >>=20 >>=20 >> root@rd2:~/the-geek-in-the-corner/01_basic-client-server # cat = client.c >> #include <netdb.h> >> #include <stdio.h> >> #include <stdlib.h> >> #include <string.h> >> #include <unistd.h> >> #include <rdma/rdma_cma.h> >>=20 >> #define TEST_NZ(x) do { if ( (x)) die("error: " #x " failed (returned = non-zero)." ); } while (0) >> #define TEST_Z(x) do { if (!(x)) die("error: " #x " failed (returned = zero/null)."); } while (0) >>=20 >> const int BUFFER_SIZE =3D 1024; >> const int TIMEOUT_IN_MS =3D 500; /* ms */ >>=20 >> struct context { >> struct ibv_context *ctx; >> struct ibv_pd *pd; >> struct ibv_cq *cq; >> struct ibv_comp_channel *comp_channel; >>=20 >> pthread_t cq_poller_thread; >> }; >>=20 >> struct connection { >> struct rdma_cm_id *id; >> struct ibv_qp *qp; >>=20 >> struct ibv_mr *recv_mr; >> struct ibv_mr *send_mr; >>=20 >> char *recv_region; >> char *send_region; >>=20 >> int num_completions; >> }; >>=20 >> static void die(const char *reason); >>=20 >> static void build_context(struct ibv_context *verbs); >> static void build_qp_attr(struct ibv_qp_init_attr *qp_attr); >> static void * poll_cq(void *); >> static void post_receives(struct connection *conn); >> static void register_memory(struct connection *conn); >>=20 >> static int on_addr_resolved(struct rdma_cm_id *id); >> static void on_completion(struct ibv_wc *wc); >> static int on_connection(void *context); >> static int on_disconnect(struct rdma_cm_id *id); >> static int on_event(struct rdma_cm_event *event); >> static int on_route_resolved(struct 
rdma_cm_id *id); >>=20 >> static struct context *s_ctx =3D NULL; >>=20 >> int main(int argc, char **argv) >> { >> struct addrinfo *addr; >> struct rdma_cm_event *event =3D NULL; >> struct rdma_cm_id *conn=3D NULL; >> struct rdma_event_channel *ec =3D NULL; >>=20 >> if (argc !=3D 3) >> die("usage: client <server-address> <server-port>"); >>=20 >> TEST_NZ(getaddrinfo(argv[1], argv[2], NULL, &addr)); >>=20 >> TEST_Z(ec =3D rdma_create_event_channel()); >> TEST_NZ(rdma_create_id(ec, &conn, NULL, RDMA_PS_TCP)); >> TEST_NZ(rdma_resolve_addr(conn, NULL, addr->ai_addr, = TIMEOUT_IN_MS)); >>=20 >> freeaddrinfo(addr); >>=20 >> while (rdma_get_cm_event(ec, &event) =3D=3D 0) { >> struct rdma_cm_event event_copy; >>=20 >> memcpy(&event_copy, event, sizeof(*event)); >> rdma_ack_cm_event(event); >>=20 >> if (on_event(&event_copy)) >> break; >> } >> free(conn->recv_region); >>=20 >> free(conn); >>=20 >> rdma_destroy_id(id); >>=20 >> return 1; /* exit event loop */ >> } >>=20 >> int on_event(struct rdma_cm_event *event) >> { >> int r =3D 0; >>=20 >> if (event->event =3D=3D RDMA_CM_EVENT_ADDR_RESOLVED) >> r =3D on_addr_resolved(event->id); >> else if (event->event =3D=3D RDMA_CM_EVENT_ROUTE_RESOLVED) >> r =3D on_route_resolved(event->id); >> else if (event->event =3D=3D RDMA_CM_EVENT_ESTABLISHED) >> r =3D on_connection(event->id->context); >> else if (event->event =3D=3D RDMA_CM_EVENT_DISCONNECTED) >> r =3D on_disconnect(event->id); >> else { >> printf("event %d, status %d\n", event->event, event->status); >> die("on_event: unknown event."); >> } >>=20 >> return r; >> } >>=20 >> int on_route_resolved(struct rdma_cm_id *id) >> { >> struct rdma_conn_param cm_params; >>=20 >> printf("route resolved.\n"); >>=20 >> memset(&cm_params, 0, sizeof(cm_params)); >> TEST_NZ(rdma_connect(id, &cm_params)); >>=20 >> return 0; >> } >>=20 >> _______________________________________________ >> freebsd-infiniband@freebsd.org mailing list >> http://lists.freebsd.org/mailman/listinfo/freebsd-infiniband 
>> To unsubscribe, send any mail to "freebsd-infiniband-unsubscribe@freebsd.org" > > > -- > ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ > Jason W. Bacon > jwbacon@tds.net > > Circumstances don't make a man: > They reveal him. > ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ >
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?C50A06D8-B6F3-4B8D-A567-D11F90B08F85>