Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 27 Oct 2014 19:39:49 +0800
From:      James Pan <jiaming.pan@yahoo.com>
To:        "freebsd-infiniband@freebsd.org" <freebsd-infiniband@freebsd.org>
Subject:   Infiniband loopback fails to connect
Message-ID:  <E93F1DC5-FEBE-4E52-BDEF-20641A593F64@yahoo.com>

next in thread | raw e-mail | index | archive | help
Hi,
I've configured InfiniBand on two FreeBSD machines (rd1 and rd2); the IB
interface appears as ib0 on both machines.
I assigned 10.9.0.1 to ib0 on rd1 and 10.9.0.2 to ib0 on rd2.
Then I downloaded some sample code from
http://thegeekinthecorner.wordpress.com and modified it a little; the
code is included below.

The problem I ran into is:
If I run the server on rd2 and then run the client from rd1, the program
works as expected.
If I run both the server and the client on the same host, the client
fails to connect to the server:

root@rd2:~/the-geek-in-the-corner/01_basic-client-server # ./server
listening on port 21277.
root@rd2:~/the-geek-in-the-corner/01_basic-client-server # ./client =
10.9.0.2 21277                                                           =
        =20
event 1, status -60
on_event: unknown event.

It looks like rdma_resolve_addr() has failed.

I did some debugging on the driver, and the cause seems to be the following:
in the InfiniBand driver, addr_resolve() (in the file
/usr/src/sys/ofed/drivers/infiniband/core/addr.c)
depends on arpresolve() to resolve the address, but unfortunately the ifp
passed to
arpresolve() is lo0, because the ifp is obtained from the routing table, and
the routing table on rd2 is:

root@rd2:~/the-geek-in-the-corner/01_basic-client-server # netstat -rn
Routing tables

Internet:
Destination        Gateway            Flags    Refs      Use  Netif =
Expire
default            192.168.1.1        UGS         0        0    ix0
10.9.0.0/24        link#5             U           0        1    ib0
10.9.0.2           link#5             UHS         0        0    lo0 =
<-----------------
127.0.0.1          link#4             UH          0        0    lo0

Because the destination IP address is 10.9.0.2, lo0 is found and passed to
arpresolve().
lo0 doesn't have an address, so it fails.

I made some changes to the driver to pass the correct ifp to
addr_resolve(); this time addr_resolve() succeeded, but rdma_resolve_route()
failed.

Could anyone who has experience with InfiniBand programming help take a
look?
Your help is much appreciated, thanks a lot!



------------------------------ the sample code =
--------------------------------

root@rd2:~/the-geek-in-the-corner/01_basic-client-server # cat server.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <rdma/rdma_cma.h>

/*
 * Check-or-die helpers. Each evaluates `x` exactly once and calls die() with
 * a message that names the failing expression.
 *   TEST_NZ(x) - calls die() when `x` evaluates to non-zero.
 *   TEST_Z(x)  - calls die() when `x` evaluates to zero / NULL.
 * Each definition must stay on a single line: splitting it inside the string
 * literal (as the mail transport did) yields an unterminated string.
 */
#define TEST_NZ(x) do { if ( (x)) die("error: " #x " failed (returned non-zero)." ); } while (0)
#define TEST_Z(x)  do { if (!(x)) die("error: " #x " failed (returned zero/null)."); } while (0)

/* Buffer size in bytes -- presumably for the send/receive regions; the
 * allocation site is not part of this excerpt (TODO confirm). */
const int BUFFER_SIZE = 1024;

/*
 * Verbs-level state for the server. The file keeps one pointer to this
 * structure in the file-scope variable s_ctx.
 */
struct context {
  struct ibv_context      *ctx;           /* verbs context handle */
  struct ibv_pd           *pd;            /* presumably the protection domain */
  struct ibv_cq           *cq;            /* presumably the completion queue */
  struct ibv_comp_channel *comp_channel;  /* completion channel handle */

  /* Thread handle; poll_cq() is declared in this file and is presumably
   * the thread's entry point (TODO confirm against the full source). */
  pthread_t cq_poller_thread;
};

/*
 * Per-connection record. on_disconnect() reads it back from id->context and
 * releases every member listed here.
 */
struct connection {
  struct ibv_qp *qp;

  /* Memory-region handles, deregistered with ibv_dereg_mr() on disconnect. */
  struct ibv_mr *recv_mr, *send_mr;

  /* Heap buffers, released with free() on disconnect. */
  char *recv_region, *send_region;
};

/* Error sink invoked by the TEST_NZ / TEST_Z macros. */
static void die(const char *reason);

/* Verbs setup and completion-queue helpers (defined outside this excerpt). */
static void build_context(struct ibv_context *verbs);
static void build_qp_attr(struct ibv_qp_init_attr *qp_attr);
static void * poll_cq(void *);
static void post_receives(struct connection *conn);
static void register_memory(struct connection *conn);

/* Event handlers; on_event() dispatches connection-manager events to
 * on_connect_request(), on_connection() and on_disconnect(). */
static void on_completion(struct ibv_wc *wc);
static int on_connect_request(struct rdma_cm_id *id);
static int on_connection(void *context);
static int on_disconnect(struct rdma_cm_id *id);
static int on_event(struct rdma_cm_event *event);

/* Shared verbs state; starts out NULL (the code that fills it in is not
 * part of this excerpt). */
static struct context *s_ctx = NULL;

int main(int argc, char **argv)
{
  struct sockaddr_in addr;
  struct rdma_cm_event *event =3D NULL;
  struct rdma_cm_id *listener =3D NULL;
  struct rdma_event_channel *ec =3D NULL;
  uint16_t port =3D 0;

  memset(&addr, 0, sizeof(addr));
  addr.sin_family =3D AF_INET;
  addr.sin_len =3D sizeof addr;

  TEST_Z(ec =3D rdma_create_event_channel());
  TEST_NZ(rdma_create_id(ec, &listener, NULL, RDMA_PS_TCP));
  return 0;
}

int on_disconnect(struct rdma_cm_id *id)
{
  struct connection *conn =3D (struct connection *)id->context;

  printf("peer disconnected.\n");

  rdma_destroy_qp(id);

  ibv_dereg_mr(conn->send_mr);
  ibv_dereg_mr(conn->recv_mr);

  free(conn->send_region);
  free(conn->recv_region);

  free(conn);

  rdma_destroy_id(id);

  return 0;
}

int on_event(struct rdma_cm_event *event)
{
  int r =3D 0;

  if (event->event =3D=3D RDMA_CM_EVENT_CONNECT_REQUEST)
    r =3D on_connect_request(event->id);
  else if (event->event =3D=3D RDMA_CM_EVENT_ESTABLISHED)
    r =3D on_connection(event->id->context);
  else if (event->event =3D=3D RDMA_CM_EVENT_DISCONNECTED)
    r =3D on_disconnect(event->id);
  else
    die("on_event: unknown event.");

  return r;
}


root@rd2:~/the-geek-in-the-corner/01_basic-client-server # cat client.c
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <rdma/rdma_cma.h>

/*
 * Check-or-die helpers. Each evaluates `x` exactly once and calls die() with
 * a message that names the failing expression.
 *   TEST_NZ(x) - calls die() when `x` evaluates to non-zero.
 *   TEST_Z(x)  - calls die() when `x` evaluates to zero / NULL.
 * Each definition must stay on a single line: splitting it inside the string
 * literal (as the mail transport did) yields an unterminated string.
 */
#define TEST_NZ(x) do { if ( (x)) die("error: " #x " failed (returned non-zero)." ); } while (0)
#define TEST_Z(x)  do { if (!(x)) die("error: " #x " failed (returned zero/null)."); } while (0)

/* Buffer size in bytes -- presumably for the send/receive regions; the
 * allocation site is not part of this excerpt (TODO confirm). */
const int BUFFER_SIZE = 1024;

/* Timeout, in milliseconds, that main() passes to rdma_resolve_addr(). */
const int TIMEOUT_IN_MS = 500;

/*
 * Verbs-level state for the client. The file keeps one pointer to this
 * structure in the file-scope variable s_ctx.
 */
struct context {
  struct ibv_context      *ctx;           /* verbs context handle */
  struct ibv_pd           *pd;            /* presumably the protection domain */
  struct ibv_cq           *cq;            /* presumably the completion queue */
  struct ibv_comp_channel *comp_channel;  /* completion channel handle */

  /* Thread handle; poll_cq() is declared in this file and is presumably
   * the thread's entry point (TODO confirm against the full source). */
  pthread_t cq_poller_thread;
};

/*
 * Per-connection record for the client.
 * NOTE(review): the code that allocates and fills this structure is not
 * part of this excerpt; member roles below are read off the names and types.
 */
struct connection {
  struct rdma_cm_id *id;   /* CM identifier associated with this connection */
  struct ibv_qp *qp;       /* queue pair handle */

  /* Memory-region handles for the two buffers below (presumably). */
  struct ibv_mr *recv_mr, *send_mr;

  /* Receive and send buffers. */
  char *recv_region, *send_region;

  int num_completions;     /* completion counter -- TODO confirm usage */
};

/* Error sink invoked by the TEST_NZ / TEST_Z macros. */
static void die(const char *reason);

/* Verbs setup and completion-queue helpers (defined outside this excerpt). */
static void build_context(struct ibv_context *verbs);
static void build_qp_attr(struct ibv_qp_init_attr *qp_attr);
static void * poll_cq(void *);
static void post_receives(struct connection *conn);
static void register_memory(struct connection *conn);

/* Event handlers; on_event() dispatches connection-manager events to
 * on_addr_resolved(), on_route_resolved(), on_connection() and
 * on_disconnect(). */
static int on_addr_resolved(struct rdma_cm_id *id);
static void on_completion(struct ibv_wc *wc);
static int on_connection(void *context);
static int on_disconnect(struct rdma_cm_id *id);
static int on_event(struct rdma_cm_event *event);
static int on_route_resolved(struct rdma_cm_id *id);

/* Shared verbs state; starts out NULL (the code that fills it in is not
 * part of this excerpt). */
static struct context *s_ctx = NULL;

/*
 * Client entry point: client <server-address> <server-port>
 *
 * Resolves the server address with getaddrinfo(), creates an RDMA CM event
 * channel and ID, starts address resolution with a TIMEOUT_IN_MS timeout, and
 * then hands every CM event to on_event() until a handler returns non-zero
 * or rdma_get_cm_event() fails.
 *
 * Usage errors and setup failures are reported through die().
 * Returns 0 once the event loop has ended.
 */
int main(int argc, char **argv)
{
  struct addrinfo *addr = NULL;
  struct rdma_cm_event *event = NULL;
  struct rdma_cm_id *conn = NULL;
  struct rdma_event_channel *ec = NULL;

  if (argc != 3)
    die("usage: client <server-address> <server-port>");

  TEST_NZ(getaddrinfo(argv[1], argv[2], NULL, &addr));

  TEST_Z(ec = rdma_create_event_channel());
  TEST_NZ(rdma_create_id(ec, &conn, NULL, RDMA_PS_TCP));
  TEST_NZ(rdma_resolve_addr(conn, NULL, addr->ai_addr, TIMEOUT_IN_MS));

  /* The address list is not used again after rdma_resolve_addr(). */
  freeaddrinfo(addr);

  while (rdma_get_cm_event(ec, &event) == 0) {
    struct rdma_cm_event event_copy;

    /* Work on a private copy so the event can be acknowledged right away. */
    memcpy(&event_copy, event, sizeof(*event));
    rdma_ack_cm_event(event);

    if (on_event(&event_copy))
      break;
  }

  /*
   * Only the event channel is released here. `conn` is a struct rdma_cm_id,
   * not a struct connection, so it has no buffers to free.
   * NOTE(review): the client's on_disconnect() is not shown in this excerpt;
   * it presumably destroys the CM ID, so it is not destroyed again here --
   * confirm against the full source.
   */
  rdma_destroy_event_channel(ec);

  return 0;
}

int on_event(struct rdma_cm_event *event)
{
  int r =3D 0;

  if (event->event =3D=3D RDMA_CM_EVENT_ADDR_RESOLVED)
    r =3D on_addr_resolved(event->id);
  else if (event->event =3D=3D RDMA_CM_EVENT_ROUTE_RESOLVED)
    r =3D on_route_resolved(event->id);
  else if (event->event =3D=3D RDMA_CM_EVENT_ESTABLISHED)
    r =3D on_connection(event->id->context);
  else if (event->event =3D=3D RDMA_CM_EVENT_DISCONNECTED)
    r =3D on_disconnect(event->id);
  else {
    printf("event %d, status %d\n", event->event, event->status);
    die("on_event: unknown event.");
  }

  return r;
}

/*
 * Handler for RDMA_CM_EVENT_ROUTE_RESOLVED: announces the event on stdout
 * and issues rdma_connect() on `id` with all-zero connection parameters.
 *
 * A non-zero result from rdma_connect() is reported through die() via
 * TEST_NZ; otherwise the function returns 0.
 */
int on_route_resolved(struct rdma_cm_id *id)
{
  struct rdma_conn_param cm_params;

  /* Every field of the parameter block is zero when it reaches rdma_connect(). */
  memset(&cm_params, 0, sizeof(cm_params));

  fputs("route resolved.\n", stdout);

  TEST_NZ(rdma_connect(id, &cm_params));
  return 0;
}

=20=



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?E93F1DC5-FEBE-4E52-BDEF-20641A593F64>