Date:      Thu, 9 Apr 2015 10:16:27 -0700
From:      Jim Harris <jim.harris@gmail.com>
To:        Konstantin Belousov <kostikbel@gmail.com>
Cc:        "freebsd-hackers@freebsd.org" <freebsd-hackers@freebsd.org>, Tobias Oberstein <tobias.oberstein@gmail.com>, Michael Fuckner <michael@fuckner.net>, Alan Somers <asomers@freebsd.org>
Subject:   Re: NVMe performance 4x slower than expected
Message-ID:  <CAJP=Hc-WLKe3+DQ=2o21CY=aaQAjADrzEfnD7NVO1Cotu4vcGg@mail.gmail.com>
In-Reply-To: <CAJP=Hc87FMYCrQYGfAtefQ8PLT3WtnvPfPSppp3zRF-0noQR9Q@mail.gmail.com>
References:  <551BC57D.5070101@gmail.com> <CAOtMX2jVwMHSnQfphAF+a2+o7eLp62nHmUo4t+EahrXLWReaFQ@mail.gmail.com> <CAJP=Hc-RNVuhPePg7bnpmT4ByzyXs_CNvAs7Oy7ntXjqhZYhCQ@mail.gmail.com> <551C5A82.2090306@gmail.com> <20150401212303.GB2379@kib.kiev.ua> <CAJP=Hc87FMYCrQYGfAtefQ8PLT3WtnvPfPSppp3zRF-0noQR9Q@mail.gmail.com>

On Wed, Apr 1, 2015 at 2:34 PM, Jim Harris <jim.harris@gmail.com> wrote:

>
>
> Yes - this is exactly the problem.
>
> nvme does use MSI-X if it can allocate the vectors (one per core).  With
> 48 cores, I suspect we are quickly running out of vectors, so NVMe is
> reverting to INTx.
>
> Could you actually send vmstat -ia (I left off the 'a' previously) - just
> so we can see all allocated interrupt vectors.
>
> As an experiment, can you try disabling hyperthreading - this will reduce
> the number of cores and should let you get MSI-X vectors allocated for at
> least the first couple of NVMe controllers.  Then please re-run your
> performance test on one of those controllers.
>
> sys/x86/x86/local_apic.c defines APIC_NUM_IOINTS as 191 - it looks like
> this is the actual limit for MSI-X vectors, even though NUM_MSI_INTS is
> set to 512.
>

Update on this vector allocation discussion:

The DC P3*00 SSDs have a 32-entry MSI-X vector table, which is fewer than
the 48 cores on your system, so the driver falls back to INTx.  I committed
r281280, which makes it fall back instead to 2 MSI-X vectors (one for the
admin queue, one for a single I/O queue) - still better than INTx.  In the
future the driver will be improved to use however many vectors are
available rather than falling back to a single I/O queue.  (Based on your
ramdisk performance data, the lack of per-CPU NVMe I/O queues does not
appear to be the cause of the performance issues on this system - but I'm
working to confirm that on a system in my lab.)
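
To make the fallback concrete, here is a rough sketch of the logic described
above.  This is not the actual nvme(4) attach code - the function name and
structure are invented - it just illustrates the idea using the stock PCI
MSI-X KPI (pci_msix_count(), pci_alloc_msix(), pci_release_msi()):

/*
 * Hedged sketch only, not the real nvme(4) code.  Try one vector per core
 * plus one for the admin queue, capped at the device's MSI-X table size;
 * then drop to a minimal 2-vector configuration; then give up and use INTx.
 */
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/smp.h>
#include <dev/pci/pcivar.h>

static int
sketch_nvme_setup_interrupts(device_t dev, int *nvecs)
{
	int want, got;

	/* Ideal: one admin vector plus one I/O vector per core. */
	want = mp_ncpus + 1;

	/* The device may expose fewer table entries (32 on the DC P3*00). */
	if (pci_msix_count(dev) < want)
		want = pci_msix_count(dev);

	got = want;
	if (want >= 2 && pci_alloc_msix(dev, &got) == 0) {
		if (got >= 2) {
			*nvecs = got;		/* admin queue + >=1 I/O queue */
			return (0);
		}
		pci_release_msi(dev);		/* one vector is not enough */
	}

	/* Post-r281280 behavior: minimal 2-vector MSI-X configuration. */
	got = 2;
	if (pci_msix_count(dev) >= 2 && pci_alloc_msix(dev, &got) == 0 &&
	    got == 2) {
		*nvecs = 2;
		return (0);
	}

	/* Last resort: shared INTx with a single I/O queue. */
	*nvecs = 0;
	return (0);
}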

I also root-caused a vector allocation issue and filed PR199321.  In
summary, driver initialization occurs before the APs are started, so the
MSI-X allocation code only allocates IRQs from the BSP's lapic, which can
accommodate at most 191 (APIC_NUM_IOINTS) vectors.  The allocation code is
designed to round-robin allocations across all lapics, but it does not do
so until after the APs are started, which happens after driver
initialization during boot.  Drivers that are loaded after boot should
round-robin IRQ allocation across all physical cores, up to the
NUM_MSI_INTS (512) limit.
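
To make the numbers concrete, here is a tiny self-contained model of that
behavior - hypothetical code, not the actual sys/x86 interrupt code - which
shows why a boot-time attach tops out at 191 vectors while a post-boot load
can spread requests across all lapics up to 512:

/*
 * Toy model only.  Each lapic has APIC_NUM_IOINTS usable vectors; before
 * the APs are started the CPU selector always returns CPU 0 (the BSP), so
 * boot-time MSI-X requests all compete for the same 191 slots.  Drivers
 * loaded after boot get round-robin placement and can reach NUM_MSI_INTS.
 */
#include <stdbool.h>
#include <stdio.h>

#define NCPUS           48
#define APIC_NUM_IOINTS 191
#define NUM_MSI_INTS    512

static int
count_grantable(bool aps_started)
{
	int vectors_used[NCPUS] = { 0 };
	int next_cpu = 0, granted = 0;

	while (granted < NUM_MSI_INTS) {
		/* Before the APs run, everything is bound to the BSP. */
		int cpu = aps_started ? next_cpu : 0;

		next_cpu = (next_cpu + 1) % NCPUS;
		if (vectors_used[cpu] >= APIC_NUM_IOINTS)
			break;			/* the chosen lapic is full */
		vectors_used[cpu]++;
		granted++;
	}
	return (granted);
}

int
main(void)
{
	printf("grantable at boot-time attach: %d\n", count_grantable(false));
	printf("grantable for post-boot loads: %d\n", count_grantable(true));
	return (0);
}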

>
>
>>
>> I have had the following patch for a long time; it increased pps in
>> iperf and similar tests when DMAR is enabled. In your case it could
>> reduce the rate of DMAR interrupts.
>>
>> diff --git a/sys/x86/iommu/intel_ctx.c b/sys/x86/iommu/intel_ctx.c
>> index a18adcf..b23a4c1 100644
>> --- a/sys/x86/iommu/intel_ctx.c
>> +++ b/sys/x86/iommu/intel_ctx.c
>> @@ -586,6 +586,18 @@ dmar_ctx_unload_entry(struct dmar_map_entry *entry, bool free)
>>         }
>>  }
>>
>> +static struct dmar_qi_genseq *
>> +dmar_ctx_unload_gseq(struct dmar_ctx *ctx, struct dmar_map_entry *entry,
>> +    struct dmar_qi_genseq *gseq)
>> +{
>> +
>> +       if (TAILQ_NEXT(entry, dmamap_link) != NULL)
>> +               return (NULL);
>> +       if (ctx->batch_no++ % dmar_batch_coalesce != 0)
>> +               return (NULL);
>> +       return (gseq);
>> +}
>> +
>>  void
>>  dmar_ctx_unload(struct dmar_ctx *ctx, struct dmar_map_entries_tailq *entries,
>>      bool cansleep)
>> @@ -619,8 +631,7 @@ dmar_ctx_unload(struct dmar_ctx *ctx, struct dmar_map_entries_tailq *entries,
>>                 entry->gseq.gen = 0;
>>                 entry->gseq.seq = 0;
>>                 dmar_qi_invalidate_locked(ctx, entry->start, entry->end -
>> -                   entry->start, TAILQ_NEXT(entry, dmamap_link) == NULL ?
>> -                   &gseq : NULL);
>> +                   entry->start, dmar_ctx_unload_gseq(ctx, entry, &gseq));
>>         }
>>         TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) {
>>                 entry->gseq = gseq;
>> diff --git a/sys/x86/iommu/intel_dmar.h b/sys/x86/iommu/intel_dmar.h
>> index 2865ab5..6e0ab7f 100644
>> --- a/sys/x86/iommu/intel_dmar.h
>> +++ b/sys/x86/iommu/intel_dmar.h
>> @@ -93,6 +93,7 @@ struct dmar_ctx {
>>         u_int entries_cnt;
>>         u_long loads;
>>         u_long unloads;
>> +       u_int batch_no;
>>         struct dmar_gas_entries_tree rb_root;
>>         struct dmar_map_entries_tailq unload_entries; /* Entries to unload */
>>         struct dmar_map_entry *first_place, *last_place;
>> @@ -339,6 +340,7 @@ extern dmar_haddr_t dmar_high;
>>  extern int haw;
>>  extern int dmar_tbl_pagecnt;
>>  extern int dmar_match_verbose;
>> +extern int dmar_batch_coalesce;
>>  extern int dmar_check_free;
>>
>>  static inline uint32_t
>> diff --git a/sys/x86/iommu/intel_drv.c b/sys/x86/iommu/intel_drv.c
>> index c239579..e7dc3f9 100644
>> --- a/sys/x86/iommu/intel_drv.c
>> +++ b/sys/x86/iommu/intel_drv.c
>> @@ -153,7 +153,7 @@ dmar_count_iter(ACPI_DMAR_HEADER *dmarh, void *arg)
>>         return (1);
>>  }
>>
>> -static int dmar_enable = 0;
>> +static int dmar_enable = 1;
>>  static void
>>  dmar_identify(driver_t *driver, device_t parent)
>>  {
>> diff --git a/sys/x86/iommu/intel_utils.c b/sys/x86/iommu/intel_utils.c
>> index f696f9d..d3c3267 100644
>> --- a/sys/x86/iommu/intel_utils.c
>> +++ b/sys/x86/iommu/intel_utils.c
>> @@ -624,6 +624,7 @@ dmar_barrier_exit(struct dmar_unit *dmar, u_int barrier_id)
>>  }
>>
>>  int dmar_match_verbose;
>> +int dmar_batch_coalesce = 100;
>>
>>  static SYSCTL_NODE(_hw, OID_AUTO, dmar, CTLFLAG_RD, NULL, "");
>>  SYSCTL_INT(_hw_dmar, OID_AUTO, tbl_pagecnt, CTLFLAG_RD,
>> @@ -632,6 +633,9 @@ SYSCTL_INT(_hw_dmar, OID_AUTO, tbl_pagecnt, CTLFLAG_RD,
>>  SYSCTL_INT(_hw_dmar, OID_AUTO, match_verbose, CTLFLAG_RWTUN,
>>      &dmar_match_verbose, 0,
>>      "Verbose matching of the PCI devices to DMAR paths");
>> +SYSCTL_INT(_hw_dmar, OID_AUTO, batch_coalesce, CTLFLAG_RW | CTLFLAG_TUN,
>> +    &dmar_batch_coalesce, 0,
>> +    "Number of qi batches between interrupt");
>>  #ifdef INVARIANTS
>>  int dmar_check_free;
>>  SYSCTL_INT(_hw_dmar, OID_AUTO, check_free, CTLFLAG_RWTUN,
>>
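
One note for anyone who wants to experiment with the patch: the new knob
should show up as the hw.dmar.batch_coalesce sysctl (and, per CTLFLAG_TUN,
it can also be set as a loader tunable).  Here is a small userland sketch
for reading and raising it at runtime - it assumes the patched kernel, and
the value 200 is just an arbitrary example:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	int cur, desired = 200;		/* arbitrary example value */
	size_t len = sizeof(cur);

	/* Read the current coalescing factor. */
	if (sysctlbyname("hw.dmar.batch_coalesce", &cur, &len, NULL, 0) != 0)
		err(1, "hw.dmar.batch_coalesce (patch not applied?)");
	printf("current batch_coalesce: %d\n", cur);

	/* Raise it: fewer QI completion interrupts per unload batch. */
	if (sysctlbyname("hw.dmar.batch_coalesce", NULL, NULL, &desired,
	    sizeof(desired)) != 0)
		err(1, "setting hw.dmar.batch_coalesce (are you root?)");
	return (0);
}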