The gserialized_analyze_nd function sets this function as a callback on the stats object when it is called by the ANALYZE command.
ANALYZE then gathers the requisite number of sample rows and then calls this function.
We could also pass stats->extra_data in from gserialized_analyze_nd (things like the column type or other stuff from the system catalogs) but so far we don't use that capability.
Our job is to build some statistics on the sample data for use by operator estimators.
We will populate an n-d histogram using the provided sample rows. The selectivity estimators (sel and joinsel) can then use the histogram.
1368{
1369 MemoryContext old_context;
1370 int d, i;
1371 int notnull_cnt = 0;
1372 int null_cnt = 0;
1373 int histogram_features = 0;
1374
1376 size_t nd_stats_size;
1377
1378 double total_width = 0;
1379 double total_sample_volume = 0;
1380 double total_cell_count = 0;
1381
1385
1386 const ND_BOX **sample_boxes;
1391 int histo_cells_target;
1392 int histo_cells;
1393 int histo_cells_new = 1;
1394
1395 int ndims = 2;
1396 int histo_ndims = 0;
1397 double sample_distribution[
ND_DIMS];
1398 double total_distribution;
1399
1400 int stats_slot;
1401 int stats_kind;
1402
1403
1409
1410
1411
1412
1413
1414
1415
1416 POSTGIS_DEBUG(2, "compute_gserialized_stats called");
1417 POSTGIS_DEBUGF(3, " # sample_rows: %d", sample_rows);
1418 POSTGIS_DEBUGF(3, " estimate of total_rows: %.6g", total_rows);
1419
1420
1421
1422
1423
1424 sample_boxes = palloc(
sizeof(
ND_BOX*) * sample_rows);
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436 for ( i = 0; i < sample_rows; i++ )
1437 {
1438 Datum datum;
1442 bool is_null;
1443 bool is_copy;
1444
1445 datum = fetchfunc(stats, i, &is_null);
1446
1447
1448 if ( is_null )
1449 {
1450 POSTGIS_DEBUGF(4, " skipped null geometry %d", i);
1451 null_cnt++;
1452 continue;
1453 }
1454
1455
1457 is_copy = VARATT_IS_EXTENDED(datum);
1459 {
1460
1461 POSTGIS_DEBUGF(3, " skipped empty geometry %d", i);
1462 continue;
1463 }
1464
1465
1466 if ( mode == 2 )
1468
1469
1471 {
1472 POSTGIS_DEBUGF(3, " skipped infinite/nan geometry %d", i);
1473 continue;
1474 }
1475
1476
1477
1478
1479
1480 if ( mode != 2 )
1482
1483
1484 nd_box = palloc(
sizeof(
ND_BOX));
1486
1487
1488 sample_boxes[notnull_cnt] = nd_box;
1489
1490
1491 if ( ! notnull_cnt )
1493
1494
1496
1497
1498 total_width += VARSIZE(geom);
1499
1500
1501 for ( d = 0; d < ndims; d++ )
1502 {
1503 sum.
min[d] += nd_box->
min[d];
1504 sum.
max[d] += nd_box->
max[d];
1505 }
1506
1507
1508 notnull_cnt++;
1509
1510
1511 if ( is_copy )
1512 pfree(geom);
1513
1514
1515 vacuum_delay_point();
1516 }
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526 histo_cells_target = (int)pow((double)(stats->attr->attstattarget), (double)ndims);
1527 histo_cells_target = Min(histo_cells_target, ndims * 10000);
1528 histo_cells_target = Min(histo_cells_target, (int)(total_rows/5));
1529 POSTGIS_DEBUGF(3, " stats->attr->attstattarget: %d", stats->attr->attstattarget);
1530 POSTGIS_DEBUGF(3, " target # of histogram cells: %d", histo_cells_target);
1531
1532
1533 if ( ! notnull_cnt )
1534 {
1535 Oid relation_oid = stats->attr->attrelid;
1536 char *relation_name = get_rel_name(relation_oid);
1537 elog(NOTICE,
1538 "PostGIS: Unable to compute statistics for \"%s.%s\": No non-null/empty features",
1539 relation_name ? relation_name : "(NULL)",
1540 stats->attr->attname.
data);
1541 stats->stats_valid = false;
1542 return;
1543 }
1544
1545 POSTGIS_DEBUGF(3,
" sample_extent: %s",
nd_box_to_json(&sample_extent, ndims));
1546
1547
1548
1549
1550
1551 for ( d = 0; d < ndims; d++ )
1552 {
1553
1554 avg.
min[d] = sum.
min[d] / notnull_cnt;
1555 avg.
max[d] = sum.
max[d] / notnull_cnt;
1556
1557
1558 for ( i = 0; i < notnull_cnt; i++ )
1559 {
1560 const ND_BOX *ndb = sample_boxes[i];
1561 stddev.
min[d] += (ndb->
min[d] - avg.
min[d]) * (ndb->
min[d] - avg.
min[d]);
1562 stddev.
max[d] += (ndb->
max[d] - avg.
max[d]) * (ndb->
max[d] - avg.
max[d]);
1563 }
1564 stddev.
min[d] = sqrt(stddev.
min[d] / notnull_cnt);
1565 stddev.
max[d] = sqrt(stddev.
max[d] / notnull_cnt);
1566
1567
1570 }
1571
1572
1573
1574
1575
1576
1578 for ( i = 0; i < notnull_cnt; i++ )
1579 {
1580 const ND_BOX *ndb = sample_boxes[i];
1581
1583 {
1584 POSTGIS_DEBUGF(4, " feature %d is a hard deviant, skipped", i);
1585 sample_boxes[i] = NULL;
1586 continue;
1587 }
1588
1590 }
1591
1592
1593
1594
1596 histo_extent = histo_extent_new;
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1611 sample_distribution);
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627 for ( d = 0; d < ndims; d++ )
1628 {
1629 if ( sample_distribution[d] > 0 )
1630 histo_ndims++;
1631 }
1632
1633 if ( histo_ndims == 0 )
1634 {
1635
1636
1637 POSTGIS_DEBUG(3, " special case: no axes have variability");
1638 histo_cells_new = 1;
1639 for ( d = 0; d < ndims; d++ )
1640 {
1641 histo_size[d] = (int)pow((double)histo_cells_target, 1/(double)ndims);
1642 if ( ! histo_size[d] )
1643 histo_size[d] = 1;
1644 POSTGIS_DEBUGF(3, " histo_size[d]: %d", histo_size[d]);
1645 histo_cells_new *= histo_size[d];
1646 }
1647 POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1648 }
1649 else
1650 {
1651
1652
1653
1654
1655
1656 POSTGIS_DEBUG(3, " allocating histogram axes based on axis variability");
1657 total_distribution =
total_double(sample_distribution, ndims);
1658 POSTGIS_DEBUGF(3, " total_distribution: %.8g", total_distribution);
1659 histo_cells_new = 1;
1660 for ( d = 0; d < ndims; d++ )
1661 {
1662 if ( sample_distribution[d] == 0 )
1663 {
1664 histo_size[d] = 1;
1665 }
1666 else
1667 {
1668
1669 float edge_ratio = (float)sample_distribution[d] / (float)total_distribution;
1670
1671
1672
1673
1674
1675 histo_size[d] = (int)pow(histo_cells_target * histo_ndims * edge_ratio, 1/(double)histo_ndims);
1676
1677 if ( ! histo_size[d] )
1678 histo_size[d] = 1;
1679 }
1680 histo_cells_new *= histo_size[d];
1681 }
1682 POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1683 }
1684
1685
1686 histo_cells = histo_cells_new;
1687 POSTGIS_DEBUGF(3, " histo_cells: %d", histo_cells);
1688
1689
1690
1691
1692 old_context = MemoryContextSwitchTo(stats->anl_context);
1693 nd_stats_size =
sizeof(
ND_STATS) + ((histo_cells - 1) *
sizeof(float4));
1694 nd_stats = palloc(nd_stats_size);
1695 memset(nd_stats, 0, nd_stats_size);
1696 MemoryContextSwitchTo(old_context);
1697
1698
1699 nd_stats->
ndims = ndims;
1700 nd_stats->
extent = histo_extent;
1704
1705 for ( d = 0; d < ndims; d++ )
1706 nd_stats->
size[d] = histo_size[d];
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721 for ( i = 0; i < notnull_cnt; i++ )
1722 {
1726 int d;
1727 double num_cells = 0;
1728 double tmp_volume = 1.0;
1729 double min[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1730 double max[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1731 double cellsize[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1732
1733 nd_box = sample_boxes[i];
1734 if ( ! nd_box ) continue;
1735
1736
1737 vacuum_delay_point();
1738
1739
1741 memset(at, 0,
sizeof(
int)*
ND_DIMS);
1742
1743 POSTGIS_DEBUGF(3, " feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
1744 nd_ibox.
min[0], nd_ibox.
min[1], nd_ibox.
min[2], nd_ibox.
min[3],
1745 nd_ibox.
max[0], nd_ibox.
max[1], nd_ibox.
max[2], nd_ibox.
max[3]);
1746
1747 for ( d = 0; d < nd_stats->
ndims; d++ )
1748 {
1749
1750 at[d] = nd_ibox.
min[d];
1753 cellsize[d] = (max[d] - min[d])/(nd_stats->
size[d]);
1754
1755
1756 tmp_volume *= (nd_box->
max[d] - nd_box->
min[d]);
1757 }
1758
1759
1760 total_sample_volume += tmp_volume;
1761
1762
1763
1764
1765
1766 do
1767 {
1768 ND_BOX nd_cell = { {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0} };
1769 double ratio;
1770
1771 for ( d = 0; d < nd_stats->
ndims; d++ )
1772 {
1773 nd_cell.
min[d] = min[d] + (at[d]+0) * cellsize[d];
1774 nd_cell.
max[d] = min[d] + (at[d]+1) * cellsize[d];
1775 }
1776
1777
1778
1779
1780
1781
1784 num_cells += ratio;
1785 POSTGIS_DEBUGF(3, " ratio (%.8g) num_cells (%.8g)", ratio, num_cells);
1786 POSTGIS_DEBUGF(3, " at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
1787 }
1789
1790
1791 total_cell_count += num_cells;
1792
1793 histogram_features++;
1794 }
1795
1796 POSTGIS_DEBUGF(3, " histogram_features: %d", histogram_features);
1797 POSTGIS_DEBUGF(3, " sample_rows: %d", sample_rows);
1798 POSTGIS_DEBUGF(3, " table_rows: %.6g", total_rows);
1799
1800
1801 if ( ! histogram_features )
1802 {
1803 POSTGIS_DEBUG(3, " no stats have been gathered");
1804 elog(NOTICE, " no features lie in the stats histogram, invalid stats");
1805 stats->stats_valid = false;
1806 return;
1807 }
1808
1812
1813
1814 if ( mode == 2 )
1815 {
1818 }
1819 else
1820 {
1823 }
1824
1825
1826 stats->stakind[stats_slot] = stats_kind;
1827 stats->staop[stats_slot] = InvalidOid;
1828 stats->stanumbers[stats_slot] = (float4*)nd_stats;
1829 stats->numnumbers[stats_slot] = nd_stats_size/sizeof(float4);
1830 stats->stanullfrac = (float4)null_cnt/sample_rows;
1831 stats->stawidth = total_width/notnull_cnt;
1832 stats->stadistinct = -1.0;
1833 stats->stats_valid = true;
1834
1835 POSTGIS_DEBUGF(3, " out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
1836 POSTGIS_DEBUGF(3, " out: slot 0: op %d (InvalidOid)", stats->staop[0]);
1837 POSTGIS_DEBUGF(3, " out: slot 0: numnumbers %d", stats->numnumbers[0]);
1838 POSTGIS_DEBUGF(3, " out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
1839 POSTGIS_DEBUGF(3, " out: average width: %d bytes", stats->stawidth);
1840 POSTGIS_DEBUG (3, " out: distinct values: all (no check done)");
1842
1843
1844
1845
1846 return;
1847}
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
int gserialized_get_gbox_p(const GSERIALIZED *g, GBOX *gbox)
Read the box from the GSERIALIZED or calculate it if necessary.
struct ND_STATS_T ND_STATS
N-dimensional statistics structure.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_box_init_bounds(ND_BOX *a)
Prepare an ND_BOX for bounds calculation: set the maxes to the smallest thing possible and the mins t...
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define STATISTIC_SLOT_ND
static int gbox_ndims(const GBOX *gbox)
Given that geodetic boxes are X/Y/Z regardless of the underlying geometry dimensionality and other bo...
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define STATISTIC_KIND_2D
static int nd_box_merge(const ND_BOX *source, ND_BOX *target)
Expand the bounds of target to include source.
#define STATISTIC_KIND_ND
static double total_double(const double *vals, int nvals)
Given double array, return sum of values.
static void nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
Set the values of an ND_BOX from a GBOX.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_expand(ND_BOX *nd_box, double expansion_factor)
Expand an ND_BOX ever so slightly.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
#define STATISTIC_SLOT_2D
static int nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
Calculate how much a set of boxes is homogeneously distributed or concentrated within one dimension,...
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
N-dimensional box index type.
float4 histogram_features
N-dimensional statistics structure.