◆ estimate_join_selectivity()

static float8 estimate_join_selectivity	(	const ND_STATS *	s1,
		const ND_STATS *	s2
	)

static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To estimate the number of join rows, we walk through the cells of one histogram and, for each cell, multiply its value by the value of every cell it overlaps in the other histogram, scaled by the proportion of overlap between the two cells: val += val1 * ( val2 * overlap_ratio )

Definition at line 1030 of file gserialized_estimate.c.

{
        /*
         * Estimate the selectivity of a join between the two relations
         * described by the histograms s1 and s2.
         *
         * s1, s2: per-relation statistics; either may be NULL, in which
         *         case a NOTICE is logged and FALLBACK_ND_JOINSEL is returned.
         *
         * Returns:
         *   - 0.0 when the two extents do not intersect;
         *   - FALLBACK_ND_JOINSEL when the inputs are NULL or the overlap
         *     index range of s1 against s2's extent cannot be computed;
         *   - DEFAULT_ND_JOINSEL when the arithmetic yields a non-finite
         *     or negative result;
         *   - otherwise (estimated join rows / maximum join rows), capped
         *     at 1.0.
         *
         * This function returns a plain float8, not a Datum, so it must
         * use "return" and never the PG_RETURN_* macros.
         */
        int ncells1, ncells2;
        int ndims1, ndims2, ndims;
        double ntuples_max;
        double ntuples_not_null1, ntuples_not_null2;

        ND_BOX extent1, extent2;
        ND_IBOX ibox1, ibox2;
        /*
         * Zero the cell counters: only the first ndims1/ndims2 entries are
         * assigned below, but the debug output always prints entries 0 and 1.
         */
        int at1[ND_DIMS] = {0};
        int at2[ND_DIMS] = {0};
        double min1[ND_DIMS];
        double width1[ND_DIMS];
        double cellsize1[ND_DIMS];
        int size2[ND_DIMS];
        double min2[ND_DIMS];
        double width2[ND_DIMS];
        double cellsize2[ND_DIMS];
        int size1[ND_DIMS];
        int d;
        double val = 0;
        float8 selectivity;

        /* Drop out on null inputs */
        if ( ! ( s1 && s2 ) )
        {
                elog(NOTICE, " estimate_join_selectivity called with null inputs");
                /* Join estimator: use the join fallback, as the other failure path does. */
                return FALLBACK_ND_JOINSEL;
        }

        /* We need to know how many cells each side has... */
        ncells1 = (int)roundf(s1->histogram_cells);
        ncells2 = (int)roundf(s2->histogram_cells);

        /* ...so that we can drive the summation loop with the smaller histogram. */
        if ( ncells1 > ncells2 )
        {
                const ND_STATS *stats_tmp = s1;
                s1 = s2;
                s2 = stats_tmp;
        }

        POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
        POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));

        /* Q: What's the largest possible join size these relations can create? */
        /* A: The product of the # of non-null rows in each relation. */
        ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
        ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
        ntuples_max = ntuples_not_null1 * ntuples_not_null2;

        /* Get the ndims as ints */
        ndims1 = (int)roundf(s1->ndims);
        ndims2 = (int)roundf(s2->ndims);
        ndims = Max(ndims1, ndims2);

        /* Get the extents */
        extent1 = s1->extent;
        extent2 = s2->extent;

        /* If relation stats do not intersect, join is very very selective. */
        if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
        {
                POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
                return 0.0;
        }

        /*
         * First find the index range of the part of the smaller
         * histogram that overlaps the larger one.
         */
        if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
        {
                POSTGIS_DEBUG(3, "could not calculate overlap of relations");
                return FALLBACK_ND_JOINSEL;
        }

        /* Initialize counters / constants on s1 */
        for ( d = 0; d < ndims1; d++ )
        {
                at1[d] = ibox1.min[d];
                min1[d] = s1->extent.min[d];
                width1[d] = s1->extent.max[d] - s1->extent.min[d];
                size1[d] = (int)roundf(s1->size[d]);
                cellsize1[d] = width1[d] / size1[d];
        }

        /* Initialize counters / constants on s2 */
        for ( d = 0; d < ndims2; d++ )
        {
                min2[d] = s2->extent.min[d];
                width2[d] = s2->extent.max[d] - s2->extent.min[d];
                size2[d] = (int)roundf(s2->size[d]);
                cellsize2[d] = width2[d] / size2[d];
        }

        /* For each affected cell of s1... */
        do
        {
                double val1;
                /* Construct the bounds of this cell */
                ND_BOX nd_cell1;
                nd_box_init(&nd_cell1);
                for ( d = 0; d < ndims1; d++ )
                {
                        nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
                        nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
                }

                /*
                 * Find the cells of s2 that cell1 overlaps.
                 * NOTE(review): the return value is ignored here, unlike the
                 * call above on s1; confirm nd_box_overlap always leaves
                 * ibox2 in a usable state when it reports failure.
                 */
                nd_box_overlap(s2, &nd_cell1, &ibox2);

                /* Initialize counter */
                for ( d = 0; d < ndims2; d++ )
                {
                        at2[d] = ibox2.min[d];
                }

                POSTGIS_DEBUGF(3, "at1 %d,%d  %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));

                /* Get the value at this cell */
                val1 = s1->value[nd_stats_value_index(s1, at1)];

                /* For each overlapped cell of s2... */
                do
                {
                        double ratio2;
                        double val2;

                        /* Construct the bounds of this cell */
                        ND_BOX nd_cell2;
                        nd_box_init(&nd_cell2);
                        for ( d = 0; d < ndims2; d++ )
                        {
                                nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
                                nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
                        }

                        POSTGIS_DEBUGF(3, "  at2 %d,%d  %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));

                        /* Calculate overlap ratio of the cells (ndims == Max(ndims1, ndims2)) */
                        ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, ndims);

                        /* Multiply the cell counts, scaled by overlap ratio */
                        val2 = s2->value[nd_stats_value_index(s2, at2)];
                        POSTGIS_DEBUGF(3, "  val1 %.6g  val2 %.6g  ratio %.6g", val1, val2, ratio2);
                        val += val1 * (val2 * ratio2);
                }
                while ( nd_increment(&ibox2, ndims2, at2) );

        }
        while( nd_increment(&ibox1, ndims1, at1) );

        POSTGIS_DEBUGF(3, "val of histogram = %g", val);

        /*
         * In order to compare our total cell count "val" to the
         * ntuples_max, we need to scale val up to reflect a full
         * table estimate. So, multiply by ratio of table size to
         * sample size.
         */
        val *= (s1->table_features / s1->sample_features);
        val *= (s2->table_features / s2->sample_features);

        POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);

        /*
         * NOTE: the cell counts are over-determined due to double
         * counting of features that overlap multiple cells (see the
         * compute_gserialized_stats routine). Scaling "val" *down*
         * to adjust for that double counting is currently disabled:
         *
         *   val /= (s1->cells_covered / s1->histogram_features);
         *   val /= (s2->cells_covered / s2->histogram_features);
         */

        /*
         * Finally, the selectivity is the estimated number of
         * rows to be returned divided by the maximum possible
         * number of rows that can be returned.
         */
        selectivity = val / ntuples_max;

        /*
         * Guard against over-estimates and crazy numbers :)
         * isfinite() is false for NaN as well as +/-infinity, so this also
         * catches a zero sample_features or zero ntuples_max above.
         */
        if ( ! isfinite(selectivity) || selectivity < 0.0 )
        {
                selectivity = DEFAULT_ND_JOINSEL;
        }
        else if ( selectivity > 1.0 )
        {
                selectivity = 1.0;
        }

        return selectivity;
}

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_joinsel_internal().

Here is the call graph for this function:

Here is the caller graph for this function: