PostGIS 3.0.6dev-r@@SVN_REVISION@@
Loading...
Searching...
No Matches

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 1030 of file gserialized_estimate.c.

1031{
1032 int ncells1, ncells2;
1033 int ndims1, ndims2, ndims;
1034 double ntuples_max;
1035 double ntuples_not_null1, ntuples_not_null2;
1036
1037 ND_BOX extent1, extent2;
1038 ND_IBOX ibox1, ibox2;
1039 int at1[ND_DIMS];
1040 int at2[ND_DIMS];
1041 double min1[ND_DIMS];
1042 double width1[ND_DIMS];
1043 double cellsize1[ND_DIMS];
1044 int size2[ND_DIMS];
1045 double min2[ND_DIMS];
1046 double width2[ND_DIMS];
1047 double cellsize2[ND_DIMS];
1048 int size1[ND_DIMS];
1049 int d;
1050 double val = 0;
1051 float8 selectivity;
1052
1053 /* Drop out on null inputs */
1054 if ( ! ( s1 && s2 ) )
1055 {
1056 elog(NOTICE, " estimate_join_selectivity called with null inputs");
1057 return FALLBACK_ND_SEL;
1058 }
1059
1060 /* We need to know how many cells each side has... */
1061 ncells1 = (int)roundf(s1->histogram_cells);
1062 ncells2 = (int)roundf(s2->histogram_cells);
1063
1064 /* ...so that we can drive the summation loop with the smaller histogram. */
1065 if ( ncells1 > ncells2 )
1066 {
1067 const ND_STATS *stats_tmp = s1;
1068 s1 = s2;
1069 s2 = stats_tmp;
1070 }
1071
1072 POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1073 POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1074
1075 /* Re-read that info after the swap */
1076 ncells1 = (int)roundf(s1->histogram_cells);
1077 ncells2 = (int)roundf(s2->histogram_cells);
1078
1079 /* Q: What's the largest possible join size these relations can create? */
1080 /* A: The product of the # of non-null rows in each relation. */
1081 ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1082 ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1083 ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1084
1085 /* Get the ndims as ints */
1086 ndims1 = (int)roundf(s1->ndims);
1087 ndims2 = (int)roundf(s2->ndims);
1088 ndims = Max(ndims1, ndims2);
1089
1090 /* Get the extents */
1091 extent1 = s1->extent;
1092 extent2 = s2->extent;
1093
1094 /* If relation stats do not intersect, join is very very selective. */
1095 if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1096 {
1097 POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1098 PG_RETURN_FLOAT8(0.0);
1099 }
1100
1101 /*
1102 * First find the index range of the part of the smaller
1103 * histogram that overlaps the larger one.
1104 */
1105 if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1106 {
1107 POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1108 PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1109 }
1110
1111 /* Initialize counters / constants on s1 */
1112 for ( d = 0; d < ndims1; d++ )
1113 {
1114 at1[d] = ibox1.min[d];
1115 min1[d] = s1->extent.min[d];
1116 width1[d] = s1->extent.max[d] - s1->extent.min[d];
1117 size1[d] = (int)roundf(s1->size[d]);
1118 cellsize1[d] = width1[d] / size1[d];
1119 }
1120
1121 /* Initialize counters / constants on s2 */
1122 for ( d = 0; d < ndims2; d++ )
1123 {
1124 min2[d] = s2->extent.min[d];
1125 width2[d] = s2->extent.max[d] - s2->extent.min[d];
1126 size2[d] = (int)roundf(s2->size[d]);
1127 cellsize2[d] = width2[d] / size2[d];
1128 }
1129
1130 /* For each affected cell of s1... */
1131 do
1132 {
1133 double val1;
1134 /* Construct the bounds of this cell */
1135 ND_BOX nd_cell1;
1136 nd_box_init(&nd_cell1);
1137 for ( d = 0; d < ndims1; d++ )
1138 {
1139 nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1140 nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1141 }
1142
1143 /* Find the cells of s2 that cell1 overlaps.. */
1144 nd_box_overlap(s2, &nd_cell1, &ibox2);
1145
1146 /* Initialize counter */
1147 for ( d = 0; d < ndims2; d++ )
1148 {
1149 at2[d] = ibox2.min[d];
1150 }
1151
1152 POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1153
1154 /* Get the value at this cell */
1155 val1 = s1->value[nd_stats_value_index(s1, at1)];
1156
1157 /* For each overlapped cell of s2... */
1158 do
1159 {
1160 double ratio2;
1161 double val2;
1162
1163 /* Construct the bounds of this cell */
1164 ND_BOX nd_cell2;
1165 nd_box_init(&nd_cell2);
1166 for ( d = 0; d < ndims2; d++ )
1167 {
1168 nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1169 nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1170 }
1171
1172 POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1173
1174 /* Calculate overlap ratio of the cells */
1175 ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1176
1177 /* Multiply the cell counts, scaled by overlap ratio */
1178 val2 = s2->value[nd_stats_value_index(s2, at2)];
1179 POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1180 val += val1 * (val2 * ratio2);
1181 }
1182 while ( nd_increment(&ibox2, ndims2, at2) );
1183
1184 }
1185 while( nd_increment(&ibox1, ndims1, at1) );
1186
1187 POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1188
1189 /*
1190 * In order to compare our total cell count "val" to the
1191 * ntuples_max, we need to scale val up to reflect a full
1192 * table estimate. So, multiply by ratio of table size to
1193 * sample size.
1194 */
1195 val *= (s1->table_features / s1->sample_features);
1196 val *= (s2->table_features / s2->sample_features);
1197
1198 POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1199
1200 /*
1201 * Because the cell counts are over-determined due to
1202 * double counting of features that overlap multiple cells
1203 * (see the compute_gserialized_stats routine)
1204 * we also have to scale our cell count "val" *down*
1205 * to adjust for the double counting.
1206 */
1207// val /= (s1->cells_covered / s1->histogram_features);
1208// val /= (s2->cells_covered / s2->histogram_features);
1209
1210 /*
1211 * Finally, the selectivity is the estimated number of
1212 * rows to be returned divided by the maximum possible
1213 * number of rows that can be returned.
1214 */
1215 selectivity = val / ntuples_max;
1216
1217 /* Guard against over-estimates and crazy numbers :) */
1218 if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1219 {
1220 selectivity = DEFAULT_ND_JOINSEL;
1221 }
1222 else if ( selectivity > 1.0 )
1223 {
1224 selectivity = 1.0;
1225 }
1226
1227 return selectivity;
1228}
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define DEFAULT_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FALLBACK_ND_JOINSEL
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
int min[ND_DIMS]
N-dimensional box index type.
float4 size[ND_DIMS]
N-dimensional statistics structure.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_joinsel_internal().

Here is the call graph for this function:
Here is the caller graph for this function: