Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?
Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).
To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )
1031{
1032 int ncells1, ncells2;
1033 int ndims1, ndims2, ndims;
1034 double ntuples_max;
1035 double ntuples_not_null1, ntuples_not_null2;
1036
1049 int d;
1050 double val = 0;
1051 float8 selectivity;
1052
1053
1054 if ( ! ( s1 && s2 ) )
1055 {
1056 elog(NOTICE, " estimate_join_selectivity called with null inputs");
1058 }
1059
1060
1063
1064
1065 if ( ncells1 > ncells2 )
1066 {
1068 s1 = s2;
1069 s2 = stats_tmp;
1070 }
1071
1074
1075
1078
1079
1080
1083 ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1084
1085
1086 ndims1 = (int)roundf(s1->
ndims);
1087 ndims2 = (int)roundf(s2->
ndims);
1088 ndims = Max(ndims1, ndims2);
1089
1090
1093
1094
1096 {
1097 POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1098 PG_RETURN_FLOAT8(0.0);
1099 }
1100
1101
1102
1103
1104
1106 {
1107 POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1109 }
1110
1111
1112 for ( d = 0; d < ndims1; d++ )
1113 {
1114 at1[d] = ibox1.
min[d];
1117 size1[d] = (int)roundf(s1->
size[d]);
1118 cellsize1[d] = width1[d] / size1[d];
1119 }
1120
1121
1122 for ( d = 0; d < ndims2; d++ )
1123 {
1126 size2[d] = (int)roundf(s2->
size[d]);
1127 cellsize2[d] = width2[d] / size2[d];
1128 }
1129
1130
1131 do
1132 {
1133 double val1;
1134
1137 for ( d = 0; d < ndims1; d++ )
1138 {
1139 nd_cell1.
min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1140 nd_cell1.
max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1141 }
1142
1143
1145
1146
1147 for ( d = 0; d < ndims2; d++ )
1148 {
1149 at2[d] = ibox2.
min[d];
1150 }
1151
1152 POSTGIS_DEBUGF(3,
"at1 %d,%d %s", at1[0], at1[1],
nd_box_to_json(&nd_cell1, ndims1));
1153
1154
1156
1157
1158 do
1159 {
1160 double ratio2;
1161 double val2;
1162
1163
1166 for ( d = 0; d < ndims2; d++ )
1167 {
1168 nd_cell2.
min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1169 nd_cell2.
max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1170 }
1171
1172 POSTGIS_DEBUGF(3,
" at2 %d,%d %s", at2[0], at2[1],
nd_box_to_json(&nd_cell2, ndims2));
1173
1174
1175 ratio2 =
nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1176
1177
1179 POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1180 val += val1 * (val2 * ratio2);
1181 }
1183
1184 }
1186
1187 POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1188
1189
1190
1191
1192
1193
1194
1197
1198 POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215 selectivity = val / ntuples_max;
1216
1217
1218 if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1219 {
1221 }
1222 else if ( selectivity > 1.0 )
1223 {
1224 selectivity = 1.0;
1225 }
1226
1227 return selectivity;
1228}
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define DEFAULT_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FALLBACK_ND_JOINSEL
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
N-dimensional box index type.
N-dimensional statistics structure.