Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,10 @@ the foreground set of "ended in failure" versus "NOT ended in failure".
`"background_is_superset": false` indicates that the background set does
not contain the counts of the foreground set as they are filtered out.

`"normalize_above": 1000` facilitates returning consistent significance results
at various scales. `1000` indicates that term counts greater than `1000` are
scaled down by a factor of `1000/term_count`.

[source,console]
--------------------------------------------------
GET /_search
Expand Down Expand Up @@ -466,7 +470,7 @@ GET /_search
]
}
},
"p_value": {"background_is_superset": false}
"p_value": {"background_is_superset": false, "normalize_above": 1000}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.InternalAggregationTestCase;
import org.elasticsearch.test.VersionUtils;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
Expand All @@ -52,7 +53,6 @@
import static java.util.Collections.emptyMap;
import static java.util.Collections.singletonList;
import static org.elasticsearch.search.aggregations.AggregationBuilders.significantTerms;
import static org.elasticsearch.test.VersionUtils.randomVersion;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
Expand All @@ -69,9 +69,13 @@ public abstract class AbstractSignificanceHeuristicTestCase extends ESTestCase {
*/
protected abstract SignificanceHeuristic getHeuristic();

/**
 * Picks the version used by the stream round-trip test.
 * Delegates to {@link VersionUtils#randomVersion} with {@code random()}; the method is
 * protected and non-final, so a subclass can supply a different choice.
 *
 * @return the version chosen for this invocation
 */
protected Version randomVersion() {
    final Version picked = VersionUtils.randomVersion(random());
    return picked;
}

// test that stream output can actually be read - does not replace bwc test
public void testStreamResponse() throws Exception {
Version version = randomVersion(random());
Version version = randomVersion();
InternalMappedSignificantTerms<?, ?> sigTerms = getRandomSignificantTerms(getHeuristic());

// write
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,57 +10,92 @@


import org.apache.commons.math3.util.FastMath;
import org.elasticsearch.Version;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.xcontent.ConstructingObjectParser;
import org.elasticsearch.xcontent.ParseField;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser;
import org.elasticsearch.search.aggregations.AggregationExecutionException;
import org.elasticsearch.search.aggregations.bucket.terms.heuristic.NXYSignificanceHeuristic;
import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;

import java.io.IOException;
import java.util.Objects;

import static org.elasticsearch.xcontent.ConstructingObjectParser.optionalConstructorArg;

/**
* Significant terms heuristic that calculates the p-value between the term existing in foreground and background sets.
*
* The p-value is the probability of obtaining test results at least as extreme as
* the results actually observed, under the assumption that the null hypothesis is
* correct. The p-value is calculated assuming that the foreground set and the
* background set are independent <a href="https://en.wikipedia.org/wiki/Bernoulli_trial">Bernoulli trials</a>, with the null
* hypothesis that the probabilities are the same.
*/
public class PValueScore extends NXYSignificanceHeuristic {
public static final String NAME = "p_value";
public static final ParseField NORMALIZE_ABOVE = new ParseField("normalize_above");
// Parser for the `p_value` heuristic object. Both constructor arguments are optional
// (see the static initializer below): a boolean for BACKGROUND_IS_SUPERSET and a long for NORMALIZE_ABOVE.
// presumably args[] is positional in declaration order (args[0] = superset flag, args[1] = normalize_above) — confirm.
public static final ConstructingObjectParser<PValueScore, Void> PARSER = new ConstructingObjectParser<>(NAME, args -> {
// a missing superset flag (null) is treated as `true`
boolean backgroundIsSuperset = args[0] == null || (boolean) args[0];
// NOTE(review): two consecutive return statements follow; this looks like a before/after pair
// captured from a diff view (the second one forwards args[1]) — confirm against the real file.
return new PValueScore(backgroundIsSuperset);
return new PValueScore(backgroundIsSuperset, (Long)args[1]);
});
static {
// register the two optional constructor arguments with the parser
PARSER.declareBoolean(optionalConstructorArg(), BACKGROUND_IS_SUPERSET);
PARSER.declareLong(optionalConstructorArg(), NORMALIZE_ABOVE);
}

private static final MlChiSquaredDistribution CHI_SQUARED_DISTRIBUTION = new MlChiSquaredDistribution(1);

public PValueScore(boolean backgroundIsSuperset) {
// NOTE: `0` is a magic value indicating no normalization occurs
private final long normalizeAbove;
Comment thread
benwtrent marked this conversation as resolved.

/**
 * Creates the heuristic from its two settings.
 *
 * @param backgroundIsSuperset Does the background contain the foreground docs?
 * @param normalizeAbove Threshold above which results are normalized. May be {@code null},
 *                       in which case the internal value is {@code 0}, the marker for "no normalization".
 * @throws IllegalArgumentException if {@code normalizeAbove} is non-null and not strictly positive
 */
public PValueScore(boolean backgroundIsSuperset, Long normalizeAbove) {
    super(true, backgroundIsSuperset);
    if (normalizeAbove == null) {
        // absent setting: store the "disabled" marker
        this.normalizeAbove = 0L;
    } else if (normalizeAbove > 0) {
        this.normalizeAbove = normalizeAbove;
    } else {
        throw new IllegalArgumentException(
            "[" + NORMALIZE_ABOVE.getPreferredName() + "] must be a positive value, provided [" + normalizeAbove + "]"
        );
    }
}

public PValueScore(StreamInput in) throws IOException {
super(true, in.readBoolean());
if (in.getVersion().onOrAfter(Version.V_8_0_0)) {
normalizeAbove = in.readVLong();
} else {
normalizeAbove = 0L;
}
}

/**
 * Writes the superset flag, followed by the normalization threshold when the
 * target stream version is {@code V_8_0_0} or later.
 *
 * @param out stream to write to
 * @throws IOException if writing to the stream fails
 */
@Override
public void writeTo(StreamOutput out) throws IOException {
    out.writeBoolean(backgroundIsSuperset);
    final boolean targetAcceptsThreshold = out.getVersion().onOrAfter(Version.V_8_0_0);
    if (targetAcceptsThreshold == false) {
        // streams before V_8_0_0 only get the superset flag
        return;
    }
    out.writeVLong(normalizeAbove);
}

@Override
public boolean equals(Object obj) {
if ((obj instanceof PValueScore) == false) {
return false;
}
return super.equals(obj);
public boolean equals(Object o) {
    // same reference: trivially equal
    if (this == o) {
        return true;
    }
    // null or a different runtime class can never be equal
    if (o == null || getClass() != o.getClass()) {
        return false;
    }
    // parent state must match first, then the normalization threshold
    return super.equals(o) && normalizeAbove == ((PValueScore) o).normalizeAbove;
}

// Hash code for the heuristic.
// NOTE(review): this body contains two return paths back to back; it looks like a before/after pair
// captured from a diff view. The first three statements combine NAME.hashCode() with super.hashCode();
// the final statement hashes super.hashCode() together with normalizeAbove — confirm which one the real file keeps.
@Override
public int hashCode() {
int result = NAME.hashCode();
result = 31 * result + super.hashCode();
return result;
return Objects.hash(super.hashCode(), normalizeAbove);
}

@Override
Expand All @@ -72,6 +107,9 @@ public String getWriteableName() {
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
    // `0` marks "no normalization", so the threshold is only emitted when positive
    final boolean emitThreshold = normalizeAbove > 0;

    builder.startObject(NAME);
    final String supersetKey = BACKGROUND_IS_SUPERSET.getPreferredName();
    builder.field(supersetKey, backgroundIsSuperset);
    if (emitThreshold) {
        final String thresholdKey = NORMALIZE_ABOVE.getPreferredName();
        builder.field(thresholdKey, normalizeAbove);
    }
    builder.endObject();

    return builder;
}
Expand Down Expand Up @@ -113,6 +151,19 @@ public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long
return 0.0;
}

if (normalizeAbove > 0L) {
if (allDocsInClass > normalizeAbove) {
double factor = (double) normalizeAbove / allDocsInClass;
allDocsInClass = (long)(allDocsInClass * factor);
docsContainTermInClass = (long)(docsContainTermInClass * factor);
}
if (allDocsNotInClass > normalizeAbove) {
double factor = (double) normalizeAbove / allDocsNotInClass;
allDocsNotInClass = (long)(allDocsNotInClass * factor);
docsContainTermNotInClass = (long)(docsContainTermNotInClass * factor);
}
}

// casting to `long` to round down to nearest whole number
double epsAllDocsInClass = (long)eps(allDocsInClass);
double epsAllDocsNotInClass = (long)eps(allDocsNotInClass);
Expand Down Expand Up @@ -164,15 +215,25 @@ private double eps(double value) {
}

public static class PValueScoreBuilder extends NXYBuilder {
private final long normalizeAbove;

public PValueScoreBuilder(boolean backgroundIsSuperset) {
public PValueScoreBuilder(boolean backgroundIsSuperset, Long normalizeAbove) {
super(true, backgroundIsSuperset);
this.normalizeAbove = normalizeAbove == null ? 0L : normalizeAbove;
if (normalizeAbove != null && normalizeAbove <= 0) {
throw new IllegalArgumentException(
"[" + NORMALIZE_ABOVE.getPreferredName() + "] must be a positive value, provided [" + normalizeAbove + "]"
);
}
}

/**
 * Renders the builder as an object named {@code NAME} containing the superset flag and,
 * when a positive threshold is set, the normalization threshold.
 *
 * @param builder target builder, also returned
 * @param params not read by this method
 * @return the same {@code builder} instance
 * @throws IOException if writing to the builder fails
 */
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
    builder.startObject(NAME);
    final String supersetFieldName = BACKGROUND_IS_SUPERSET.getPreferredName();
    builder.field(supersetFieldName, backgroundIsSuperset);
    final boolean thresholdSet = normalizeAbove > 0;
    if (thresholdSet) {
        builder.field(NORMALIZE_ABOVE.getPreferredName(), normalizeAbove);
    }
    builder.endObject();
    return builder;
}
Expand Down
Loading