Source code for sample_id.deprecated_ann

import logging

import numpy as np

logger = logging.getLogger(__name__)

# Algorithm names that find_neighbors() treats as pyflann-style matchers.
FLANN_ALGS = ["kdtree", "kmeans", "composite", "lsh", "autotuned"]
# Algorithm names that train_matcher() routes to fit_cv2().
# NOTE: the contents are identical to FLANN_ALGS, so a membership test alone
# cannot tell the two backends apart.
CV_ALGS = ["kdtree", "kmeans", "composite", "lsh", "autotuned"]
# Algorithm names that train_matcher() routes to fit_sklearn(), which passes
# them as the ``algorithm`` argument of sklearn's NearestNeighbors.
SKLEARN_ALGS = ["kd_tree", "ball_tree", "brute", "auto"]


def train_matcher(data, algorithm="kdtree"):
    """Build and fit a nearest-neighbor matcher for ``data``.

    Args:
        data: Training vectors; forwarded unchanged to the selected
            ``fit_*`` helper.
        algorithm: Backend selector. Names in ``CV_ALGS`` use ``fit_cv2``,
            names in ``SKLEARN_ALGS`` use ``fit_sklearn``, ``"lshf"`` uses
            ``fit_lshf`` and ``"annoy"`` uses ``fit_annoy``.

    Returns:
        The fitted matcher object produced by the selected helper.

    Raises:
        ValueError: If ``algorithm`` is not one of the supported names.
    """
    # Start from None so an unrecognised name reaches the ValueError below
    # instead of failing with UnboundLocalError on an unassigned local.
    matcher = None
    # The pyflann backend (fit_flann) is not dispatched from here; its
    # algorithm names overlap with CV_ALGS and are served by fit_cv2.
    if algorithm in CV_ALGS:
        matcher = fit_cv2(data, algorithm)
    elif algorithm in SKLEARN_ALGS:
        matcher = fit_sklearn(data, algorithm)
    elif algorithm == "lshf":
        matcher = fit_lshf(data)
    elif algorithm == "annoy":
        matcher = fit_annoy(data)
    # Identity check: a valid matcher object must never be rejected merely
    # because it happens to evaluate as falsy.
    if matcher is None:
        raise ValueError("Invalid matching algorithm: {}".format(algorithm))
    return matcher


def find_neighbors(matcher, data, algorithm="lshf", k=2):
    """Query a fitted matcher for the ``k`` nearest neighbors of each query.

    Args:
        matcher: A fitted matcher, e.g. one returned by ``train_matcher``.
        data: Query vectors, iterated one vector per row.
        algorithm: Name of the algorithm the matcher was built with; it
            selects which backend query API is called. Note that the default
            here (``"lshf"``) differs from ``train_matcher``'s default.
        k: Number of neighbors to retrieve per query vector.

    Returns:
        A ``(distances, indices)`` pair with one entry per query vector.
        The container types depend on the backend: tuples of tuples for
        cv2 matchers, lists of lists for annoy, and whatever the backend
        returns for pyflann and sklearn matchers.

    Raises:
        ValueError: If ``algorithm`` is not one of the supported names.
    """
    logger.info("Finding (approximate) nearest neighbors...")
    if algorithm in CV_ALGS and hasattr(matcher, "knnMatch"):
        # CV_ALGS and FLANN_ALGS hold the same names, so the matcher's own
        # API is what distinguishes an OpenCV matcher from a pyflann index.
        matches = matcher.knnMatch(np.float32(data), k=k)
        # Each row is a sequence of DMatch objects; handle any row length
        # rather than assuming exactly two matches per query.
        distances = tuple(tuple(m.distance for m in row) for row in matches)
        indices = tuple(tuple(m.trainIdx for m in row) for row in matches)
    elif algorithm in FLANN_ALGS:
        # pyflann's nn_index returns an (indices, distances) pair of arrays,
        # not a list of match objects.
        indices, distances = matcher.nn_index(np.float32(data), num_neighbors=k)
    elif algorithm in SKLEARN_ALGS or algorithm == "lshf":
        # Both sklearn estimators expose the same kneighbors() interface.
        distances, indices = matcher.kneighbors(data, n_neighbors=k)
    elif algorithm == "annoy":
        indices = []
        distances = []
        # Annoy is queried one vector at a time.
        for vector in data:
            vector_indices, vector_distances = matcher.get_nns_by_vector(
                vector, k, include_distances=True
            )
            indices.append(vector_indices)
            distances.append(vector_distances)
    else:
        raise ValueError("Invalid matching algorithm: {}".format(algorithm))
    return distances, indices


def nearest_neighbors(test, train, algorithm="lshf", k=2):
    """Fit a matcher on ``train`` and look up neighbors for ``test``.

    Args:
        test: Query vectors passed to ``find_neighbors``.
        train: Training vectors passed to ``train_matcher``.
        algorithm: Algorithm name used for both fitting and querying.
        k: Number of neighbors requested per query vector.

    Returns:
        The ``(distances, indices)`` pair produced by ``find_neighbors``.
    """
    neighbor_distances, neighbor_indices = find_neighbors(
        train_matcher(train, algorithm), test, algorithm, k=k
    )
    return neighbor_distances, neighbor_indices


def fit_cv2(data, algorithm):
    """Fit an OpenCV ``FlannBasedMatcher`` with a kd-tree index on ``data``.

    Args:
        data: Training descriptors; converted to ``float32`` before being
            added to the matcher.
        algorithm: Accepted for signature parity with the other ``fit_*``
            helpers but not consulted; a kd-tree index is always configured.

    Returns:
        The trained ``FlannBasedMatcher``.
    """
    logger.info("Fitting cv2 FLANN...")
    from cv2 import FlannBasedMatcher

    # In OpenCV's FLANN algorithm enum, 0 selects the linear (brute-force)
    # index and 1 selects the randomized kd-tree index. The ``trees`` option
    # below only applies to the kd-tree index, so 1 is the value to use.
    FLANN_INDEX_KDTREE = 1
    # Tuning options such as target_precision, build_weight, memory_weight
    # and sample_fraction are not set here.
    index_params = {
        "algorithm": FLANN_INDEX_KDTREE,
        "trees": 5,
    }
    search_params = {"checks": 5}
    flann = FlannBasedMatcher(index_params, search_params)
    flann.add(np.float32(data))
    flann.train()
    return flann


def fit_flann(data, algorithm):
    """Build a pyflann index over ``data``.

    Args:
        data: Training vectors handed to ``FLANN.build_index``.
        algorithm: Algorithm name forwarded to the ``FLANN`` constructor.

    Returns:
        The ``FLANN`` object after its index has been built.
    """
    logger.info("Fitting  FLANN...")
    from pyflann import FLANN

    # All index and search options are fixed here; only the algorithm varies.
    flann_options = {
        "algorithm": algorithm,
        "checks": 32,
        "eps": 0.0,
        "cb_index": 0.5,
        "trees": 1,
        "leaf_max_size": 4,
        "branching": 32,
        "iterations": 5,
        "centers_init": "random",
        "target_precision": 0.9,
        "build_weight": 0.01,
        "memory_weight": 0.0,
        "sample_fraction": 0.1,
        "log_level": "warning",
        "random_seed": -1,
    }
    index = FLANN(**flann_options)
    index.build_index(data)
    return index


def fit_sklearn(data, algorithm):
    """Fit a scikit-learn ``NearestNeighbors`` estimator on ``data``.

    Args:
        data: Training vectors passed to ``NearestNeighbors.fit``.
        algorithm: Search algorithm name forwarded to ``NearestNeighbors``.

    Returns:
        The fitted ``NearestNeighbors`` estimator.
    """
    logger.info("Fitting Sklearn Matcher: {}...".format(algorithm))
    from sklearn.neighbors import NearestNeighbors

    # Minkowski metric with p=2, using every available core (n_jobs=-1).
    estimator_options = {
        "algorithm": algorithm,
        "n_neighbors": 2,
        "radius": 1.0,
        "leaf_size": 30,
        "metric": "minkowski",
        "p": 2,
        "metric_params": None,
        "n_jobs": -1,
    }
    estimator = NearestNeighbors(**estimator_options)
    estimator.fit(data)
    return estimator


def fit_annoy(data, n_trees=-1):
    """Build an Annoy index with the euclidean metric over the rows of ``data``.

    Args:
        data: Training vectors; must expose ``shape`` so the vector length
            can be read from ``data.shape[1]``.
        n_trees: Tree count forwarded to ``AnnoyIndex.build``.

    Returns:
        The built ``AnnoyIndex``.
    """
    logger.info("Fitting Annoy Matcher...")
    from annoy import AnnoyIndex

    logger.info("Building Annoy index...")
    n_features = data.shape[1]
    index = AnnoyIndex(n_features, metric="euclidean")
    # Item ids are the row positions within ``data``.
    for row_number, vector in enumerate(data):
        index.add_item(row_number, vector)
    logger.info("Building Annoy Matcher...")
    index.build(n_trees)
    return index


def load_annoy(path, n_features=128):
    """Load a previously saved Annoy index from ``path``.

    Args:
        path: Location of the saved index, passed to ``AnnoyIndex.load``.
        n_features: Vector length used to construct the ``AnnoyIndex``
            before loading.

    Returns:
        The ``AnnoyIndex`` (euclidean metric) after loading.
    """
    logger.info("Loading Annoy Index {}...".format(path))
    from annoy import AnnoyIndex

    index = AnnoyIndex(n_features, metric="euclidean")
    index.load(path)
    return index


def fit_lshf(data):
    """Fit a scikit-learn ``LSHForest`` on ``data``.

    Args:
        data: Training vectors passed to ``LSHForest.fit``.

    Returns:
        The fitted ``LSHForest`` estimator.
    """
    logger.info("Fitting  LSHForest...")
    # NOTE(review): LSHForest may be unavailable in recent scikit-learn
    # releases -- confirm against the pinned version.
    from sklearn.neighbors import LSHForest

    forest_options = {
        "n_estimators": 20,
        "min_hash_match": 4,
        "n_candidates": 200,
        "n_neighbors": 2,
        "radius": 1.0,
        "radius_cutoff_ratio": 0.9,
        "random_state": None,
    }
    forest = LSHForest(**forest_options)
    forest.fit(data)
    return forest