11.29 java中如何計算兩個字符串的相似度?

發現apache提供了現成的解決方案


java中如何計算兩個字符串的相似度?



1.Cosine similarity

package org.apache.commons.text.similarity;

import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
* Measures the Cosine similarity of two vectors of an inner product space and
* compares the angle between them.
*
*


* For further explanation about the Cosine Similarity, refer to
* http://en.wikipedia.org/wiki/Cosine_similarity.
*


*
* @since 1.0
*/
public class CosineSimilarity {

/**
* Calculates the cosine similarity for two given vectors.
*
* @param leftVector left vector
* @param rightVector right vector
* @return cosine similarity between the two vectors
*/
public Double cosineSimilarity(final Map<charsequence> leftVector, final Map<charsequence> rightVector) {
if (leftVector == null || rightVector == null) {
throw new IllegalArgumentException("Vectors must not be null");
}

final Set<charsequence> intersection = getIntersection(leftVector, rightVector);

final double dotProduct = dot(leftVector, rightVector, intersection);
double d1 = 0.0d;
for (final Integer value : leftVector.values()) {
d1 += Math.pow(value, 2);
}
double d2 = 0.0d;
for (final Integer value : rightVector.values()) {
d2 += Math.pow(value, 2);
}
double cosineSimilarity;
if (d1 <= 0.0 || d2 <= 0.0) {
cosineSimilarity = 0.0;
} else {
cosineSimilarity = (double) (dotProduct / (double) (Math.sqrt(d1) * Math.sqrt(d2)));
}
return cosineSimilarity;
}

/**
* Returns a set with strings common to the two given maps.
*
* @param leftVector left vector map
* @param rightVector right vector map
* @return common strings
*/
private Set<charsequence> getIntersection(final Map<charsequence> leftVector,
final Map<charsequence> rightVector) {
final Set<charsequence> intersection = new HashSet<>(leftVector.keySet());
intersection.retainAll(rightVector.keySet());
return intersection;
}

/**
* Computes the dot product of two vectors. It ignores remaining elements. It means
* that if a vector is longer than other, then a smaller part of it will be used to compute
* the dot product.
*
* @param leftVector left vector
* @param rightVector right vector
* @param intersection common elements
* @return the dot product
*/
private double dot(final Map<charsequence> leftVector, final Map<charsequence> rightVector,
final Set<charsequence> intersection) {
long dotProduct = 0;
for (final CharSequence key : intersection) {
dotProduct += leftVector.get(key) * rightVector.get(key);
}
return dotProduct;
}

}/<charsequence>/<charsequence>/<charsequence>/<charsequence>/<charsequence>/<charsequence>/<charsequence>/<charsequence>/<charsequence>/<charsequence>

2.JaccardSimilarity

package org.apache.commons.text.similarity;

import java.util.HashSet;
import java.util.Set;

/**
* Measures the Jaccard similarity (aka Jaccard index) of two sets of character
* sequence. Jaccard similarity is the size of the intersection divided by the
* size of the union of the two sets.
*
*


* For further explanation about Jaccard Similarity, refer
* https://en.wikipedia.org/wiki/Jaccard_index
*


*
* @since 1.0
*/
public class JaccardSimilarity implements SimilarityScore<double> {

/**
* Calculates Jaccard Similarity of two set character sequence passed as
* input.
*
* @param left first character sequence
* @param right second character sequence
* @return index
* @throws IllegalArgumentException
* if either String input {@code null}
*/
@Override
public Double apply(CharSequence left, CharSequence right) {
if (left == null || right == null) {
throw new IllegalArgumentException("Input cannot be null");
}
return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 100d;
}

/**
* Calculates Jaccard Similarity of two character sequences passed as
* input. Does the calculation by identifying the union (characters in at
* least one of the two sets) of the two sets and intersection (characters
* which are present in set one which are present in set two)
*
* @param left first character sequence
* @param right second character sequence
* @return index
*/
private Double calculateJaccardSimilarity(CharSequence left, CharSequence right) {
Set<string> intersectionSet = new HashSet<string>();
Set<string> unionSet = new HashSet<string>();
boolean unionFilled = false;
int leftLength = left.length();
int rightLength = right.length();
if (leftLength == 0 || rightLength == 0) {
return 0d;
}

for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) {
unionSet.add(String.valueOf(left.charAt(leftIndex)));
for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) {
if (!unionFilled) {
unionSet.add(String.valueOf(right.charAt(rightIndex)));
}
if (left.charAt(leftIndex) == right.charAt(rightIndex)) {
intersectionSet.add(String.valueOf(left.charAt(leftIndex)));
}
}
unionFilled = true;
}
return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size());
}
}/<string>/<string>/<string>/<string>/<double>

3.LevenshteinDistance

/**
\t * LevenshteinDistance
* copied from https://commons.apache.org/proper/commons-lang/javadocs/api-2.5/src-html/org/apache/commons/lang/StringUtils.html#line.6162
*/
public static int getLevenshteinDistance(String s, String t) {
if (s == null || t == null) {
throw new IllegalArgumentException("Strings must not be null");
}

int n = s.length(); // length of s
int m = t.length(); // length of t

if (n == 0) {
return m;
} else if (m == 0) {
return n;
}

if (n > m) {
// swap the input strings to consume less memory
String tmp = s;
s = t;
t = tmp;
n = m;
m = t.length();
}

int p[] = new int[n + 1]; //'previous' cost array, horizontally
int d[] = new int[n + 1]; // cost array, horizontally
int _d[]; //placeholder to assist in swapping p and d

// indexes into strings s and t
int i; // iterates through s
int j; // iterates through t

char t_j; // jth character of t

int cost; // cost

for (i = 0; i <= n; i++) {
p[i] = i;
}

for (j = 1; j <= m; j++) {
t_j = t.charAt(j - 1);
d[0] = j;

for (i = 1; i <= n; i++) {
cost = s.charAt(i - 1) == t_j ? 0 : 1;
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
}

// copy current distance counts to 'previous row' distance counts
_d = p;
p = d;
d = _d;
}

// our last action in the above loop was to switch d and p, so p now
// actually has the most recent cost counts
return p[n];
}

4.JaroWinklerDistance

/**
* JaroWinklerDistance
* Copied from https://commons.apache.org/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/JaroWinklerDistance.java.html
* apply method changed to similarity
*/
public static Double similarity(final CharSequence left, final CharSequence right) {
final double defaultScalingFactor = 0.1;
final double percentageRoundValue = 100.0;

if (left == null || right == null) {
throw new IllegalArgumentException("Strings must not be null");
}

int[] mtp = matches(left, right);
double m = mtp[0];
if (m == 0) {
return 0D;
}
double j = ((m / left.length() + m / right.length() + (m - mtp[1]) / m)) / 3;
double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j);
return Math.round(jw * percentageRoundValue) / percentageRoundValue;
}

protected static int[] matches(final CharSequence first, final CharSequence second) {
CharSequence max, min;
if (first.length() > second.length()) {
max = first;
min = second;
} else {
max = second;
min = first;
}
int range = Math.max(max.length() / 2 - 1, 0);
int[] matchIndexes = new int[min.length()];
Arrays.fill(matchIndexes, -1);
boolean[] matchFlags = new boolean[max.length()];
int matches = 0;
for (int mi = 0; mi < min.length(); mi++) {
char c1 = min.charAt(mi);
for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
if (!matchFlags[xi] && c1 == max.charAt(xi)) {
matchIndexes[mi] = xi;
matchFlags[xi] = true;
matches++;
break;
}
}
}
char[] ms1 = new char[matches];
char[] ms2 = new char[matches];
for (int i = 0, si = 0; i < min.length(); i++) {
if (matchIndexes[i] != -1) {
ms1[si] = min.charAt(i);
si++;
}
}
for (int i = 0, si = 0; i < max.length(); i++) {
if (matchFlags[i]) {
ms2[si] = max.charAt(i);
si++;
}
}
int transpositions = 0;
for (int mi = 0; mi < ms1.length; mi++) {
if (ms1[mi] != ms2[mi]) {
transpositions++;
}
}
int prefix = 0;
for (int mi = 0; mi < min.length(); mi++) {
if (first.charAt(mi) == second.charAt(mi)) {
prefix++;
} else {
break;
}
}
return new int[] { matches, transpositions / 2, prefix, max.length() };
}





【1】https://stackoverflow.com/questions/955110/similarity-string-comparison-in-java

【2】https://zatackcoder.com/java-program-to-check-two-strings-similarity/

【3】https://commons.apache.org/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/index.source.html


分享到:


相關文章: