Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F97161816
PhutilEditDistanceMatrix.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Jan 3, 01:36
Size
14 KB
Mime Type
text/x-php
Expires
Sun, Jan 5, 01:36 (2 d)
Engine
blob
Format
Raw Data
Handle
23340883
Attached To
rPHU libphutil
PhutilEditDistanceMatrix.php
View Options
<?php
/**
* Compute edit distance between two scalar sequences. This class uses
* Levenshtein (or Damerau-Levenshtein) to compute the edit distance between
* two inputs. The inputs are arrays containing any scalars (not just strings)
* so it can be used with, e.g., utf8 sequences.
*
* $matrix = id(new PhutilEditDistanceMatrix())
* ->setSequences(str_split('ran'), str_split('rat'));
*
* $cost = $matrix->getEditDistance();
*
* Edit distance computation is slow and requires a large amount of memory;
* both are roughly O(N * M) in the length of the strings.
*
* You can customize the cost of insertions, deletions and replacements. By
* default, each has a cost of 1.
*
* $matrix->setReplaceCost(1);
*
* By default, transpositions are not evaluated (i.e., the algorithm is
* Levenshtein). You can cause transpositions to be evaluated by setting a
* transpose cost (which will change the algorithm to Damerau-Levenshtein):
*
* $matrix->setTransposeCost(1);
*
* You can also provide a cost to alter the type of operation being applied.
* Many strings have several equivalently expensive edit sequences, but one
* some sequences are more readable to humans than others. Providing a small
* cost to alter operation type tends to smooth out the sequence and produce
* long runs of a single operation, which are generally more readable. For
* example, these strings:
*
* (x)
* ((x))
*
* ...have edit strings "issis" and "isssi", which describe edit operations with
* the same cost, but the latter string is more comprehensible to human viewers.
*
* If you set an alter cost, you must call @{method:setComputeString} to enable
* type computation. The alter cost should generally be smaller than `c / N`,
* where `c` is the smallest operational cost and `N` is the length of the
* longest string. For example, if you are using the default costs (insert = 1,
* delete = 1, replace = 1) and computing edit distances for strings of fewer
* than 1,000 characters, you might set the alter cost to 0.001.
*/
final
class
PhutilEditDistanceMatrix
extends
Phobject
{
private
$insertCost
=
1
;
private
$deleteCost
=
1
;
private
$replaceCost
=
1
;
private
$transposeCost
=
null
;
private
$alterCost
=
0
;
private
$maximumLength
;
private
$computeString
;
private
$applySmoothing
=
self
::
SMOOTHING_NONE
;
private
$reachedMaximumLength
;
private
$x
;
private
$y
;
private
$prefix
;
private
$suffix
;
private
$distanceMatrix
=
null
;
private
$typeMatrix
=
null
;
const
SMOOTHING_NONE
=
'none'
;
const
SMOOTHING_INTERNAL
=
'internal'
;
const
SMOOTHING_FULL
=
'full'
;
public
function
setMaximumLength
(
$maximum_length
)
{
$this
->
maximumLength
=
$maximum_length
;
return
$this
;
}
public
function
getMaximumLength
()
{
return
coalesce
(
$this
->
maximumLength
,
$this
->
getInfinity
());
}
public
function
didReachMaximumLength
()
{
return
$this
->
reachedMaximumLength
;
}
public
function
setComputeString
(
$compute_string
)
{
$this
->
computeString
=
$compute_string
;
return
$this
;
}
public
function
getComputeString
()
{
return
$this
->
computeString
;
}
public
function
setTransposeCost
(
$transpose_cost
)
{
$this
->
transposeCost
=
$transpose_cost
;
return
$this
;
}
public
function
getTransposeCost
()
{
return
$this
->
transposeCost
;
}
public
function
setReplaceCost
(
$replace_cost
)
{
$this
->
replaceCost
=
$replace_cost
;
return
$this
;
}
public
function
getReplaceCost
()
{
return
$this
->
replaceCost
;
}
public
function
setDeleteCost
(
$delete_cost
)
{
$this
->
deleteCost
=
$delete_cost
;
return
$this
;
}
public
function
getDeleteCost
()
{
return
$this
->
deleteCost
;
}
public
function
setInsertCost
(
$insert_cost
)
{
$this
->
insertCost
=
$insert_cost
;
return
$this
;
}
public
function
getInsertCost
()
{
return
$this
->
insertCost
;
}
public
function
setAlterCost
(
$alter_cost
)
{
$this
->
alterCost
=
$alter_cost
;
return
$this
;
}
public
function
getAlterCost
()
{
return
$this
->
alterCost
;
}
public
function
setApplySmoothing
(
$apply_smoothing
)
{
$this
->
applySmoothing
=
$apply_smoothing
;
return
$this
;
}
public
function
getApplySmoothing
()
{
return
$this
->
applySmoothing
;
}
public
function
setSequences
(
array
$x
,
array
$y
)
{
// NOTE: We strip common prefixes and suffixes from the inputs because
// the runtime of the edit distance algorithm is large and it is common
// to diff similar strings.
$xl
=
count
(
$x
);
$yl
=
count
(
$y
);
$l
=
min
(
$xl
,
$yl
);
$prefix_l
=
0
;
$suffix_l
=
0
;
for
(
$ii
=
0
;
$ii
<
$l
;
$ii
++)
{
if
(
$x
[
$ii
]
!==
$y
[
$ii
])
{
break
;
}
$prefix_l
++;
}
for
(
$ii
=
1
;
$ii
<=
(
$l
-
$prefix_l
);
$ii
++)
{
if
(
$x
[
$xl
-
$ii
]
!==
$y
[
$yl
-
$ii
])
{
break
;
}
$suffix_l
++;
}
$this
->
prefix
=
array_slice
(
$x
,
0
,
$prefix_l
);
$this
->
suffix
=
array_slice
(
$x
,
$xl
-
$suffix_l
);
$this
->
x
=
array_slice
(
$x
,
$prefix_l
,
$xl
-
(
$suffix_l
+
$prefix_l
));
$this
->
y
=
array_slice
(
$y
,
$prefix_l
,
$yl
-
(
$suffix_l
+
$prefix_l
));
$this
->
distanceMatrix
=
null
;
return
$this
;
}
private
function
requireSequences
()
{
if
(
$this
->
x
===
null
)
{
throw
new
PhutilInvalidStateException
(
'setSequences'
);
}
}
public
function
getEditDistance
()
{
$this
->
requireSequences
();
$x
=
$this
->
x
;
$y
=
$this
->
y
;
if
(!
$x
)
{
return
$this
->
insertCost
*
count
(
$y
);
}
if
(!
$y
)
{
return
$this
->
deleteCost
*
count
(
$x
);
}
$max
=
$this
->
getMaximumLength
();
if
(
count
(
$x
)
>
$max
||
count
(
$y
)
>
$max
)
{
$this
->
reachedMaximumLength
=
true
;
return
(
$this
->
insertCost
*
count
(
$y
))
+
(
$this
->
deleteCost
*
count
(
$x
));
}
if
(
$x
===
$y
)
{
return
0
;
}
$matrix
=
$this
->
getDistanceMatrix
();
return
$matrix
[
count
(
$x
)][
count
(
$y
)];
}
/**
* Return a string representing the edits between the sequences. The string
* has these characters:
*
* - s (same): Same character in both strings.
* - i (insert): Character inserted.
* - d (delete): Character deleted.
* - x (replace): Character replaced.
* - t (transpose): Character transposed.
*/
public
function
getEditString
()
{
$this
->
requireSequences
();
$x
=
$this
->
x
;
$y
=
$this
->
y
;
if
(!
$x
&&
!
$y
)
{
return
$this
->
padEditString
(
''
);
}
if
(!
$x
)
{
return
$this
->
padEditString
(
str_repeat
(
'i'
,
count
(
$y
)));
}
if
(!
$y
)
{
return
$this
->
padEditString
(
str_repeat
(
'd'
,
count
(
$x
)));
}
if
(
$x
===
$y
)
{
return
$this
->
padEditString
(
str_repeat
(
's'
,
count
(
$x
)));
}
$max
=
$this
->
getMaximumLength
();
if
(
count
(
$x
)
>
$max
||
count
(
$y
)
>
$max
)
{
$this
->
reachedMaximumLength
=
true
;
return
$this
->
padEditString
(
str_repeat
(
'd'
,
count
(
$x
)).
str_repeat
(
'i'
,
count
(
$y
)));
}
$matrix
=
$this
->
getTypeMatrix
();
$xx
=
count
(
$x
);
$yy
=
count
(
$y
);
$transposes
=
array
();
$str
=
''
;
while
(
true
)
{
$type
=
$matrix
[
$xx
][
$yy
];
if
(
is_array
(
$type
))
{
$chr
=
't'
;
$transposes
[
$type
[
0
]][
$type
[
1
]]
=
true
;
$type
=
$type
[
2
];
}
else
{
$chr
=
$type
;
}
if
(
isset
(
$transposes
[
$xx
][
$yy
]))
{
$chr
=
't'
;
}
if
(
$type
==
's'
||
$type
==
'x'
)
{
$xx
-=
1
;
$yy
-=
1
;
}
else
if
(
$type
==
'i'
)
{
$yy
-=
1
;
}
else
if
(
$type
==
'd'
)
{
$xx
-=
1
;
}
else
{
throw
new
Exception
(
pht
(
"Unknown type '%s' in type matrix."
,
$type
));
}
$str
.=
$chr
;
if
(
$xx
<=
0
&&
$yy
<=
0
)
{
break
;
}
}
$str
=
strrev
(
$str
);
// For full smoothing, we pad the edit string before smoothing it, so
// ranges of similar characters at the beginning or end of the string can
// also be smoothed.
// For internal smoothing, we only smooth ranges within the change itself.
$smoothing
=
$this
->
getApplySmoothing
();
switch
(
$smoothing
)
{
case
self
::
SMOOTHING_FULL
:
$str
=
$this
->
padEditString
(
$str
);
$str
=
$this
->
applySmoothing
(
$str
,
true
);
break
;
case
self
::
SMOOTHING_INTERNAL
:
$str
=
$this
->
applySmoothing
(
$str
,
false
);
$str
=
$this
->
padEditString
(
$str
);
break
;
case
self
::
SMOOTHING_NONE
:
$str
=
$this
->
padEditString
(
$str
);
break
;
default
:
throw
new
Exception
(
pht
(
'Unknown smoothing type "%s".'
,
$smoothing
));
}
return
$str
;
}
private
function
padEditString
(
$str
)
{
if
(
$this
->
prefix
)
{
$str
=
str_repeat
(
's'
,
count
(
$this
->
prefix
)).
$str
;
}
if
(
$this
->
suffix
)
{
$str
=
$str
.
str_repeat
(
's'
,
count
(
$this
->
suffix
));
}
return
$str
;
}
private
function
getTypeMatrix
()
{
if
(!
$this
->
computeString
)
{
throw
new
PhutilInvalidStateException
(
'setComputeString'
);
}
if
(
$this
->
typeMatrix
===
null
)
{
$this
->
computeMatrix
(
$this
->
x
,
$this
->
y
);
}
return
$this
->
typeMatrix
;
}
private
function
getDistanceMatrix
()
{
if
(
$this
->
distanceMatrix
===
null
)
{
$this
->
computeMatrix
(
$this
->
x
,
$this
->
y
);
}
return
$this
->
distanceMatrix
;
}
private
function
computeMatrix
(
array
$x
,
array
$y
)
{
$xl
=
count
(
$x
);
$yl
=
count
(
$y
);
$m
=
array
();
$infinity
=
$this
->
getInfinity
();
$use_damerau
=
(
$this
->
transposeCost
!==
null
);
if
(
$use_damerau
)
{
// Initialize the default cost of a transpose.
$m
[-
1
][-
1
]
=
$infinity
;
// Initialize the character position dictionary for Damerau.
$d
=
array
();
foreach
(
$x
as
$c
)
{
$d
[
$c
]
=
-
1
;
}
foreach
(
$y
as
$c
)
{
$d
[
$c
]
=
-
1
;
}
// Initialize the transpose costs for Damerau.
for
(
$xx
=
0
;
$xx
<=
$xl
;
$xx
++)
{
$m
[
$xx
][-
1
]
=
$infinity
;
}
for
(
$yy
=
0
;
$yy
<=
$yl
;
$yy
++)
{
$m
[-
1
][
$yy
]
=
$infinity
;
}
}
// Initialize the top row of the matrix.
for
(
$xx
=
0
;
$xx
<=
$xl
;
$xx
++)
{
$m
[
$xx
][
0
]
=
$xx
*
$this
->
deleteCost
;
}
// Initialize the left column of the matrix.
$cost
=
0
;
for
(
$yy
=
0
;
$yy
<=
$yl
;
$yy
++)
{
$m
[
0
][
$yy
]
=
$yy
*
$this
->
insertCost
;
}
$use_types
=
(
$this
->
computeString
);
if
(
$use_types
)
{
$t
=
array
();
for
(
$xx
=
0
;
$xx
<=
$xl
;
$xx
++)
{
$t
[
$xx
][
0
]
=
'd'
;
}
for
(
$yy
=
0
;
$yy
<=
$yl
;
$yy
++)
{
$t
[
0
][
$yy
]
=
'i'
;
}
$t
[
0
][
0
]
=
's'
;
}
$alt_cost
=
$this
->
getAlterCost
();
if
(
$alt_cost
&&
!
$use_types
)
{
throw
new
Exception
(
pht
(
'If you provide an alter cost with %s, you must enable '
.
'type computation with %s.'
,
'setAlterCost()'
,
'setComputeStrings()'
));
}
// Build the edit distance matrix.
for
(
$xx
=
1
;
$xx
<=
$xl
;
$xx
++)
{
$last_dy
=
-
1
;
for
(
$yy
=
1
;
$yy
<=
$yl
;
$yy
++)
{
if
(
$use_damerau
)
{
$dx
=
$d
[
$y
[
$yy
-
1
]];
$dy
=
$last_dy
;
}
if
(
$x
[
$xx
-
1
]
===
$y
[
$yy
-
1
])
{
$rep_cost
=
$m
[
$xx
-
1
][
$yy
-
1
]
+
0
;
$rep_type
=
's'
;
}
else
{
$rep_cost
=
$m
[
$xx
-
1
][
$yy
-
1
]
+
$this
->
replaceCost
;
$rep_type
=
'x'
;
}
$del_cost
=
$m
[
$xx
-
1
][
$yy
]
+
$this
->
deleteCost
;
$ins_cost
=
$m
[
$xx
][
$yy
-
1
]
+
$this
->
insertCost
;
if
(
$alt_cost
)
{
$del_char
=
$t
[
$xx
-
1
][
$yy
];
$ins_char
=
$t
[
$xx
][
$yy
-
1
];
$rep_char
=
$t
[
$xx
-
1
][
$yy
-
1
];
if
(
$del_char
!==
'd'
)
{
$del_cost
+=
$alt_cost
;
}
if
(
$ins_char
!==
'i'
)
{
$ins_cost
+=
$alt_cost
;
}
if
(
$rep_char
!==
$rep_type
)
{
$rep_cost
+=
$alt_cost
;
}
}
if
(
$rep_cost
<=
$del_cost
&&
$rep_cost
<=
$ins_cost
)
{
$cost
=
$rep_cost
;
$type
=
$rep_type
;
if
(
$rep_type
===
's'
)
{
$last_dy
=
$yy
-
1
;
}
}
else
if
(
$ins_cost
<=
$del_cost
)
{
$cost
=
$ins_cost
;
$type
=
'i'
;
}
else
{
$cost
=
$del_cost
;
$type
=
'd'
;
}
if
(
$use_damerau
)
{
$trn_count
=
((
$xx
-
$dx
-
2
)
+
(
$yy
-
$dy
-
2
)
+
1
);
$trn_cost
=
$m
[
$dx
][
$dy
]
+
(
$trn_count
*
$this
->
transposeCost
);
if
(
$trn_cost
<
$cost
)
{
$cost
=
$trn_cost
;
$type
=
array
(
$dx
+
1
,
$dy
+
1
,
$type
);
}
}
$m
[
$xx
][
$yy
]
=
$cost
;
if
(
$use_types
)
{
$t
[
$xx
][
$yy
]
=
$type
;
}
}
if
(
$use_damerau
)
{
$d
[
$x
[
$xx
-
1
]]
=
(
$xx
-
1
);
}
}
$this
->
distanceMatrix
=
$m
;
if
(
$use_types
)
{
$this
->
typeMatrix
=
$t
;
}
}
private
function
getInfinity
()
{
return
INF
;
}
private
function
printMatrix
(
array
$m
)
{
$x
=
$this
->
x
;
$y
=
$this
->
y
;
$p
=
'% 8s '
;
printf
(
$p
,
' '
);
foreach
(
head
(
$m
)
as
$yk
=>
$yv
)
{
printf
(
$p
,
idx
(
$y
,
$yk
-
1
,
'-'
));
}
echo
"
\n
"
;
foreach
(
$m
as
$xk
=>
$xv
)
{
printf
(
$p
,
idx
(
$x
,
$xk
-
1
,
'-'
));
foreach
(
$xv
as
$yk
=>
$yv
)
{
printf
(
$p
,
(
$yv
==
$this
->
getInfinity
()
?
'inf'
:
$yv
));
}
echo
"
\n
"
;
}
}
private
function
printTypeMatrix
(
array
$t
)
{
$x
=
$this
->
x
;
$y
=
$this
->
y
;
$p
=
'% 8s '
;
printf
(
$p
,
' '
);
foreach
(
head
(
$t
)
as
$yk
=>
$yv
)
{
printf
(
$p
,
idx
(
$y
,
$yk
-
1
,
'-'
));
}
echo
"
\n
"
;
foreach
(
$t
as
$xk
=>
$xv
)
{
printf
(
$p
,
idx
(
$x
,
$xk
-
1
,
'-'
));
foreach
(
$xv
as
$yk
=>
$yv
)
{
printf
(
$p
,
(
$yv
==
$this
->
getInfinity
()
?
'inf'
:
$yv
));
}
echo
"
\n
"
;
}
}
private
function
applySmoothing
(
$str
,
$full
)
{
if
(
$full
)
{
$prefix
=
'(^|[xdi])'
;
$suffix
=
'([xdi]|
\z
)'
;
}
else
{
$prefix
=
'([xdi])'
;
$suffix
=
'([xdi])'
;
}
// Smooth the string out, by replacing short runs of similar characters
// with 'x' operations. This makes the result more readable to humans,
// since there are fewer choppy runs of short added and removed substrings.
do
{
$original
=
$str
;
$str
=
preg_replace
(
'/'
.
$prefix
.
'(s{3})'
.
$suffix
.
'/'
,
'$1xxx$3'
,
$str
);
$str
=
preg_replace
(
'/'
.
$prefix
.
'(s{2})'
.
$suffix
.
'/'
,
'$1xx$3'
,
$str
);
$str
=
preg_replace
(
'/'
.
$prefix
.
'(s{1})'
.
$suffix
.
'/'
,
'$1x$3'
,
$str
);
}
while
(
$str
!=
$original
);
return
$str
;
}
}
Event Timeline
Log In to Comment