数据库内核月报－ 2018/04 - PgSQL · 应用案例 · 相似文本识别与去重 - 《数据库内核月报》

实操的例子
参考

1. 首先要解决如何判断内容的相似度。PostgreSQL 中可以使用中文分词插件，也可以使用 pg_trgm（将字符串切成多个不重复的 token，再据此计算两个字符串的相似度）。

对于本题，我建议采取中文分词的方式，首先将内容拆分成词组。

2. 在拆分成词组后，首先分组聚合，去除完全重复的数据。

3. 然后自关联生成笛卡尔积（矩阵），计算出每条记录和其他记录的相似度。相似度的算法很简单：两条记录重叠的 token 数量，除以两条记录的 token 合并去重后的数量，即 $\text{simulate} = |A \cap B| / |A \cup B|$。

4. 根据相似度，去除不需要的数据。

这里如果数据量非常庞大，使用专业的分析编程语言会更好，例如 PL/R。

首先要安装PostgreSQL 中文分词插件

(阿里云AliCloudDB PostgreSQL已包含这个插件，用法参考官方手册)


# Build and install the pg_jieba Chinese word-segmentation extension from source.
# $PGSRC is expected to point at the PostgreSQL source tree (it is not set here).
git clone https://github.com/jaiminpan/pg_jieba.git  
mv pg_jieba $PGSRC/contrib/  
# Presumably puts this installation's pg_config first on PATH for the build - confirm for your setup.
export PATH=/home/digoal/pgsql9.5/bin:$PATH  
cd $PGSRC/contrib/pg_jieba  
make clean;make;make install  

# Build and install the pg_scws extension the same way.
git clone https://github.com/jaiminpan/pg_scws.git  
# Move the directory that was just cloned (pg_scws, not pg_jieba) so the cd below succeeds.
mv pg_scws $PGSRC/contrib/  
export PATH=/home/digoal/pgsql9.5/bin:$PATH  
cd $PGSRC/contrib/pg_scws  
make clean;make;make install

创建插件

psql  
-- Register the tokenizer extensions in the current database (statements are typed at the psql prompt).
-- The article states that either of the two tokenizers may be used; the queries shown later
-- pass 'jiebacfg' to to_tsvector.
# create extension pg_jieba;  
# create extension pg_scws;

创建测试CASE

-- Test fixture: four near-duplicate questions, keyed by id.
create table tdup1 (
  id   int primary key,
  info text
);
-- pg_trgm is the trigram-similarity extension mentioned in the introduction;
-- the queries shown in this article do not call it.
create extension pg_trgm;
insert into tdup1 (id, info) values
  (1, '银屑病怎么治？'),
  (2, '银屑病怎么治疗？'),
  (3, '银屑病怎么治疗好？'),
  (4, '银屑病怎么能治疗好？');

这两种分词插件，可以任选一种。

创建三个函数，

计算2个数组的并集（去重后的并集）

postgres=# create or replace function array_union(text[], text[]) returns text[] as $$  
  -- Union of two text arrays with duplicate elements removed.
  -- The element order of the result is not defined.
  -- Returns NULL when both arrays are empty (array_agg over zero rows) and,
  -- because the function is STRICT, when either argument is NULL.
  select array_agg(elem)
  from (
    select distinct elem
    from unnest($1 || $2) as u(elem)
  ) as merged;
$$ language sql strict;  
CREATE FUNCTION

数组去重

postgres=# create or replace function array_dist(text[]) returns text[] as $$  
  -- Remove duplicate elements from a text array; result order is not defined.
  -- Returns NULL for an empty array (array_agg over zero rows) and,
  -- because the function is STRICT, for a NULL argument.
  select array_agg(elem)
  from (
    select distinct elem
    from unnest($1) as u(elem)
  ) as uniq;
$$ language sql strict;  
CREATE FUNCTION

计算两个数组的重叠部分（去重后的重叠部分）

postgres=# create or replace function array_share(text[], text[]) returns text[] as $$  
  -- Elements that occur in both arrays, each listed once (INTERSECT removes duplicates).
  -- The element order of the result is not defined.
  -- Returns NULL when the arrays have no element in common (array_agg over zero rows),
  -- so array_length(array_share(a, b), 1) is NULL for non-overlapping inputs.
  -- STRICT: returns NULL when either argument is NULL.
  select array_agg(elem)
  from (
    select elem from unnest($1) as a(elem)
    intersect
    select elem from unnest($2) as b(elem)
  ) as common;
$$ language sql strict;  
CREATE FUNCTION

笛卡尔结果是这样的：

regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:\d+)', '', 'g')),' ') 用于将info转换成数组。

以上生成的实际上是一个矩阵,simulate就是矩阵中我们需要计算的相似度：

我们在去重计算时不需要所有的笛卡尔积，只需要这个矩阵对角线的上部分或下部分数据即可。

所以加个条件就能完成。


postgres=# with t(c1,c2,c3) as (
  -- c1 = id, c2 = original text, c3 = de-duplicated token array of that text.
  -- to_tsvector(...)::text renders each lexeme as 'token':positions, where positions can be a
  -- comma-separated list with optional A-D weight letters (e.g. 'x':2,11). The pattern removes
  -- the whole list; stripping only ':N' would leave tokens such as 'x',11 that never match
  -- the same word in another row.
  select id,
         info,
         array_dist(regexp_split_to_array(regexp_replace(to_tsvector('jiebacfg',info)::text, ':\d+[A-D]?(,\d+[A-D]?)*', '', 'g'), ' '))
  from tdup1
)
select *
from (
  select t1.c1 as t1c1,
         t2.c1 as t2c1,
         t1.c2 as t1c2,
         t2.c2 as t2c2,
         t1.c3 as t1c3,
         t2.c3 as t2c3,
         -- similarity = number of shared tokens / number of distinct tokens across both rows
         round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2) as simulate
  from t as t1
  inner join t as t2
    on t1.c1 < t2.c1  -- strict "<" keeps each unordered pair once and already excludes self-pairs
) pairs;
 t1c1 | t2c1 |        t1c2        |         t2c2         |       t1c3        |       t2c3        | simulate   
------+------+--------------------+----------------------+-------------------+-------------------+----------  
    1 |    2 | 银屑病怎么治？     | 银屑病怎么治疗？     | {'银屑病','治'}   | {'银屑病','治疗'} |     0.33  
    1 |    3 | 银屑病怎么治？     | 银屑病怎么治疗好？   | {'银屑病','治'}   | {'银屑病','治疗'} |     0.33  
    1 |    4 | 银屑病怎么治？     | 银屑病怎么能治疗好？ | {'银屑病','治'}   | {'银屑病','治疗'} |     0.33  
    2 |    3 | 银屑病怎么治疗？   | 银屑病怎么治疗好？   | {'银屑病','治疗'} | {'银屑病','治疗'} |     1.00  
    2 |    4 | 银屑病怎么治疗？   | 银屑病怎么能治疗好？ | {'银屑病','治疗'} | {'银屑病','治疗'} |     1.00  
    3 |    4 | 银屑病怎么治疗好？ | 银屑病怎么能治疗好？ | {'银屑病','治疗'} | {'银屑病','治疗'} |     1.00  
(6 rows)

开始对这些数据去重，去重的第一步，明确simulate, 例如相似度大于0.5的，需要去重。

postgres=# with t(c1,c2,c3) as (
  -- c1 = id, c2 = original text, c3 = de-duplicated token array of that text.
  -- to_tsvector(...)::text renders each lexeme as 'token':positions, where positions can be a
  -- comma-separated list with optional A-D weight letters (e.g. 'x':2,11). The pattern removes
  -- the whole list; stripping only ':N' would leave tokens such as 'x',11 that never match
  -- the same word in another row.
  select id,
         info,
         array_dist(regexp_split_to_array(regexp_replace(to_tsvector('jiebacfg',info)::text, ':\d+[A-D]?(,\d+[A-D]?)*', '', 'g'), ' '))
  from tdup1
)
select *
from (
  select t1.c1 as t1c1,
         t2.c1 as t2c1,
         t1.c2 as t1c2,
         t2.c2 as t2c2,
         t1.c3 as t1c3,
         t2.c3 as t2c3,
         -- similarity = number of shared tokens / number of distinct tokens across both rows
         round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2) as simulate
  from t as t1
  inner join t as t2
    on t1.c1 < t2.c1  -- strict "<" keeps each unordered pair once and already excludes self-pairs
) pairs
-- Pairs with no shared token have a NULL similarity and are filtered out here as well.
where simulate>0.5;
 t1c1 | t2c1 |        t1c2        |         t2c2         |       t1c3        |       t2c3        | simulate   
------+------+--------------------+----------------------+-------------------+-------------------+----------  
    2 |    3 | 银屑病怎么治疗？   | 银屑病怎么治疗好？   | {'银屑病','治疗'} | {'银屑病','治疗'} |     1.00  
    2 |    4 | 银屑病怎么治疗？   | 银屑病怎么能治疗好？ | {'银屑病','治疗'} | {'银屑病','治疗'} |     1.00  
    3 |    4 | 银屑病怎么治疗好？ | 银屑病怎么能治疗好？ | {'银屑病','治疗'} | {'银屑病','治疗'} |     1.00  
(3 rows)

去重第二步，将t2c1列的ID对应的记录删掉即可。

-- Delete every row for which some row with a smaller id is more than 50% similar to it
-- (i.e. the t2c1 side of each similar pair).
delete from tdup1
where id in (
  with t(c1,c3) as (
    -- c1 = id, c3 = de-duplicated token array of the text.
    -- The pattern strips the complete tsvector position list (':1', ':2,11', optional A-D
    -- weight letters); stripping only ':N' would leave tokens such as 'x',11 for repeated words.
    select id,
           array_dist(regexp_split_to_array(regexp_replace(to_tsvector('jiebacfg',info)::text, ':\d+[A-D]?(,\d+[A-D]?)*', '', 'g'), ' '))
    from tdup1
  )
  select t2.c1
  from t as t1
  inner join t as t2
    on t1.c1 < t2.c1  -- each unordered pair once; t2 is always the larger id
  -- similarity = shared tokens / distinct tokens across both rows, rounded to 2 decimals
  where round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2) > 0.5
);  
例如 :   
-- Add a second group of near-duplicate rows (ids 11, 22, 13, 24) next to the first group.
postgres=# insert into tdup1 values (11, '白血病怎么治？');  
INSERT 0 1  
postgres=# insert into tdup1 values (22, '白血病怎么治疗？');  
INSERT 0 1  
postgres=# insert into tdup1 values (13, '白血病怎么治疗好？');  
INSERT 0 1  
postgres=# insert into tdup1 values (24, '白血病怎么能治疗好？');  
INSERT 0 1  
postgres=#   
postgres=# with t(c1,c2,c3) as (
  -- c1 = id, c2 = original text, c3 = de-duplicated token array of that text.
  -- to_tsvector(...)::text renders each lexeme as 'token':positions, where positions can be a
  -- comma-separated list with optional A-D weight letters (e.g. 'x':2,11). The pattern removes
  -- the whole list; stripping only ':N' would leave tokens such as 'x',11 that never match
  -- the same word in another row.
  select id,
         info,
         array_dist(regexp_split_to_array(regexp_replace(to_tsvector('jiebacfg',info)::text, ':\d+[A-D]?(,\d+[A-D]?)*', '', 'g'), ' '))
  from tdup1
)
select *
from (
  select t1.c1 as t1c1,
         t2.c1 as t2c1,
         t1.c2 as t1c2,
         t2.c2 as t2c2,
         t1.c3 as t1c3,
         t2.c3 as t2c3,
         -- similarity = number of shared tokens / number of distinct tokens across both rows
         round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2) as simulate
  from t as t1
  inner join t as t2
    on t1.c1 < t2.c1  -- strict "<" keeps each unordered pair once and already excludes self-pairs
) pairs
-- Pairs with no shared token have a NULL similarity and are filtered out here as well.
where simulate>0.5;  
 t1c1 | t2c1 |        t1c2        |         t2c2         |       t1c3        |       t2c3        | simulate   
------+------+--------------------+----------------------+-------------------+-------------------+----------  
    2 |    3 | 银屑病怎么治疗？   | 银屑病怎么治疗好？   | {'银屑病','治疗'} | {'银屑病','治疗'} |     1.00  
    2 |    4 | 银屑病怎么治疗？   | 银屑病怎么能治疗好？ | {'银屑病','治疗'} | {'银屑病','治疗'} |     1.00  
    3 |    4 | 银屑病怎么治疗好？ | 银屑病怎么能治疗好？ | {'银屑病','治疗'} | {'银屑病','治疗'} |     1.00  
   22 |   24 | 白血病怎么治疗？   | 白血病怎么能治疗好？ | {'治疗','白血病'} | {'治疗','白血病'} |     1.00  
   13 |   22 | 白血病怎么治疗好？ | 白血病怎么治疗？     | {'治疗','白血病'} | {'治疗','白血病'} |     1.00  
   13 |   24 | 白血病怎么治疗好？ | 白血病怎么能治疗好？ | {'治疗','白血病'} | {'治疗','白血病'} |     1.00  
(6 rows)  
postgres=# begin;  
BEGIN  
postgres=# delete from tdup1 where id in (  
postgres(#   -- Removes every row for which some row with a smaller id is more than 50% similar to it.
postgres(#   with t(c1,c3) as (
postgres(#     -- c1 = id, c3 = de-duplicated token array of the text.
postgres(#     -- The pattern strips the complete tsvector position list (':1', ':2,11', optional A-D
postgres(#     -- weight letters); stripping only ':N' would leave tokens such as 'x',11 for repeated words.
postgres(#     select id,
postgres(#            array_dist(regexp_split_to_array(regexp_replace(to_tsvector('jiebacfg',info)::text, ':\d+[A-D]?(,\d+[A-D]?)*', '', 'g'), ' '))
postgres(#     from tdup1
postgres(#   )
postgres(#   select t2.c1
postgres(#   from t as t1
postgres(#   inner join t as t2
postgres(#     on t1.c1 < t2.c1  -- each unordered pair once; t2 is always the larger id
postgres(#   -- similarity = shared tokens / distinct tokens across both rows, rounded to 2 decimals
postgres(#   where round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2) > 0.5
postgres(# );  
DELETE 4  
postgres=# select * from tdup1 ;  
 id |        info          
----+--------------------  
  1 | 银屑病怎么治？  
  2 | 银屑病怎么治疗？  
 11 | 白血病怎么治？