極簡版Java敏感詞檢測開源工具-開源-CodeUp Hub

一、使用步驟

（1）引入 Maven 依賴

引入最新的版本即可，見附錄開源地址。

<dependency>
    <groupId>com.github.houbb</groupId>
    <artifactId>sensitive-word</artifactId>
    <version>0.18.0</version>
</dependency>

（2）核心方法使用範例

包含了主要的一些功能和方法，如下所示：

① 常規用法查詢替換；

② 指定替換字串；

③ 檢測忽略大小寫，特殊字元，重複字元，簡繁體，中英文等；

④ 自定義替換檢測策略示例；

package com.example.demo;

import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordReplace;
import com.github.houbb.sensitive.word.api.IWordResult;
import com.github.houbb.sensitive.word.bs.SensitiveWordBs;
import com.github.houbb.sensitive.word.core.SensitiveWordHelper;
import com.github.houbb.sensitive.word.support.result.WordResultHandlers;
import com.github.houbb.sensitive.word.utils.InnerWordCharUtils;

import java.util.List;

public class SensitiveWordTestDemo {

    public static void main(String[] args) {
        //testNormal();
        //testDefineReplace();
        //testSensitiveWordResultHandler();
        //testOtherFeatures();
        testMoreFeatures();
    }

    // 常規使用案例：替換敏感詞
    public static void testNormal() {
        final String text = "五星紅旗迎風飄揚，毛主席的畫像屹立在天安門前。";
        System.out.println("是否包含銘感詞：" + SensitiveWordHelper.contains(text));
        System.out.println("查詢第一個銘感詞：" + SensitiveWordHelper.findFirst(text));
        System.out.println("查詢所有銘感詞：" + SensitiveWordHelper.findAll(text));

        System.out.println("替換所有銘感詞：" + SensitiveWordHelper.replace(text));
        System.out.println("替換所有銘感詞(指定替換符號)：" + SensitiveWordHelper.replace(text, '⭐'));
    }

    /**
     * 此案例講解：IWordResultHandler 可以對敏感詞的結果進行處理，允許使用者自定義。
     */
    public static void testSensitiveWordResultHandler() {
        final String text = "五星紅旗迎風飄揚，毛主席的畫像屹立在天安門前。";

        List<String> wordList = SensitiveWordHelper.findAll(text);
        //Assert.assertEquals("[五星紅旗, 毛主席, 天安門]", wordList.toString());
        System.out.println("1.查詢到所有銘感詞：" + wordList);

        List<String> wordList2 = SensitiveWordHelper.findAll(text, WordResultHandlers.word());
        //Assert.assertEquals("[五星紅旗, 毛主席, 天安門]", wordList2.toString());
        System.out.println("2.預設內建處理(同直接查詢到所有敏感詞)：" + wordList2);

        List<IWordResult> wordList3 = SensitiveWordHelper.findAll(text, WordResultHandlers.raw());
        //Assert.assertEquals("[WordResult{startIndex=0, endIndex=4}, WordResult{startIndex=9, endIndex=12}, WordResult{startIndex=18, endIndex=21}]", wordList3.toString());
        System.out.println("3.查詢敏感詞單詞本身的起始位置到終止位置：" + wordList3);
    }

    // 例項：常規忽略檢測特性
    public static void testOtherFeatures() {
        System.out.println("\n其他屬性\n");
        String text = "fuCK the bad words.";
        String word = SensitiveWordHelper.findFirst(text);
        //Assert.assertEquals("fuCK", word);
        System.out.println("忽略大小寫：" + word);
        System.out.println("替換大小寫字元：" + SensitiveWordHelper.replace(text));

        text = "ｆｕｃｋ the bad words.";
        word = SensitiveWordHelper.findFirst(text);
        //Assert.assertEquals("ｆｕｃｋ", word);
        System.out.println("忽略半圓角：" + word);
        System.out.println("替換半圓角字元：" + SensitiveWordHelper.replace(text));

        text = "這個是我的微信：9⓿二肆⁹₈③⑸⒋➃㈤㊄";
        List<String> wordList = SensitiveWordBs.newInstance().enableNumCheck(true).init().findAll(text);
        //Assert.assertEquals("[9⓿二肆⁹₈③⑸⒋➃㈤㊄]", wordList.toString());
        System.out.println("忽略數字的寫法：" + wordList.toString());
        System.out.println("替換數字字元：" + SensitiveWordBs.newInstance().enableNumCheck(true).init().replace(text));

        text = "我愛我的祖國和五星紅旗。";
        List<String> wordList1 = SensitiveWordHelper.findAll(text);
        //Assert.assertEquals("[五星紅旗]", wordList1.toString());
        System.out.println("檢測敏感詞簡繁體格式是否存在：" + wordList1.toString());

        text = "Ⓕⓤc⒦ the bad words";
        List<String> wordList2 = SensitiveWordHelper.findAll(text);
        //Assert.assertEquals("[Ⓕⓤc⒦]", wordList2.toString());
        System.out.println("檢測敏感詞是否存在英文的書寫格式：" + wordList2.toString());

        text = "ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦ the bad words";
        List<String> wordList3 = SensitiveWordBs.newInstance()
                .ignoreRepeat(true)
                .init()
                .findAll(text);
        //Assert.assertEquals("[ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]", wordList3.toString());
        System.out.println("檢測重複詞每個字元是否重複：" + wordList3.toString());
    }

    // 例項：更多檢測特性
    public static void testMoreFeatures() {
        // 1.郵箱檢測(郵箱等個人資訊，預設未啟用。)
        String text = "樓主好人，郵箱 sensitiveword@xx.com";
        List<String> wordList = SensitiveWordBs.newInstance().enableEmailCheck(true).init().findAll(text);
        //Assert.assertEquals("[sensitiveword@xx.com]", wordList.toString());
        System.out.println("是否存在郵箱：" + wordList.toString());

        // 2.連續數字檢測(一般用於過濾手機號/QQ等廣告資訊，預設未啟用。)
        text = "你懂得：12345678";
        // 預設檢測 8 位
        List<String> wordList1 = SensitiveWordBs.newInstance()
                .enableNumCheck(true)
                .init().findAll(text);
        //Assert.assertEquals("[12345678]", wordList.toString());
        System.out.println("是否存在連續數字字串：" + wordList1);
        // 指定數字的長度，避免誤殺
        List<String> wordList2 = SensitiveWordBs.newInstance()
                .enableNumCheck(true)
                .numCheckLen(9)
                .init().findAll(text);
        //Assert.assertEquals("[]", wordList2.toString());
        System.out.println("是否存在連續數字字串2：" + wordList2.toString());

        // 3.網址檢測(用於過濾常見的網址資訊，預設未啟用, v0.18.0 最佳化 URL 檢測，更加嚴格，降低誤判率)
        text = "點選連結 https://www.baidu.com 檢視答案";
        SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance().enableUrlCheck(true).init();
        List<String> wordList3 = sensitiveWordBs.findAll(text);
        //Assert.assertEquals("[https://www.baidu.com]", wordList3.toString());
        //Assert.assertEquals("點選連結 ********************* 檢視答案", sensitiveWordBs.replace(text));
        System.out.println("是否存在網址資訊：" + wordList3.toString());
        System.out.println("是否存在網址資訊2並替換：" + sensitiveWordBs.replace(text));

        // 4.IPv4 檢測: 避免使用者透過 ip 繞過網址檢測等，預設未啟用。
        text = "個人網站，如果網址打不開可以訪問 127.0.0.1。";
        SensitiveWordBs sensitiveWordBs2 = SensitiveWordBs.newInstance().enableIpv4Check(true).init();
        List<String> wordList4 = sensitiveWordBs2.findAll(text);
        //Assert.assertEquals("[127.0.0.1]", wordList4.toString());
        System.out.println("是否存在 IPv4：" + wordList4.toString());
    }

    // 例項：自定義檢測替換策略
    public static void testDefineReplace() {
        System.out.println("自定義敏感詞替換策略：（策略：指定敏感詞替換）");
        final String text = "五星紅旗迎風飄揚，毛主席的畫像屹立在天安門前。";
        MySensitiveWordReplace replace = new MySensitiveWordReplace();
        String result = SensitiveWordHelper.replace(text, replace);
        System.out.println("自定義替換策略結果：" + result);
    }
}

class MySensitiveWordReplace implements IWordReplace {

    @Override
    public void replace(StringBuilder stringBuilder, char[] chars, IWordResult wordResult, IWordContext iWordContext) {
        String sensitiveWord = InnerWordCharUtils.getString(chars, wordResult);
        // 自定義不同的敏感詞替換策略，可以從資料庫等地方讀取
        if ("五星紅旗".equals(sensitiveWord)) {
            stringBuilder.append("國家旗幟");
        } else if ("毛主席".equals(sensitiveWord)) {
            stringBuilder.append("教員");
        } else {
            // 其他預設使用 * 代替
            int wordLength = wordResult.endIndex() - wordResult.startIndex();
            for (int i = 0; i < wordLength; i++) {
                stringBuilder.append('*');
            }
        }
    }
}

輸出結果展示:

是否包含銘感詞：true
查詢第一個銘感詞：五星紅旗
查詢所有銘感詞：[五星紅旗, 毛主席, 天安門]
替換所有銘感詞：****迎風飄揚，***的畫像屹立在***前。
替換所有銘感詞(指定替換符號)：⭐⭐⭐⭐迎風飄揚，⭐⭐⭐的畫像屹立在⭐⭐⭐前。
自定義敏感詞替換策略：（策略：指定敏感詞替換）
自定義替換策略結果：國家旗幟迎風飄揚，教員的畫像屹立在***前。
1.查詢到所有銘感詞：[五星紅旗, 毛主席, 天安門]
2.預設內建處理(同直接查詢到所有敏感詞)：[五星紅旗, 毛主席, 天安門]
3.查詢敏感詞單詞本身的起始位置到終止位置：[WordResult{startIndex=0, endIndex=4, type='WORD'}, WordResult{startIndex=9, endIndex=12, type='WORD'}, WordResult{startIndex=18, endIndex=21, type='WORD'}]

其他屬性

忽略大小寫：fuCK
替換大小寫字元：**** the bad words.
忽略半圓角：ｆｕｃｋ
替換半圓角字元：**** the bad words.
忽略數字的寫法：[9⓿二肆⁹₈③⑸⒋➃㈤㊄]
替換數字字元：這個是我的微信：************
檢測敏感詞簡繁體格式是否存在：[五星紅旗]
檢測敏感詞是否存在英文的書寫格式：[Ⓕⓤc⒦]
檢測重複詞每個字元是否重複：[ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]
是否存在郵箱：[sensitiveword@xx.com]
是否存在連續數字字串：[12345678]
是否存在連續數字字串2：[]
是否存在網址資訊：[https://www.baidu.com]
是否存在網址資訊2並替換：點選連結 ********************* 檢視答案
是否存在 IPv4：[127.0.0.1]

二、核心方法：查詢 / 替換

方法	引數	返回值	說明
contains(String)	待驗證的字串	布林值	驗證字串是否包含敏感詞
replace(String, IWordReplace)	使用指定的替換策略替換敏感詞	字串	返回脫敏後的字串
replace(String, char)	使用指定的 char 替換敏感詞	字串	返回脫敏後的字串
replace(String)	使用 `*` 替換敏感詞	字串	返回脫敏後的字串
findAll(String)	待驗證的字串	字串列表	返回字串中所有敏感詞
findFirst(String)	待驗證的字串	字串	返回字串中第一個敏感詞
findAll(String, IWordResultHandler)	IWordResultHandler 結果處理類	字串列表	返回字串中所有敏感詞
findFirst(String, IWordResultHandler)	IWordResultHandler 結果處理類	字串	返回字串中第一個敏感詞
tags(String)	敏感詞字串	標籤列表	返回敏感詞的標籤列表

三、更多的檢測策略（自定義）

（1）郵箱-網址-IPv4-連續數字檢測

對應的程式碼為上文的 testMoreFeatures() 方法；以下為完整示例的執行輸出，其中最後六行為本節檢測的結果：

是否包含銘感詞：true
查詢第一個銘感詞：五星紅旗
查詢所有銘感詞：[五星紅旗, 毛主席, 天安門]
替換所有銘感詞：****迎風飄揚，***的畫像屹立在***前。
替換所有銘感詞(指定替換符號)：⭐⭐⭐⭐迎風飄揚，⭐⭐⭐的畫像屹立在⭐⭐⭐前。
自定義敏感詞替換策略：（策略：指定敏感詞替換）
自定義替換策略結果：國家旗幟迎風飄揚，教員的畫像屹立在***前。
1.查詢到所有銘感詞：[五星紅旗, 毛主席, 天安門]
2.預設內建處理(同直接查詢到所有敏感詞)：[五星紅旗, 毛主席, 天安門]
3.查詢敏感詞單詞本身的起始位置到終止位置：[WordResult{startIndex=0, endIndex=4, type='WORD'}, WordResult{startIndex=9, endIndex=12, type='WORD'}, WordResult{startIndex=18, endIndex=21, type='WORD'}]

其他屬性

忽略大小寫：fuCK
替換大小寫字元：**** the bad words.
忽略半圓角：ｆｕｃｋ
替換半圓角字元：**** the bad words.
忽略數字的寫法：[9⓿二肆⁹₈③⑸⒋➃㈤㊄]
替換數字字元：這個是我的微信：************
檢測敏感詞簡繁體格式是否存在：[五星紅旗]
檢測敏感詞是否存在英文的書寫格式：[Ⓕⓤc⒦]
檢測重複詞每個字元是否重複：[ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]
是否存在郵箱：[sensitiveword@xx.com]
是否存在連續數字字串：[12345678]
是否存在連續數字字串2：[]
是否存在網址資訊：[https://www.baidu.com]
是否存在網址資訊2並替換：點選連結 ********************* 檢視答案
是否存在 IPv4：[127.0.0.1]

（2）常規檢測：大小寫-特殊字元-重複字元-簡繁體等

// 例項：常規忽略檢測特性
public static void testOtherFeatures() {
    System.out.println("\n其他屬性\n");
    String text = "fuCK the bad words.";
    String word = SensitiveWordHelper.findFirst(text);
    //Assert.assertEquals("fuCK", word);
    System.out.println("忽略大小寫：" + word);
    System.out.println("替換大小寫字元：" + SensitiveWordHelper.replace(text));

    text = "ｆｕｃｋ the bad words.";
    word = SensitiveWordHelper.findFirst(text);
    //Assert.assertEquals("ｆｕｃｋ", word);
    System.out.println("忽略半圓角：" + word);
    System.out.println("替換半圓角字元：" + SensitiveWordHelper.replace(text));

    text = "這個是我的微信：9⓿二肆⁹₈③⑸⒋➃㈤㊄";
    List<String> wordList = SensitiveWordBs.newInstance().enableNumCheck(true).init().findAll(text);
    //Assert.assertEquals("[9⓿二肆⁹₈③⑸⒋➃㈤㊄]", wordList.toString());
    System.out.println("忽略數字的寫法：" + wordList.toString());
    System.out.println("替換數字字元：" + SensitiveWordBs.newInstance().enableNumCheck(true).init().replace(text));

    text = "我愛我的祖國和五星紅旗。";
    List<String> wordList1 = SensitiveWordHelper.findAll(text);
    //Assert.assertEquals("[五星紅旗]", wordList1.toString());
    System.out.println("檢測敏感詞簡繁體格式是否存在：" + wordList1.toString());

    text = "Ⓕⓤc⒦ the bad words";
    List<String> wordList2 = SensitiveWordHelper.findAll(text);
    //Assert.assertEquals("[Ⓕⓤc⒦]", wordList2.toString());
    System.out.println("檢測敏感詞是否存在英文的書寫格式：" + wordList2.toString());

    text = "ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦ the bad words";
    List<String> wordList3 = SensitiveWordBs.newInstance()
        .ignoreRepeat(true)
        .init()
        .findAll(text);
    //Assert.assertEquals("[ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]", wordList3.toString());
    System.out.println("檢測重複詞每個字元是否重複：" + wordList3.toString());
}

（3）自定義檢測替換策略

自定義檢測替換

class MySensitiveWordReplace implements IWordReplace {

    @Override
    public void replace(StringBuilder stringBuilder, char[] chars, IWordResult wordResult, IWordContext iWordContext) {
        String sensitiveWord = InnerWordCharUtils.getString(chars, wordResult);
        // 自定義不同的敏感詞替換策略，可以從資料庫等地方讀取
        if ("五星紅旗".equals(sensitiveWord)) {
            stringBuilder.append("國家旗幟");
        } else if ("毛主席".equals(sensitiveWord)) {
            stringBuilder.append("教員");
        } else {
            // 其他預設使用 * 代替
            int wordLength = wordResult.endIndex() - wordResult.startIndex();
            for (int i = 0; i < wordLength; i++) {
                stringBuilder.append('*');
            }
        }
    }
}

使用範例：

// 例項：自定義檢測替換策略
public static void testDefineReplace() {
    System.out.println("自定義敏感詞替換策略：（策略：指定敏感詞替換）");
    final String text = "五星紅旗迎風飄揚，毛主席的畫像屹立在天安門前。";
    MySensitiveWordReplace replace = new MySensitiveWordReplace();
    String result = SensitiveWordHelper.replace(text, replace);
    System.out.println("自定義替換策略結果：" + result);
}