优化 ai_auto_response 模型侧定向辱骂响应策略

- 增加 directed abuse 场景识别,只作为模型输入信号,不做本地硬编码回复
- 在触发与规划层为定向挑衅场景单独标记,并强制走 social_short 短回复模式
- 将 abuse_directed 信号写入 Dify control,帮助模型稳定识别被点名挑衅场景
- 优化 Dify 主提示词与保守降级提示词,要求 abuse_directed 时默认短回且不要空掉
- 保持回复仍由模型生成,避免本地模板化回复暴露机器人痕迹
This commit is contained in:
liuwei
2026-04-24 14:44:33 +08:00
parent 058a7aec80
commit f593f5dd90
5 changed files with 82 additions and 0 deletions

View File

@@ -30,6 +30,30 @@ CODING_WORK_PATTERNS = [
r"(?i)\bimplement\b",
]
# Regex patterns for insulting / provocative words. is_directed_abuse() runs
# re.search() with each pattern against the message text; a hit on any one of
# them means the message contains abusive wording.
# NOTE(review): the inline (?i) flag has no practical effect on these
# CJK-only patterns (the characters have no case variants); it looks like it
# was kept for consistency with the other pattern lists — confirm.
DIRECTED_ABUSE_PATTERNS = [
r"(?i)傻子",
r"(?i)傻逼",
r"(?i)煞笔",
r"(?i)蠢货",
r"(?i)智障",
r"(?i)脑残",
r"(?i)废物",
r"(?i)有病",
r"(?i)滚蛋",
r"(?i)去死",
r"(?i)弱智",
]
# Regex patterns suggesting that a message is aimed at the bot: second-person
# pronouns plus a set of names (presumably the bot's persona names / nicknames
# — verify against the deployment config). is_directed_abuse() falls back to
# these when the caller has not already flagged the message as directed.
#
# The pronoun patterns deliberately do NOT use ``\b``. In Python ``str``
# patterns ``\b`` is a boundary between ``\w`` and ``\W`` characters, and CJK
# characters are ``\w``. Chinese text has no spaces between words, so
# ``\b你\b`` only matched when the pronoun was surrounded by whitespace or
# punctuation and missed ordinary sentences such as "你是傻子".
DIRECTED_TARGET_PATTERNS = [
    r"(?i)你",
    r"(?i)您",
    r"(?i)小牛",
    r"(?i)于谦",
    r"(?i)谦哥",
    r"(?i)林志玲",
    r"(?i)志玲",
]
def strip_at_prefix(content: str) -> str:
    """Return *content* with ``@mention`` segments removed and edges trimmed.

    A mention is an ``@`` followed lazily by any characters (not newlines) up
    to and including the next run of whitespace / U+2005 characters. Every
    such segment in the text is removed, not only one at the start. Falsy
    input (``None``, ``""``) is treated as an empty string.
    """
    raw_text = str(content or "")
    without_mentions = re.sub(r"@.*?[\u2005\s]+", "", raw_text)
    return without_mentions.strip()
@@ -59,6 +83,21 @@ def is_coding_work_request(content: str) -> bool:
return any(re.search(pattern, text) for pattern in CODING_WORK_PATTERNS)
def is_directed_abuse(content: str, directed: bool = False) -> bool:
    """Report whether *content* is an insult aimed at the bot.

    This is not content moderation in the general sense. It recognises one
    product scenario only: the bot has been explicitly addressed and the
    message carries insulting or provocative wording. The resulting flag is
    meant as a hint that helps the model choose a sensible response strategy;
    no reply is hard-coded locally on the basis of it.

    Args:
        content: Message text; falsy values are treated as empty.
        directed: True when the caller already knows the message is addressed
            to the bot, in which case abusive wording alone is sufficient.

    Returns:
        True when the text matches an abuse pattern and is either flagged as
        directed or also matches a target pattern; False otherwise.
    """
    message = str(content or "").strip()
    if not message:
        return False

    def _matches_any(patterns) -> bool:
        # True as soon as one regex in *patterns* is found in the message.
        for candidate in patterns:
            if re.search(candidate, message):
                return True
        return False

    if not _matches_any(DIRECTED_ABUSE_PATTERNS):
        return False
    if directed:
        return True
    # Not flagged by the caller: require an explicit target in the text.
    return _matches_any(DIRECTED_TARGET_PATTERNS)
def is_targeting_other_user(message: Dict[str, Any]) -> bool:
if message.get("is_at", False):
return False