Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import java.util.Comparator;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;

/**
* 系统参数应用服务
Expand All @@ -25,6 +26,7 @@
public class SysParamApplicationService {
private final SysParamRepository sysParamRepository;
private final RedisClient redisClient;
private final AtomicBoolean redisEnable = new AtomicBoolean(true);

/**
* 列表查询系统参数
Expand Down Expand Up @@ -59,17 +61,18 @@ public void deleteParamById(String paramKey) {
}

public String getParamByKey(String paramId) {
boolean redisEnable = false;
String value = null;
try {
value = redisClient.getParamWithThrow(paramId);
redisEnable = true;
} catch (Exception e) {
log.warn(e.getMessage());
if (redisEnable.get()) {
try {
value = redisClient.getParamWithThrow(paramId);
} catch (Exception e) {
redisEnable.set(false);
log.warn(e.getMessage());
}
}
if (value == null) {
SysParam sysParam = sysParamRepository.getById(paramId);
if (sysParam != null && redisEnable) {
if (sysParam != null) {
value = sysParam.getParamValue();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import {
InputNumber,
Slider,
Space,
Switch,
} from "antd";
import { ConfigI, OperatorI } from "@/pages/OperatorMarket/operator.model";

Expand Down Expand Up @@ -215,12 +216,12 @@ const ParamConfig: React.FC<ParamConfigProps> = ({
tooltip={param.description}
key={paramKey}
>
<Checkbox
checked={value as boolean}
onChange={(e) => updateValue(e.target.checked)}
>
{param.name}
</Checkbox>
<Switch
checkedChildren={param.checkedLabel}
unCheckedChildren={param.unCheckedLabel}
defaultChecked={param.defaultVal === 'true'}
onChange={(checked) => updateValue(checked)}
/>
</Form.Item>
);
case "multiple":
Expand Down
4 changes: 2 additions & 2 deletions runtime/ops/filter/img_similar_images_cleaner/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ def get_orb_similarity(self, des_matrix: np.ndarray, des_matrix_history: np.ndar
orb_similarity = count / len(matches)
return orb_similarity
except Exception as e:
logger.exception(f"taskId: self.task_uuid, failed to compare the similarity between "
f"file_name and file_name_history: {e}")
logger.exception(f"taskId: {self.task_uuid}, failed to compare the similarity between "
f"{file_name} and {file_name_history}: {e}")
return 0.0

def execute_sql(self, p_hash: str, des_matrix: np.ndarray, file_name: str,
Expand Down
9 changes: 9 additions & 0 deletions runtime/ops/mapper/html_tag_cleaner/metadata.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,12 @@ effect:
after: '机器学习是人工智能的一个分支。'
inputs: 'text'
outputs: 'text'
settings:
removeTableTags:
name: '是否去除表格标签'
description: '若为是,则会去除表格标签<tr><td>等。'
type: 'switch'
defaultVal: 'false'
required: false
checkedLabel: '是'
unCheckedLabel: '否'
12 changes: 10 additions & 2 deletions runtime/ops/mapper/html_tag_cleaner/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,14 @@ class HtmlTagCleaner(Mapper):
'<sup>', '<template>', '<textarea>', '<tfoot>', '<thead>', '<time>', '<title>', '<track>', '<tt>', '<u>',
'<ul>', '<var>', '<video>', '<wbr>', '<xmp>'
]
# 需要添加的表格标签
table_tags = ['<table>', '<tbody>', '<td>', '<th>', '<tr>']
preserved_attr_list = ['colspan', 'rowspan'] # 需要保留的标签属性列表

def __init__(self, *args, **kwargs):
    """Initialise the cleaner and read the ``removeTableTags`` switch.

    Args:
        *args: Forwarded unchanged to the parent initialiser.
        **kwargs: Forwarded unchanged to the parent initialiser. The optional
            ``removeTableTags`` entry selects whether table tags are removed
            as well; it defaults to ``False`` when absent.
    """
    super().__init__(*args, **kwargs)
    raw_flag = kwargs.get('removeTableTags', False)
    if isinstance(raw_flag, str):
        # A plain bool() cast treats every non-empty string as True, so a
        # string-encoded 'false' would wrongly enable table-tag removal.
        # Parse the text explicitly instead.
        self.remove_table_tags = raw_flag.strip().lower() in ('true', '1', 'yes', 'on')
    else:
        self.remove_table_tags = bool(raw_flag)

@staticmethod
def _remove_specified_tags(input_data: str, specified_tags: List):
"""移除指定html标签及其属性值"""
Expand Down Expand Up @@ -68,13 +74,15 @@ def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
if sample[self.filetype_key] != "xml":
sample[self.text_key] = self._remove_html_tags(sample[self.text_key])
logger.info(
f"fileName: sample[self.filename_key], method: HtmlTagCleaner costs {time.time() - start:6f} s")
f"fileName: {sample[self.filename_key]}, method: HtmlTagCleaner costs {time.time() - start:6f} s")
else:
logger.info(f"fileName: {sample[self.filename_key]}, method: HtmlTagCleaner, The file is xml!")
return sample

def _remove_html_tags(self, input_data: str):
# 去除常见的html标签及其属性值(不包括<table>、<tbody>、<tr>、<td>、<th>)
# 去除常见的html标签及其属性值
if self.remove_table_tags:
self.tag_list.extend(self.table_tags)
cleaned_text = self._remove_specified_tags(input_data, self.tag_list)
# 去除表格标签内的属性值(不包括colspan、rowspan属性),eg:<td class="td8" rowspan="3"> —> <td rowspan="3">
cleaned_text = self._remove_tag_attributes(cleaned_text, self.preserved_attr_list)
Expand Down
4 changes: 2 additions & 2 deletions runtime/ops/mapper/img_direction_correct/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def _detect_direction(image, file_name, model):
rotate_angle = int(cls_res.get("class_ids", np.array([0], dtype='int32')).item())
pro = float(cls_res.get("scores", np.array([0], dtype='int32')).item())
logger.info(
f"fileName: file_name, model model.model_name detect result is {rotate_angle} with confidence pro")
f"fileName: {file_name}, model {model.model_name} detect result is {rotate_angle} with confidence {pro}")
if rotate_angle == 90 and pro > 0.89:
return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
if rotate_angle == 180 and pro > 0.89:
Expand Down Expand Up @@ -107,7 +107,7 @@ def execute(self, sample: Dict[str, Any]):
data = bytes_transform.bytes_to_numpy(img_bytes)
correct_data = self._img_direction_correct(data, file_name, self.model)
sample[self.data_key] = bytes_transform.numpy_to_bytes(correct_data, file_type)
logger.info(f"fileName: file_name, method: ImgDirectionCorrect costs {time.time() - start:6f} s")
logger.info(f"fileName: {file_name}, method: ImgDirectionCorrect costs {time.time() - start:6f} s")
return sample

def _img_direction_correct(self, img, file_name, standard_model):
Expand Down
4 changes: 2 additions & 2 deletions runtime/ops/mapper/img_enhanced_saturation/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def enhance_saturation(self, image_data: np.ndarray, file_name):

# 图片饱和度较高,不需要增强饱和度
if saturation_factor <= 1:
logger.info(f"fileName: file_name, method: ImgSaturation not need enhancement")
logger.info(f"fileName: {file_name}, method: ImgSaturation not need enhancement")
return image_data

# 计算图片红色通道均值, 如果过大,需要限制saturation factor大小,否则图片会泛红, 产生色彩畸变。
Expand All @@ -78,5 +78,5 @@ def execute(self, sample: Dict[str, Any]):
img_data = bytes_transform.bytes_to_numpy(img_bytes)
img_data = self.enhance_saturation(img_data, file_name)
sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
logger.info(f"fileName: file_name, method: ImgSaturation costs {time.time() - start:6f} s")
logger.info(f"fileName: {file_name}, method: ImgSaturation costs {time.time() - start:6f} s")
return sample
4 changes: 2 additions & 2 deletions runtime/ops/mapper/xml_tag_cleaner/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
if sample[self.filetype_key] == "xml":
try:
sample[self.text_key] = self._tag_clean_xml(sample[self.text_key])
logger.info(f"fileName: file_name, method: XMLTagCleaner costs {time.time() - start:6f} s")
logger.info(f"fileName: {file_name}, method: XMLTagCleaner costs {time.time() - start:6f} s")
except ExpatError as err:
logger.error(f"fileName: {file_name} is abnormal xml form: err")
logger.error(f"fileName: {file_name} is abnormal xml form: {err}")
raise RuntimeError(81001, str(err)) from None
except Exception as err:
logger.error(f"fileName {file_name}, method: XMLTagCleaner causes other error: {err}")
Expand Down
4 changes: 2 additions & 2 deletions scripts/db/data-operator-init.sql
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ ON CONFLICT DO NOTHING;

INSERT INTO t_operator
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, '{"exportType":{"name":"导出类型","description":"指定清洗结果文件类型。若指定为md且后续存在其他清洗算子,可能导致文件格式错乱。","type":"select","defaultVal":"markdown","required":false,"options":[{"label":"markdown","value":"md"},{"label":"txt","value":"txt"}]}}', '', false),
('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
('FileWithHighSpecialCharRateFilter', '文档特殊字符率检查', '去除特殊字符过多的文档。', '1.0.0', 'text', 'text', null, '{"specialCharRatio": {"name": "文档特殊字符率", "description": "特殊字符的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.3, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
Expand All @@ -148,7 +148,7 @@ VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取P
('ExtraSpaceCleaner', '多余空格去除', '移除文档首尾、句中或标点符号附近多余空格和 tab 等。', '1.0.0', 'text', 'text', null, null, '', 'false'),
('FullWidthCharacterCleaner', '全角转半角', '将文档中的所有全角字符转换成半角字符。', '1.0.0', 'text', 'text', null, null, '', 'false'),
('GrableCharactersCleaner', '文档乱码去除', '去除文档中的乱码和无意义的unicode。', '1.0.0', 'text', 'text', null, null, '', 'false'),
('HtmlTagCleaner', 'HTML标签去除', '移除文档中HTML标签,如 <html>、<dev>、<p> 等。', '1.0.0', 'text', 'text', null, null, '', 'false'),
('HtmlTagCleaner', 'HTML标签去除', '移除文档中HTML标签,如 <html>、<dev>、<p> 等。', '1.0.0', 'text', 'text', null, '{"removeTableTags":{"name":"是否去除表格标签","description":"若为是,则会去除表格标签<tr><td>等。","type":"switch","defaultVal":"false","required":false,"checkedLabel":"是","unCheckedLabel":"否"}}', '', 'false'),
('AnonymizedIdNumber', '身份证号匿名化', '身份证号匿名化。', '1.0.0', 'text', 'text', null, null, '', 'false'),
('InvisibleCharactersCleaner', '不可见字符去除', '去除文档中的不可见字符,例如 0-31 号字符中的部分字符。', '1.0.0', 'text', 'text', null, null, '', 'false'),
('AnonymizedIpAddress', 'IP地址匿名化', 'IP地址匿名化', '1.0.0', 'text', 'text', null, null, '', 'false'),
Expand Down
Loading