Chinese Breaking News Script/ English Breaking News Script
QuickNews - Real-time News with Impact QuickNews LIVE
AI 언어 복원 프로젝트 · 만주어
청나라를 통치한 민족의 언어. 한때 수백만이 사용했으나
이제 10명
미만의 화자만이 남아있습니다.
복원 파이프라인
처리 흐름 시각화
인터랙티브 사전
형태소 분석기
구현 코드
# ─── Manchu OCR module ───────────────────────────────────────────────
import cv2
import numpy as np
import pytesseract
from pathlib import Path


class ManchuOCR:
    """Tesseract-based OCR front end that turns a scanned page into romanized Manchu text."""

    # Fixed OCR misreadings and the romanized form each one is replaced with.
    _CORRECTIONS = {
        "gvrun": "gurun",  # country
        "amba1": "amba",   # big
        "han9": "han",     # emperor
    }

    def __init__(
        self,
        model_path: str = "manchu_tessdata",
        tesseract_cmd: str = "/usr/bin/tesseract",
    ):
        """Build the Tesseract config string and point pytesseract at the binary.

        Args:
            model_path: Directory passed to Tesseract as ``--tessdata-dir``.
            tesseract_cmd: Path of the Tesseract executable. Note that this is
                written to a pytesseract module-level setting, so it affects
                every pytesseract call in the process.
        """
        # The directory is double-quoted so that a path containing spaces is
        # still passed to Tesseract as a single argument.
        self.config = f'--oem 3 --psm 6 --tessdata-dir "{model_path}"'
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    def preprocess(self, img_path: str) -> np.ndarray:
        """Load an image as grayscale, binarize it and apply a morphological close.

        Args:
            img_path: Path of the image file to load.

        Returns:
            The cleaned binary image.

        Raises:
            OSError: If OpenCV cannot read the image at ``img_path``.
        """
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the path rather than deep inside adaptiveThreshold.
            raise OSError(f"Could not read image: {img_path}")

        # Adaptive binarization (copes with uneven lighting on old documents).
        binary = cv2.adaptiveThreshold(
            img,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            11,
            2,
        )

        # Morphological noise removal with a 2x2 kernel.
        kernel = np.ones((2, 2), np.uint8)
        return cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

    def extract_contours(self, img: np.ndarray) -> list:
        """Return the external contours of ``img`` ordered from top to bottom.

        Args:
            img: Binary image, e.g. the output of :meth:`preprocess`.

        Returns:
            Contours sorted by the y coordinate of their bounding rectangle.
        """
        # findContours returns (contours, hierarchy) on OpenCV 2.x/4.x but
        # (image, contours, hierarchy) on 3.x; [-2] selects contours on all.
        contours = cv2.findContours(
            img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )[-2]
        # Vertical Manchu script: order top -> bottom.
        # NOTE(review): only the y coordinate is used, so glyphs from different
        # columns interleave — confirm this is intended for multi-column pages.
        return sorted(contours, key=lambda c: cv2.boundingRect(c)[1])

    def recognize(self, img_path: str) -> str:
        """Run the full pipeline: image file -> romanized Manchu text.

        Args:
            img_path: Path of the image file to recognize.

        Returns:
            The post-processed OCR text.
        """
        processed = self.preprocess(img_path)
        text = pytesseract.image_to_string(
            processed, lang="manchu", config=self.config
        )
        return self._postprocess(text)

    def _postprocess(self, raw: str) -> str:
        """Apply the fixed correction table and strip surrounding whitespace."""
        for wrong, right in self._CORRECTIONS.items():
            raw = raw.replace(wrong, right)
        return raw.strip()
# ─── Manchu morphological analysis (agglutinative-language handling) ───
import re
from dataclasses import dataclass
from typing import List, Tuple


@dataclass
class Morpheme:
    """One segment of an analyzed token together with its gloss."""

    form: str
    type: str  # root | suffix | particle | verb | unknown (analyze() emits root/suffix/unknown)
    meaning: str
    pos: str  # part-of-speech / grammatical tag


# Manchu affix dictionary (the core of the agglutinative analysis).
# Keys carry a leading "-"; values are (Korean gloss, grammatical tag).
SUFFIXES = {
    "-mbi": ("현재형 동사 어미", "VERB.PRES"),
    "-ha": ("완료형", "VERB.PERF"),
    "-me": ("연결형", "CONV"),
    "-ngge": ("명사형", "NMLZ"),
    "-i": ("속격 조사", "GEN"),
    "-be": ("대격 조사", "ACC"),
    "-de": ("여격/처격", "DAT/LOC"),
    "-ci": ("탈격", "ABL"),
}

# Root lexicon: romanized root -> (Korean gloss, part of speech).
ROOT_DICT = {
    "gurun": ("나라, 국가", "NOUN"),
    "niyalma": ("사람", "NOUN"),
    "han": ("황제, 칸", "NOUN"),
    "amba": ("크다, 위대한", "ADJ"),
    "gisun": ("말, 언어", "NOUN"),
    "manju": ("만주", "PROPN"),
    "boo": ("집", "NOUN"),
    "alin": ("산", "NOUN"),
}


class ManchuParser:
    """Dictionary-driven tokenizer and root + suffix segmenter for romanized Manchu."""

    def tokenize(self, sentence: str) -> List[str]:
        """Split a sentence into lower-cased, punctuation-trimmed tokens.

        Tokens are separated on whitespace and have leading/trailing
        ``.,;:`` removed. Tokens that consist only of punctuation are dropped
        so that callers never receive empty strings. (The original note says a
        production implementation would combine BPE with a lexicon.)

        Args:
            sentence: Romanized Manchu text.

        Returns:
            The non-empty tokens in their original order.
        """
        trimmed = (piece.strip(".,;:") for piece in sentence.lower().split())
        return [token for token in trimmed if token]

    def analyze(self, token: str) -> List[Morpheme]:
        """Segment a token into an optional root followed by a suffix chain.

        The longest ``ROOT_DICT`` key that prefixes the token is taken as the
        root. The remainder is consumed greedily by the longest matching
        ``SUFFIXES`` entry at each step; whatever cannot be matched is emitted
        as a single trailing ``"unknown"`` morpheme.

        Args:
            token: A single token, e.g. one element of :meth:`tokenize`.

        Returns:
            Morphemes in surface order; an empty list for an empty token.
        """
        morphemes: List[Morpheme] = []
        remaining = token

        # Root matching (longest match first).
        for root in sorted(ROOT_DICT, key=len, reverse=True):
            if remaining.startswith(root):
                meaning, pos = ROOT_DICT[root]
                morphemes.append(Morpheme(root, "root", meaning, pos))
                remaining = remaining[len(root):]
                break

        # Suffix candidates are ordered once per call instead of once per loop
        # iteration. Empty forms are skipped: they would match without
        # consuming any input and the loop below would never terminate.
        candidates = [
            (key.lstrip("-"), SUFFIXES[key])
            for key in sorted(SUFFIXES, key=len, reverse=True)
            if key.lstrip("-")
        ]

        # Suffix chain analysis.
        while remaining:
            for form, (meaning, pos) in candidates:
                if remaining.startswith(form):
                    morphemes.append(Morpheme(form, "suffix", meaning, pos))
                    remaining = remaining[len(form):]
                    break
            else:
                # Nothing matched: keep the rest as one opaque segment.
                morphemes.append(Morpheme(remaining, "unknown", "?", "UNK"))
                break

        return morphemes
# ─── mBART-based Manchu translation pipeline ─────────────────────────
from typing import Optional

import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from datasets import Dataset
from manchu_parser import ManchuParser


class ManchuTranslator:
    """Translate romanized Manchu with an mBART-50 model and attach a morpheme analysis."""

    MODEL_ID = "facebook/mbart-large-50-many-to-many-mmt"

    def __init__(self, fine_tuned_path: Optional[str] = None):
        """Load the tokenizer and model and move the model to the available device.

        Args:
            fine_tuned_path: Optional checkpoint to load the model weights
                from; when omitted, ``MODEL_ID`` is used. The tokenizer is
                always loaded from ``MODEL_ID``.
        """
        self.tokenizer = MBart50TokenizerFast.from_pretrained(self.MODEL_ID)
        self.model = MBartForConditionalGeneration.from_pretrained(
            fine_tuned_path or self.MODEL_ID
        )
        self.parser = ManchuParser()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def translate(
        self,
        manchu_text: str,
        target_lang: str = "ko_KR",  # Korean
    ) -> dict:
        """Translate Manchu text into a modern target language.

        Args:
            manchu_text: Romanized Manchu input.
            target_lang: Target language code; it is looked up in the
                tokenizer's ``lang_code_to_id`` mapping.

        Returns:
            A dict with the keys ``"input"``, ``"translation"``,
            ``"morphemes"`` (token -> list of morphemes) and ``"lang"``.

        Raises:
            KeyError: If ``target_lang`` is not in ``lang_code_to_id``.
        """
        # 1. Morphological analysis as a preliminary step.
        tokens = self.parser.tokenize(manchu_text)
        morpheme_analysis = {t: self.parser.analyze(t) for t in tokens}

        # 2. Encoding (Manchu needs a custom src_lang).
        # NOTE(review): nothing in this class registers "manchu_romanized"
        # with the tokenizer — confirm the code is added elsewhere.
        self.tokenizer.src_lang = "manchu_romanized"  # custom registration
        inputs = self.tokenizer(manchu_text, return_tensors="pt").to(self.device)

        # 3. Generation (beam search, length penalty).
        # Switch to eval mode first: after fine_tune() the model may still be
        # in training mode, which would leave dropout active during decoding.
        self.model.eval()
        with torch.no_grad():
            generated = self.model.generate(
                **inputs,
                forced_bos_token_id=self.tokenizer.lang_code_to_id[target_lang],
                num_beams=5,
                length_penalty=1.2,
                max_new_tokens=128,
                early_stopping=True,
            )

        translation = self.tokenizer.batch_decode(
            generated, skip_special_tokens=True
        )[0]
        return {
            "input": manchu_text,
            "translation": translation,
            "morphemes": morpheme_analysis,
            "lang": target_lang,
        }

    def fine_tune(self, parallel_corpus: Dataset, output_dir: str):
        """Fine-tune the model on a parallel corpus with the HuggingFace Trainer API.

        Args:
            parallel_corpus: Indexed with ``"train"`` and ``"validation"``.
                NOTE(review): that indexing suggests a split mapping rather
                than a single ``Dataset`` — confirm the intended type.
            output_dir: Directory for checkpoints and the final saved model.
        """
        from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

        # NOTE(review): some transformers releases rename
        # ``evaluation_strategy`` — confirm against the pinned version.
        args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            num_train_epochs=10,
            per_device_train_batch_size=16,
            warmup_steps=500,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            save_strategy="epoch",
            evaluation_strategy="epoch",
            load_best_model_at_end=True,
        )
        trainer = Seq2SeqTrainer(
            model=self.model,
            args=args,
            train_dataset=parallel_corpus["train"],
            eval_dataset=parallel_corpus["validation"],
            tokenizer=self.tokenizer,
        )
        trainer.train()
        trainer.save_model(output_dir)
        print(f"✓ Fine-tuning 완료 → {output_dir}")
개발 로드맵
비교 벤치마킹
| 프로젝트 | 언어 | 방법론 | 데이터 | 정확도 |
|---|---|---|---|---|
| Perseus Project | 라틴어·고대그리스어 | 규칙 기반 파서 + 형태소 DB | 풍부 (수백만 단어) | ~95% |
| Google 히브리어 복원 | 사해문서 히브리어 | 컨투어 OCR + 딥러닝 | 제한적 스캔본 | ~90% |
| 하와이어 부흥 앱 | 하와이어 | NMT + 음성합성 | 중간 (오디오+텍스트) | ~88% |
| 에트루리아어 AI | 에트루리아어 | few-shot + 비교언어학 | 희소 (미해독 다수) | ~60% |
| 본 프로젝트 | 만주어 (목표) | OCR + 형태소 + mBART NMT | 병기 문헌 (희소) | 목표 80% |
"언어가 죽으면, 그 민족이 세상을 바라보던
고유한 창문 하나가 영원히 닫힌다."
— 언어학자 켄 헤일 (Ken Hale) / 만주족의 마지막 목소리들을 기억하며
한국어 → 만주어 · AI 번역 · 소멸 위기 언어 복원 프로젝트
예시 문장
번역 기록
여러분의 후원이 큰 힘이 됩니다!
후원하러 가기