# ad-user-creator/ad_user_creator/input_parser.py

from __future__ import annotations

import re
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd

from ad_user_creator.exceptions import InputValidationError
from ad_user_creator.models import UserInputRecord

# Column headers that every input table must contain (checked by
# _validate_headers). In order: name, username, email, department OU,
# base group, project groups, resource groups.
REQUIRED_HEADERS = ["姓名", "用户名", "邮箱", "部门 OU", "基础组", "项目组", "资源组"]
# A username may consist only of ASCII letters, digits, underscores and hyphens.
USERNAME_PATTERN = re.compile(r"^[A-Za-z0-9_-]+$")
# Loose e-mail shape check: a local part, a single "@", and a domain part that
# contains at least one "." with text on both sides; no whitespace anywhere.
EMAIL_PATTERN = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")


def _split_groups(value: object) -> List[str]:
    if value is None:
        return []
    text = str(value).strip()
    if not text or text.lower() == "nan":
        return []
    normalized = text.replace("，", ",")
    groups = [item.strip() for item in normalized.split(",") if item.strip()]
    deduped: List[str] = []
    seen = set()
    for group in groups:
        if group not in seen:
            deduped.append(group)
            seen.add(group)
    return deduped


def _read_table(input_path: str) -> pd.DataFrame:
    file_path = Path(input_path)
    if not file_path.exists():
        raise InputValidationError(f"输入文件不存在: {input_path}")
    suffix = file_path.suffix.lower()
    if suffix not in {".csv", ".xlsx"}:
        raise InputValidationError(f"仅支持 .csv 和 .xlsx，当前为: {suffix}")

    if suffix == ".csv":
        try:
            return pd.read_csv(file_path, encoding="utf-8-sig")
        except UnicodeDecodeError:
            return pd.read_csv(file_path, encoding="utf-8")
    return pd.read_excel(file_path, engine="openpyxl")


def _validate_headers(df: pd.DataFrame) -> None:
    """Ensure the table has every column listed in ``REQUIRED_HEADERS``.

    Args:
        df: The table whose column labels are checked.

    Raises:
        InputValidationError: If one or more required columns are absent; the
            message lists them in ``REQUIRED_HEADERS`` order.
    """
    available = df.columns
    absent = []
    for name in REQUIRED_HEADERS:
        if name not in available:
            absent.append(name)
    if not absent:
        return
    raise InputValidationError(f"输入文件缺少列: {', '.join(absent)}")


def parse_input_file(input_path: str) -> List[Tuple[UserInputRecord, Dict[str, str]]]:
    """Read the input file and convert each row into a validated user record.

    The table is loaded, its headers are checked, and missing cells are
    replaced by empty strings. Each row is then validated in this order:
    mandatory fields present, username matches ``USERNAME_PATTERN``, e-mail
    matches ``EMAIL_PATTERN``. Processing stops at the first invalid row.

    Args:
        input_path: Path of the .csv or .xlsx file to parse.

    Returns:
        One ``(record, raw)`` pair per row, in file order. ``raw`` maps each
        required header to the cleaned text value, with the two group-list
        columns re-joined by ASCII commas.

    Raises:
        InputValidationError: If the file or its headers are invalid, or a row
            has an empty mandatory field, an illegal username, or a malformed
            e-mail address. Row-level messages carry the row's line number.
    """
    table = _read_table(input_path)
    _validate_headers(table)
    table = table.fillna("")

    mandatory_columns = ("姓名", "用户名", "邮箱", "部门 OU", "基础组")
    results: List[Tuple[UserInputRecord, Dict[str, str]]] = []

    for row_label, row in table.iterrows():
        # +2: one for the header line, one to switch from 0-based to 1-based.
        line_no = row_label + 2

        scalars = {column: str(row[column]).strip() for column in mandatory_columns}
        project_groups = _split_groups(row["项目组"])
        resource_groups = _split_groups(row["资源组"])

        blanks = [column for column, text in scalars.items() if not text]
        if blanks:
            raise InputValidationError(
                f"第 {line_no} 行缺少必填字段: {', '.join(blanks)}"
            )

        sam_account_name = scalars["用户名"]
        if USERNAME_PATTERN.match(sam_account_name) is None:
            raise InputValidationError(
                f"第 {line_no} 行用户名非法: {sam_account_name}，只允许字母数字下划线短横线"
            )

        email = scalars["邮箱"]
        if EMAIL_PATTERN.match(email) is None:
            raise InputValidationError(f"第 {line_no} 行邮箱格式非法: {email}")

        record = UserInputRecord(
            display_name=scalars["姓名"],
            sam_account_name=sam_account_name,
            email=email,
            dept_ou=scalars["部门 OU"],
            base_group=scalars["基础组"],
            project_groups=project_groups,
            resource_groups=resource_groups,
        )
        # Echo of the cleaned row, keyed by the original column headers.
        raw = {
            **scalars,
            "项目组": ",".join(project_groups),
            "资源组": ",".join(resource_groups),
        }
        results.append((record, raw))

    return results