- Add code-interpreter, kokoro-tts, remotion-best-practices, research-to-paper-slides, summarize, tavily-tool to source repo - skill-review: add main/xiaoming agent mapping in handler.ts + SKILL.md - tts-voice: handler.ts updates from agent workspace Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
286 lines
10 KiB
Python
286 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
import argparse
|
|
import json
|
|
import math
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# --- Dependency setup ---------------------------------------------------------
# pandas is mandatory: abort immediately with venv guidance when it is missing.
try:
    import pandas as pd
except ImportError:
    raise SystemExit(
        'pandas is required. Run with the code-interpreter venv:\n'
        ' ~/.openclaw/workspace/.venv-code-interpreter/bin/python analyze_data.py ...'
    )

# matplotlib is optional: without it the script still profiles the data and
# writes the report, it just skips plot generation.
try:
    import matplotlib
    import matplotlib.pyplot as plt
except Exception:
    HAS_MPL = False
else:
    HAS_MPL = True

# Candidate CJK font files, probed in order, so non-ASCII (e.g. Chinese)
# labels render correctly in the generated charts.
ZH_FONT_CANDIDATES = [
    '/home/selig/.openclaw/workspace/skills/code-interpreter/assets/fonts/NotoSansCJKtc-Regular.otf',
    '/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf',
]
|
|
|
|
|
|
def configure_matplotlib_fonts() -> tuple[str | None, object | None]:
    """Register the first available CJK font with matplotlib.

    Probes ``ZH_FONT_CANDIDATES`` in order and installs the first font file
    that exists and loads, making it the global default font family.

    Returns:
        ``(font_name, FontProperties)`` on success, ``(None, None)`` when
        matplotlib is unavailable or no candidate font could be loaded.
    """
    if not HAS_MPL:
        return None, None

    for candidate in ZH_FONT_CANDIDATES:
        if not Path(candidate).exists():
            continue
        try:
            from matplotlib import font_manager

            font_manager.fontManager.addfont(candidate)
            prop = font_manager.FontProperties(fname=candidate)
            name = prop.get_name()
            # Make the registered font the process-wide default, and keep the
            # minus sign renderable (CJK fonts often lack U+2212).
            matplotlib.rcParams['font.family'] = [name]
            matplotlib.rcParams['axes.unicode_minus'] = False
            return name, prop
        except Exception:
            # A corrupt or unreadable font file is not fatal; try the next one.
            continue

    return None, None
|
|
|
|
|
|
def apply_font(ax, font_prop) -> None:
    """Force *font_prop* onto every text element of a matplotlib Axes.

    Covers the title, both axis labels, all tick labels, and (when present)
    the legend entries and legend title.  No-op when *font_prop* is falsy,
    i.e. when no CJK font was found.
    """
    if not font_prop:
        return

    # Gather every Text artist first, then restyle them in a single pass.
    targets = [ax.xaxis.label, ax.yaxis.label]
    if ax.title:
        targets.append(ax.title)
    targets.extend(ax.get_xticklabels())
    targets.extend(ax.get_yticklabels())

    legend = ax.get_legend()
    if legend:
        targets.extend(legend.get_texts())
        targets.append(legend.get_title())

    for text in targets:
        text.set_fontproperties(font_prop)
|
|
|
|
|
|
def detect_format(path: Path) -> str:
    """Map a file extension to one of the supported loader formats.

    Returns ``'delimited'``, ``'json'`` or ``'excel'``; raises ``SystemExit``
    for any extension this tool does not handle.
    """
    ext = path.suffix.lower()
    format_by_ext = {
        '.csv': 'delimited',
        '.tsv': 'delimited',
        '.txt': 'delimited',
        '.json': 'json',
        '.xlsx': 'excel',
        '.xls': 'excel',
    }
    fmt = format_by_ext.get(ext)
    if fmt is None:
        raise SystemExit(f'Unsupported file type: {ext}')
    return fmt
|
|
|
|
|
|
def load_df(path: Path) -> pd.DataFrame:
    """Load *path* into a DataFrame according to its detected format."""
    fmt = detect_format(path)

    if fmt == 'delimited':
        # .tsv is tab-separated; .csv and .txt are treated as comma-separated.
        delimiter = '\t' if path.suffix.lower() == '.tsv' else ','
        return pd.read_csv(path, sep=delimiter)

    if fmt == 'json':
        try:
            return pd.read_json(path)
        except ValueError:
            # pandas rejects some JSON layouts; fall back to stdlib parsing
            # and let the DataFrame constructor interpret the result.
            return pd.DataFrame(json.loads(path.read_text(encoding='utf-8')))

    if fmt == 'excel':
        return pd.read_excel(path)

    # Unreachable in practice: detect_format() already rejects unknown types.
    raise SystemExit('Unsupported format')
|
|
|
|
|
|
def safe_name(s: str) -> str:
    """Sanitize *s* into a filesystem-friendly identifier.

    Alphanumerics, ``-`` and ``_`` are kept; spaces and ``/`` become ``_``;
    every other character is dropped.  The result is trimmed of leading and
    trailing underscores and capped at 80 characters, with ``'column'`` as
    the fallback when nothing survives.
    """
    translated = ''.join(
        ch if (ch.isalnum() or ch in '-_') else ('_' if ch in ' /' else '')
        for ch in s
    )
    cleaned = translated.strip('_')
    return cleaned[:80] or 'column'
|
|
|
|
|
|
def series_stats(s: pd.Series) -> dict:
    """Profile one column for the summary report.

    Always reports dtype and null/unique counts.  Numeric columns add
    min/max/mean/sum (``None`` when every value is null); all other columns
    add the five most frequent values with their counts.
    """
    non_null = s.dropna()
    profile = {
        'dtype': str(s.dtype),
        'nonNull': int(non_null.shape[0]),
        'nulls': int(s.isna().sum()),
        'unique': int(non_null.nunique()) if len(non_null) else 0,
    }

    if pd.api.types.is_numeric_dtype(s):
        if non_null.empty:
            # All-null numeric column: keep the keys but signal "no data".
            profile.update({'min': None, 'max': None, 'mean': None, 'sum': None})
        else:
            profile.update({
                'min': float(non_null.min()),
                'max': float(non_null.max()),
                'mean': float(non_null.mean()),
                'sum': float(non_null.sum()),
            })
    else:
        counts = non_null.astype(str).value_counts().head(5)
        profile['topValues'] = [
            {'value': str(value), 'count': int(count)}
            for value, count in counts.items()
        ]

    return profile
|
|
|
|
|
|
def maybe_parse_dates(df: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]:
    """Convert object columns that look like dates to datetime64.

    A column is converted when at least 60% of its *non-null* values parse
    as dates (``pd.to_datetime`` with ``errors='coerce'``).

    Args:
        df: Input frame; not mutated (a copy is modified and returned).

    Returns:
        ``(frame, parsed_columns)`` where *frame* is the copy with converted
        columns and *parsed_columns* lists their names as strings.
    """
    out = df.copy()
    parsed_cols: list[str] = []
    for col in out.columns:
        if out[col].dtype != 'object':
            continue
        non_null = out[col].dropna()
        if non_null.empty:
            continue
        converted = pd.to_datetime(out[col], errors='coerce')
        # Bug fix: measure parse success among NON-NULL values only.
        # The previous ratio (converted.notna().mean()) was taken over all
        # rows, so nulls counted as parse failures and a column that was e.g.
        # 50% null but 100% valid dates never reached the 0.6 threshold.
        # Original nulls coerce to NaT, so they are excluded from the
        # numerator automatically.
        success_ratio = float(converted.notna().sum()) / float(len(non_null))
        if success_ratio >= 0.6:
            out[col] = converted
            parsed_cols.append(str(col))
    return out, parsed_cols
|
|
|
|
|
|
def write_report(df: pd.DataFrame, summary: dict, out_dir: Path) -> Path:
    """Render *summary* as a Markdown report at ``out_dir/report.md``.

    *df* is accepted for interface parity but the report is driven entirely
    by *summary*.  Returns the path of the written file.
    """
    md = ['# Data Analysis Report', '']
    md.append(f"- Source: `{summary['source']}`")
    md.append(f"- Rows: **{summary['rows']}**")
    md.append(f"- Columns: **{summary['columns']}**")
    md.append(f"- Generated plots: **{len(summary['plots'])}**")
    if summary['parsedDateColumns']:
        md.append(f"- Parsed date columns: {', '.join(summary['parsedDateColumns'])}")
    md.extend(['', '## Columns', ''])

    # One "### column" section per profiled column.
    for name, meta in summary['columnProfiles'].items():
        md.append(f"### {name}")
        md.append(f"- dtype: `{meta['dtype']}`")
        md.append(f"- non-null: {meta['nonNull']}")
        md.append(f"- nulls: {meta['nulls']}")
        md.append(f"- unique: {meta['unique']}")
        if 'mean' in meta:
            # Numeric column: aggregate stats.
            md.append(f"- min / max: {meta['min']} / {meta['max']}")
            md.append(f"- mean / sum: {meta['mean']} / {meta['sum']}")
        elif meta.get('topValues'):
            # Categorical column: most frequent values.
            preview = ', '.join(f"{x['value']} ({x['count']})" for x in meta['topValues'][:5])
            md.append(f"- top values: {preview}")
        md.append('')

    report = out_dir / 'report.md'
    report.write_text('\n'.join(md).strip() + '\n', encoding='utf-8')
    return report
|
|
|
|
|
|
def generate_plots(df: pd.DataFrame, out_dir: Path, font_prop=None) -> list[str]:
    """Produce up to three PNG charts from *df* into *out_dir*.

    Charts: a histogram of the first numeric column, a bar chart of that
    column summed over the first categorical column, and a line chart of it
    over the first datetime column.  Returns the list of written file paths;
    empty when matplotlib is unavailable.
    """
    if not HAS_MPL:
        return []

    saved: list[str] = []

    def _finish(fig_path: Path) -> None:
        # Shared tail for every chart: apply the CJK font, fix the layout,
        # write the PNG, and free the figure.
        apply_font(plt.gca(), font_prop)
        plt.tight_layout()
        plt.savefig(fig_path, dpi=160)
        plt.close()
        saved.append(str(fig_path))

    numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    date_cols = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
    cat_cols = [
        c for c in df.columns
        if not pd.api.types.is_numeric_dtype(df[c])
        and not pd.api.types.is_datetime64_any_dtype(df[c])
    ]

    # 1) Histogram of the first numeric column.
    if numeric_cols:
        col = numeric_cols[0]
        values = df[col].dropna()
        plt.figure(figsize=(7, 4))
        # sqrt-rule bin count, clamped to [5, 20].
        bins = min(20, max(5, int(math.sqrt(max(1, values.shape[0])))))
        values.hist(bins=bins)
        plt.title(f'Histogram of {col}', fontproperties=font_prop)
        plt.xlabel(str(col), fontproperties=font_prop)
        plt.ylabel('Count', fontproperties=font_prop)
        _finish(out_dir / f'hist_{safe_name(str(col))}.png')

    # 2) Bar chart: first numeric column summed over the first categorical one.
    if cat_cols and numeric_cols:
        cat, num = cat_cols[0], numeric_cols[0]
        totals = df.groupby(cat, dropna=False)[num].sum().sort_values(ascending=False).head(12)
        if not totals.empty:
            plt.figure(figsize=(8, 4.5))
            totals.plot(kind='bar')
            plt.title(f'{num} by {cat}', fontproperties=font_prop)
            plt.xlabel(str(cat), fontproperties=font_prop)
            plt.ylabel(f'Sum of {num}', fontproperties=font_prop)
            _finish(out_dir / f'bar_{safe_name(str(num))}_by_{safe_name(str(cat))}.png')

    # 3) Line chart: first numeric column over the first datetime column.
    if date_cols and numeric_cols:
        date_col, num = date_cols[0], numeric_cols[0]
        series = df[[date_col, num]].dropna().sort_values(date_col)
        if not series.empty:
            plt.figure(figsize=(8, 4.5))
            plt.plot(series[date_col], series[num], marker='o')
            plt.title(f'{num} over time', fontproperties=font_prop)
            plt.xlabel(str(date_col), fontproperties=font_prop)
            plt.ylabel(str(num), fontproperties=font_prop)
            _finish(out_dir / f'line_{safe_name(str(num))}_over_time.png')

    return saved
|
|
|
|
|
|
def main() -> int:
    """CLI entry point: load a data file, profile it, and emit artifacts.

    Writes preview.csv, summary.json, report.md and any generated plots to
    --artifact-dir, then prints a machine-readable JSON result to stdout.
    Returns 0 on success (loader errors exit earlier via SystemExit).
    """
    parser = argparse.ArgumentParser(description='Automatic data analysis report generator')
    parser.add_argument('input', help='Input data file (csv/json/xlsx)')
    parser.add_argument('--artifact-dir', required=True, help='Output artifact directory')
    args = parser.parse_args()

    input_path = Path(args.input).expanduser().resolve()
    artifact_dir = Path(args.artifact_dir).expanduser().resolve()
    artifact_dir.mkdir(parents=True, exist_ok=True)

    df = load_df(input_path)
    # Capture column names before date parsing, which may not change them but
    # keeps the summary independent of later transformations.
    original_columns = [str(c) for c in df.columns]
    df, parsed_dates = maybe_parse_dates(df)
    chosen_font, chosen_font_prop = configure_matplotlib_fonts()

    # Small CSV preview so consumers can eyeball the data without reloading it.
    preview_path = artifact_dir / 'preview.csv'
    df.head(50).to_csv(preview_path, index=False)

    summary = {
        'source': str(input_path),
        'rows': int(df.shape[0]),
        'columns': int(df.shape[1]),
        'columnNames': original_columns,
        'parsedDateColumns': parsed_dates,
        'columnProfiles': {str(c): series_stats(df[c]) for c in df.columns},
        'plots': [],  # filled in below once plots are generated
        'plotFont': chosen_font,
    }

    summary['plots'] = generate_plots(df, artifact_dir, chosen_font_prop)

    summary_path = artifact_dir / 'summary.json'
    summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding='utf-8')
    report_path = write_report(df, summary, artifact_dir)

    # Machine-readable result for the calling process/agent.
    result = {
        'ok': True,
        'input': str(input_path),
        'artifactDir': str(artifact_dir),
        'summary': str(summary_path),
        'report': str(report_path),
        'preview': str(preview_path),
        'plots': summary['plots'],
    }
    print(json.dumps(result, ensure_ascii=False, indent=2))
    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
|