# Derive a "domain" column from each row's URL: take the network location and
# drop any "www." text. Non-string URLs (e.g. NaN) are mapped to "?".
df["domain"] = df["url"].apply(
    lambda u: urlparse(u).netloc.replace("www.", "") if isinstance(u, str) else "?"
)

# value_counts() sorts by descending frequency, so head(15) is the 15 most
# common domains.
top_domains = df["domain"].value_counts().head(15)
print("\n--- Prime 15 domains in pattern ---")
print(top_domains)
# 2x2 overview figure: token counts, language score, chars/token, top domains.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top-left: tokens per document. Values are capped at 4000 before binning
# (presumably so a few very long docs do not stretch the x-axis).
axes[0, 0].hist(df["token_count"].clip(upper=4000), bins=50, color="#7b2d26")
axes[0, 0].set_title("Token count per doc (gpt2)")
axes[0, 0].set_xlabel("tokens")
axes[0, 0].set_ylabel("docs")

# Top-right: language score distribution with a dashed marker at 0.65.
axes[0, 1].hist(df["language_score"], bins=40, color="#2d5d7b")
axes[0, 1].axvline(0.65, color="pink", ls="--", label="FineWeb cutoff 0.65")
axes[0, 1].set_title("fastText English language rating")
axes[0, 1].set_xlabel("rating")
axes[0, 1].legend()

# Bottom-left: characters per token, capped at 8 before binning.
axes[1, 0].hist(df["chars_per_token"].clip(upper=8), bins=40, color="#3f7b2d")
axes[1, 0].set_title("Characters per token (compression)")
axes[1, 0].set_xlabel("chars / token")

# Bottom-right: horizontal bars of the top domains. The series is reversed so
# the most frequent domain ends up at the top of the barh chart.
top_domains.iloc[::-1].plot(kind="barh", ax=axes[1, 1], color="#7b5d2d")
axes[1, 1].set_title("Prime domains")

plt.tight_layout()
plt.show()
# Final run summary printed to stdout.
print("\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
print(f"Docs streamed : {len(df):,}")
print(f"Whole gpt2 tokens : {df['token_count'].sum():,}")
print(f"Median tokens/doc : {int(df['token_count'].median())}")
print(f"Distinctive domains : {df['domain'].nunique():,}")
print(f"Mean language_score : {df['language_score'].mean():.3f}")
print(f"Close to-duplicate pairs : {len(dup_pairs)}")
# Count every entry in `outcomes` whose value differs from 'stored'.
# NOTE(review): `outcomes` and the 'stored' sentinel are defined outside this
# section — confirm they match the filter stage's actual values.
print(f"Docs flagged by filters : {(pd.Series(outcomes) != 'stored').sum()} / {len(outcomes)}")
print("\nNext steps:")
# Single-quoted so the embedded double quotes do not terminate the string.
# NOTE(review): the keyword shown in this hint should match the one used by
# the dataset-loading call earlier in the file — confirm.
print(' • Swap identify="sample-10BT" for an actual crawl, e.g. identify="CC-MAIN-2024-10"')
print(" • Increase N_DOCS for stronger statistics")
print(" • Use the total datatrove pipeline to breed FineWeb end-to-end")





![How creators and entrepreneurs are utilizing AI to hurry up & succeed [data]](https://blog.aimactgrow.com/wp-content/uploads/2025/06/Untitled20design-Apr-07-2023-08-24-35-4586-PM-120x86.png)


