Exploring Research Subfields with the OpenAlex API

This notebook uses the /subfields endpoint to retrieve and visualise the research subfields that make up four major academic fields: Physics and Astronomy; Agricultural and Biological Sciences; Biochemistry, Genetics and Molecular Biology; and Economics, Econometrics and Finance.

Author

Simon

Published

February 20, 2026

Setup

We start by defining the fields we want to explore and two helper functions. FIELDS maps each field name to its OpenAlex identifier. get_subfields() queries the /subfields endpoint for a given field and returns the results as a DataFrame with columns for the field name, subfield name, and work count. plot_subfields() renders a horizontal bar chart from that DataFrame.

Code
import os
import time
import httpx
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv

load_dotenv()

# Root of the OpenAlex REST API; all request paths below are relative to it.
BASE_URL = "https://api.openalex.org"

# Shared HTTP client. The mailto/api_key query parameters are attached to
# every request. OPENALEX_API_KEY must be set in the environment (a missing
# key raises KeyError at import, failing fast); the mailto address is optional.
api = httpx.Client(
    base_url=BASE_URL,
    params={
        "mailto": os.environ.get("OPENALEX_MAILTO", ""),
        "api_key": os.environ["OPENALEX_API_KEY"],
    },
    timeout=30,
)

# Field display name -> OpenAlex entity path for the four fields explored here.
FIELDS = {
    "Physics and Astronomy": "fields/31",
    "Agricultural and Biological Sciences": "fields/11",
    "Biochemistry, Genetics and Molecular Biology": "fields/13",
    "Economics, Econometrics and Finance": "fields/20",
}


def _get(path: str, **kwargs) -> dict:
    """GET ``path`` via the shared client and return the decoded JSON body.

    Retries up to 5 times with exponential backoff (1, 2, 4, ... seconds)
    on HTTP 429 rate limiting and on transport-level failures. Any other
    HTTP error status raises via ``raise_for_status``; exhausting all
    attempts raises RuntimeError.
    """
    for attempt in range(5):
        try:
            r = api.get(path, **kwargs)
        except httpx.TransportError:
            # TransportError is the base of ReadError, ConnectError and the
            # timeout errors — catching only ReadError (as before) left
            # connection failures unretried despite the docstring's promise.
            time.sleep(2 ** attempt)
            continue
        if r.status_code == 429:
            time.sleep(2 ** attempt)
            continue
        r.raise_for_status()
        return r.json()
    raise RuntimeError("Max retries exceeded")


def get_subfields(field_name: str, field_id: str) -> pd.DataFrame:
    """Return one row per subfield of ``field_id``.

    Columns: "Field" (the given display name), "Subfield" (subfield
    display name) and "Works" (work count), sorted by work count
    descending (the API applies the sort).
    """
    data = _get("/subfields", params={
        "filter": f"field.id:{field_id}",
        "select": "id,display_name,works_count",
        "per-page": 50,
        "sort": "works_count:desc",
    })
    # Build with explicit columns so an empty result set still yields a
    # well-formed zero-row frame instead of a KeyError on the column select.
    df = pd.DataFrame(
        [(r["display_name"], r["works_count"]) for r in data["results"]],
        columns=["Subfield", "Works"],
    )
    df.insert(0, "Field", field_name)
    return df


BAR_HEIGHT: float = 0.25  # inches per bar — keeps bars the same size across charts


def plot_subfields(df: pd.DataFrame):
    """Horizontal bar chart of works per subfield, largest bar on top."""
    n_bars = len(df)
    fig, ax = plt.subplots(figsize=(6, max(2, n_bars * BAR_HEIGHT + 1)))
    # Reverse the rows so the first (largest) subfield is drawn at the top.
    labels = df["Subfield"][::-1]
    counts = df["Works"][::-1]
    ax.barh(labels, counts, color="#9e9e9e")
    ax.set_xlabel("Number of works")
    ax.grid(axis="x", color="#b0bec5", linewidth=0.5, linestyle="--")
    ax.set_axisbelow(True)
    # Frameless look: hide all four axis spines.
    for side in ax.spines.values():
        side.set_visible(False)
    plt.tight_layout()
    plt.show()


def get_works_by_year(field_name: str, field_id: str) -> pd.DataFrame:
    """Return works-per-publication-year counts for a field.

    Columns: "Field", "Year" (int) and "Works", sorted by year ascending.
    Groups whose key is "unknown" (works without a year) are dropped.
    """
    data = _get("/works", params={
        "filter": f"primary_topic.field.id:{field_id}",
        "group_by": "publication_year",
    })
    rows = [
        {"Year": int(g["key"]), "Works": g["count"]}
        for g in data["group_by"]
        if g["key"] != "unknown"
    ]
    # Explicit columns keep the frame well-formed even when the API returns
    # no year groups (sort_values would otherwise raise KeyError).
    df = pd.DataFrame(rows, columns=["Year", "Works"]).sort_values("Year")
    df.insert(0, "Field", field_name)
    return df


def plot_works_by_year(df: pd.DataFrame):
    """Filled line chart of the number of works per publication year."""
    fig, ax = plt.subplots(figsize=(6, 3))
    years = df["Year"]
    counts = df["Works"]
    ax.fill_between(years, counts, alpha=0.3, color="#e67e22")
    ax.plot(years, counts, color="#e67e22", linewidth=1.5)
    ax.set_xlabel("Publication year")
    ax.set_ylabel("Number of works")
    # Thousands separators on the y-axis tick labels.
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda val, _pos: f"{val:,.0f}"))
    ax.grid(axis="y", color="#b0bec5", linewidth=0.5, linestyle="--")
    ax.set_axisbelow(True)
    for side in ax.spines.values():
        side.set_visible(False)
    plt.tight_layout()
    plt.show()


def get_works_by_source(field_name: str, field_id: str) -> pd.DataFrame:
    """Return works-per-source counts for a field.

    Pages through grouped /works results with cursor pagination until the
    API stops returning a next cursor (or sends an empty page). Columns:
    "Field", "Source" (display name) and "Works", sorted by "Works"
    descending.
    """
    params = {
        "filter": f"primary_topic.field.id:{field_id}",
        "group_by": "primary_location.source.id",
        "per-page": 200,  # maximum groups per page
        "cursor": "*",    # "*" starts a cursor-paged scan
    }
    rows = []
    while True:
        data = _get("/works", params=params)
        for g in data["group_by"]:
            rows.append({"Source": g["key_display_name"], "Works": g["count"]})
        cursor = data["meta"].get("next_cursor")
        if not cursor or len(data["group_by"]) == 0:
            break
        params["cursor"] = cursor
        time.sleep(0.1)  # stay politely under the API rate limit
    # Explicit columns keep the frame well-formed (and sortable) even when
    # the API returned no groups at all.
    df = (
        pd.DataFrame(rows, columns=["Source", "Works"])
        .sort_values("Works", ascending=False)
        .reset_index(drop=True)
    )
    df.insert(0, "Field", field_name)
    return df


TOP_N_SOURCES: int = 20  # number of top sources shown in tables and bar charts


def plot_works_by_source(df: pd.DataFrame):
    """Bar chart of the TOP_N_SOURCES sources with the most works."""
    top = df.head(TOP_N_SOURCES)
    fig, ax = plt.subplots(figsize=(6, max(2, len(top) * BAR_HEIGHT + 1)))
    # Reverse so the most prolific source sits at the top of the chart.
    ax.barh(top["Source"][::-1], top["Works"][::-1], color="#2980b9")
    ax.set_xlabel("Number of works")
    # Thousands separators on the x-axis tick labels.
    ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda val, _pos: f"{val:,.0f}"))
    ax.grid(axis="x", color="#b0bec5", linewidth=0.5, linestyle="--")
    ax.set_axisbelow(True)
    for side in ax.spines.values():
        side.set_visible(False)
    plt.tight_layout()
    plt.show()

Fetch data

We loop over the four fields and collect subfield breakdowns, yearly publication counts, and top sources for each one.

Code
from tqdm.auto import tqdm

# One dict per dataset, keyed by field display name.
dfs = {}
yearly = {}
sources = {}
for name, fid in tqdm(FIELDS.items(), total=len(FIELDS), desc="Fields"):
    # Fetch all three datasets for this field in one pass.
    for store, fetch in (
        (dfs, get_subfields),
        (yearly, get_works_by_year),
        (sources, get_works_by_source),
    ):
        store[name] = fetch(name, fid)

Save to CSV

Persists all three datasets to the data/ directory — one CSV per field plus a combined file for each dataset (subfields, works-by-year, works-by-source), giving 15 files in total. The combined files are what the Field-level analysis section reads back via pd.read_csv().

Code
from pathlib import Path

data_dir = Path("data")
data_dir.mkdir(exist_ok=True)


def _slug(name: str) -> str:
    """Turn a field display name into a filesystem-friendly slug."""
    return name.lower().replace(", ", "-").replace(" ", "-")


def _save_dataset(frames: dict, stem: str, width: str) -> pd.DataFrame:
    """Write one CSV per field plus a combined CSV; return the combined frame.

    ``frames`` maps field name -> DataFrame; ``stem`` is the filename prefix
    (e.g. "subfields" -> data/subfields-<slug>.csv and data/subfields.csv).
    ``width`` is the format spec for the printed row counts, kept distinct
    per dataset so small and large counts align as before.
    """
    for name, df in frames.items():
        path = data_dir / f"{stem}-{_slug(name)}.csv"
        df.to_csv(path, index=False)
        print(f"Saved {len(df):{width}} rows to {path}")
    combined = pd.concat(frames.values(), ignore_index=True)
    path = data_dir / f"{stem}.csv"
    combined.to_csv(path, index=False)
    print(f"Saved {len(combined):{width}} rows to {path} (combined)")
    return combined


# The combined frames are kept under the same names the analysis section uses.
all_subfields = _save_dataset(dfs, "subfields", ">2")
all_yearly = _save_dataset(yearly, "works-by-year", ">2")
all_sources = _save_dataset(sources, "works-by-source", ">5,")

Field-level analysis

The visualisations below are driven by the CSV files saved in the previous steps. Re-run the Fetch data and Save to CSV cells when you need to refresh the underlying data.

Code
import pandas as pd
import matplotlib.pyplot as plt

# Load the combined CSVs written by the "Save to CSV" section.
all_subfields = pd.read_csv("data/subfields.csv")
all_yearly = pd.read_csv("data/works-by-year.csv")
all_sources = pd.read_csv("data/works-by-source.csv")


def _by_field(df: pd.DataFrame) -> dict:
    """Split a combined frame into one frame per value of the Field column."""
    return {field: part.reset_index(drop=True) for field, part in df.groupby("Field")}


dfs = _by_field(all_subfields)
yearly = _by_field(all_yearly)
sources = _by_field(all_sources)

BAR_HEIGHT = 0.25  # inches per bar, as in the Setup section


def plot_subfields(df: pd.DataFrame):
    """Horizontal bar chart of works per subfield (largest on top)."""
    height = max(2, len(df) * BAR_HEIGHT + 1)
    fig, ax = plt.subplots(figsize=(6, height))
    # Rows arrive sorted descending; reverse so the biggest bar is on top.
    ax.barh(df["Subfield"][::-1], df["Works"][::-1], color="#9e9e9e")
    ax.set_xlabel("Number of works")
    ax.set_axisbelow(True)
    ax.grid(axis="x", color="#b0bec5", linewidth=0.5, linestyle="--")
    for sp in ax.spines.values():
        sp.set_visible(False)
    plt.tight_layout()
    plt.show()


def plot_works_by_year(df: pd.DataFrame):
    """Area-plus-line chart of the number of works per publication year."""
    fig, ax = plt.subplots(figsize=(6, 3))
    x, y = df["Year"], df["Works"]
    ax.fill_between(x, y, alpha=0.3, color="#e67e22")
    ax.plot(x, y, color="#e67e22", linewidth=1.5)
    ax.set_xlabel("Publication year")
    ax.set_ylabel("Number of works")
    # Comma-grouped tick labels on the y axis.
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda v, _: f"{v:,.0f}"))
    ax.set_axisbelow(True)
    ax.grid(axis="y", color="#b0bec5", linewidth=0.5, linestyle="--")
    for sp in ax.spines.values():
        sp.set_visible(False)
    plt.tight_layout()
    plt.show()


TOP_N_SOURCES: int = 20  # number of top sources shown in tables and bar charts


def plot_works_by_source(df: pd.DataFrame):
    """Bar chart of the TOP_N_SOURCES most prolific sources."""
    top = df.head(TOP_N_SOURCES)
    height = max(2, len(top) * BAR_HEIGHT + 1)
    fig, ax = plt.subplots(figsize=(6, height))
    ax.barh(top["Source"][::-1], top["Works"][::-1], color="#2980b9")
    ax.set_xlabel("Number of works")
    # Comma-grouped tick labels on the x axis.
    ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda v, _: f"{v:,.0f}"))
    ax.set_axisbelow(True)
    ax.grid(axis="x", color="#b0bec5", linewidth=0.5, linestyle="--")
    for sp in ax.spines.values():
        sp.set_visible(False)
    plt.tight_layout()
    plt.show()


def table_top_sources(df: pd.DataFrame):
    """Print a markdown table of the TOP_N_SOURCES most prolific sources."""
    ranked = df.head(TOP_N_SOURCES)[["Source", "Works"]].copy()
    # Comma-grouped counts, ranked 1..N.
    ranked["Works"] = ranked["Works"].map("{:,}".format)
    ranked.index = pd.RangeIndex(1, len(ranked) + 1, name="Rank")
    print(ranked.to_markdown())


def plot_source_rank_freq(df: pd.DataFrame):
    """Scatter of works per source against source rank, on log–log axes."""
    fig, ax = plt.subplots(figsize=(6, 3))
    ranks = range(1, len(df) + 1)
    ax.scatter(
        ranks,
        df["Works"],
        s=12,
        alpha=0.4,
        facecolors="#8e44ad",
        edgecolors="#8e44ad",
        linewidths=0.5,
    )
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlabel("Source rank")
    ax.set_ylabel("Number of works")
    ax.grid(True, which="major", color="#b0bec5", linewidth=0.5, linestyle="--")
    ax.set_axisbelow(True)
    for sp in ax.spines.values():
        sp.set_visible(False)
    plt.tight_layout()
    plt.show()

Physics and Astronomy

Code
# Headline numbers for this field: total works and distinct sources.
total = dfs["Physics and Astronomy"]["Works"].sum()
n_sources = len(sources["Physics and Astronomy"])
print(
    "OpenAlex entity: [fields/31](https://openalex.org/fields/31)"
    f" | {total:,} total works | {n_sources:,} unique journals"
    " | Subfields data: [CSV](data/subfields.csv)"
)

OpenAlex entity: fields/31 | 24,429,050 total works | 62,267 unique journals | Subfields data: CSV

Code
# Yearly publication counts for Physics and Astronomy (Figure 1).
plot_works_by_year(yearly["Physics and Astronomy"])
Figure 1: Annual distribution of works in Physics and Astronomy.
Code
# Markdown table of the top sources for Physics and Astronomy.
table_top_sources(sources["Physics and Astronomy"])
Rank Source Works
1 arXiv (Cornell University) 409,029
2 The Astrophysical Journal 118,870
3 Physical Review Letters 94,100
4 Zenodo (CERN European Organization for Nuclear Research) 83,026
5 Monthly Notices of the Royal Astronomical Society 80,450
6 OSTI OAI (U.S. Department of Energy Office of Scientific and Technical Information) 79,307
7 Bulletin of the American Physical Society 68,927
8 Proceedings of SPIE, the International Society for Optical Engineering/Proceedings of SPIE 63,961
9 Physics Letters B 60,256
10 The Journal of Chemical Physics 59,294
11 Physical review. B, Condensed matter 58,862
12 AIP conference proceedings 58,575
13 Astronomy and Astrophysics 49,332
14 Applied Physics Letters 43,785
15 Journal of Applied Physics 43,460
16 Journal of High Energy Physics 43,057
17 Nature 42,423
18 Journal of Physics Conference Series 39,543
19 Physical review. D. Particles, fields, gravitation, and cosmology/Physical review. D. Particles and fields 39,263
20 Physical review. D/Physical review. D. 38,847
Code
# Log–log rank–frequency plot of sources for Physics and Astronomy (Figure 2).
plot_source_rank_freq(sources["Physics and Astronomy"])
Figure 2: Rank–frequency distribution of sources in Physics and Astronomy (log–log).
Code
# Subfield bar chart for Physics and Astronomy (Figure 3).
plot_subfields(dfs["Physics and Astronomy"])
Figure 3: Subfields of Physics and Astronomy ranked by number of works.

Agricultural and Biological Sciences

Code
# Headline numbers for this field: total works and distinct sources.
total = dfs["Agricultural and Biological Sciences"]["Works"].sum()
n_sources = len(sources["Agricultural and Biological Sciences"])
print(
    "OpenAlex entity: [fields/11](https://openalex.org/fields/11)"
    f" | {total:,} total works | {n_sources:,} unique journals"
    " | Subfields data: [CSV](data/subfields.csv)"
)

OpenAlex entity: fields/11 | 21,496,854 total works | 131,819 unique journals | Subfields data: CSV

Code
# Yearly publication counts for Agricultural and Biological Sciences (Figure 4).
plot_works_by_year(yearly["Agricultural and Biological Sciences"])
Figure 4: Annual distribution of works in Agricultural and Biological Sciences.
Code
# Markdown table of the top sources for Agricultural and Biological Sciences.
table_top_sources(sources["Agricultural and Biological Sciences"])
Rank Source Works
1 PubMed 185,410
2 IUCN Red List of Threatened Species 141,396
3 HAL (Le Centre pour la Communication Scientifique Directe) 111,827
4 Zenodo (CERN European Organization for Nuclear Research) 95,049
5 SHILAP Revista de lepidopterología 81,333
6 LA Referencia (Red Federada de Repositorios Institucionales de Publicaciones Científicas) 78,899
7 Figshare 71,481
8 Acta Horticulturae 55,800
9 Medical Entomology and Zoology 46,690
10 Dialnet (Universidad de la Rioja) 46,034
11 Agritrop (Cirad) 43,254
12 SSRN Electronic Journal 42,855
13 Elsevier eBooks 40,407
14 Biodiversity Heritage Library (Smithsonian Institution) 40,194
15 The NamesforLife Abstracts 37,663
16 RePEc: Research Papers in Economics 36,190
17 Americanae (AECID Library) 31,722
18 Nature 28,615
19 Journal of Animal Science 25,836
20 Journal of Economic Entomology 25,541
Code
# Log–log rank–frequency plot of sources for this field (Figure 5).
plot_source_rank_freq(sources["Agricultural and Biological Sciences"])
Figure 5: Rank–frequency distribution of sources in Agricultural and Biological Sciences (log–log).
Code
# Subfield bar chart for Agricultural and Biological Sciences (Figure 6).
plot_subfields(dfs["Agricultural and Biological Sciences"])
Figure 6: Subfields of Agricultural and Biological Sciences ranked by number of works.

Biochemistry, Genetics and Molecular Biology

Code
# Headline numbers for this field: total works and distinct sources.
total = dfs["Biochemistry, Genetics and Molecular Biology"]["Works"].sum()
n_sources = len(sources["Biochemistry, Genetics and Molecular Biology"])
print(
    "OpenAlex entity: [fields/13](https://openalex.org/fields/13)"
    f" | {total:,} total works | {n_sources:,} unique journals"
    " | Subfields data: [CSV](data/subfields.csv)"
)

OpenAlex entity: fields/13 | 19,971,736 total works | 95,131 unique journals | Subfields data: CSV

Code
# Yearly publication counts for this field (Figure 7).
plot_works_by_year(yearly["Biochemistry, Genetics and Molecular Biology"])
Figure 7: Annual distribution of works in Biochemistry, Genetics and Molecular Biology.
Code
# Markdown table of the top sources for this field.
table_top_sources(sources["Biochemistry, Genetics and Molecular Biology"])
Rank Source Works
1 PubMed 743,952
2 Figshare 116,315
3 Journal of Biological Chemistry 115,980
4 Elsevier eBooks 71,437
5 Cancer Research 68,226
6 Proceedings of the National Academy of Sciences 64,967
7 Faculty Opinions – Post-Publication Peer Review of the Biomedical Literature 64,646
8 PLoS ONE 63,245
9 Zenodo (CERN European Organization for Nuclear Research) 50,398
10 Biochemical and Biophysical Research Communications 50,207
11 Nature 49,111
12 ChemInform 46,221
13 Scientific Reports 44,618
14 Biophysical Journal 44,257
15 The FASEB Journal 44,039
16 HAL (Le Centre pour la Communication Scientifique Directe) 43,533
17 Nucleic Acids Research 42,310
18 Biochemistry 41,454
19 Medical Entomology and Zoology 40,922
20 Harvard Dataverse 37,587
Code
# Log–log rank–frequency plot of sources for this field (Figure 8).
plot_source_rank_freq(sources["Biochemistry, Genetics and Molecular Biology"])
Figure 8: Rank–frequency distribution of sources in Biochemistry, Genetics and Molecular Biology (log–log).
Code
# Subfield bar chart for this field (Figure 9).
plot_subfields(dfs["Biochemistry, Genetics and Molecular Biology"])
Figure 9: Subfields of Biochemistry, Genetics and Molecular Biology ranked by number of works.

Economics, Econometrics and Finance

Code
# Headline numbers for this field: total works and distinct sources.
total = dfs["Economics, Econometrics and Finance"]["Works"].sum()
n_sources = len(sources["Economics, Econometrics and Finance"])
print(
    "OpenAlex entity: [fields/20](https://openalex.org/fields/20)"
    f" | {total:,} total works | {n_sources:,} unique journals"
    " | Subfields data: [CSV](data/subfields.csv)"
)

OpenAlex entity: fields/20 | 10,569,794 total works | 121,718 unique journals | Subfields data: CSV

Code
# Yearly publication counts for this field (Figure 10).
plot_works_by_year(yearly["Economics, Econometrics and Finance"])
Figure 10: Annual distribution of works in Economics, Econometrics and Finance.
Code
# Markdown table of the top sources for this field.
table_top_sources(sources["Economics, Econometrics and Finance"])
Rank Source Works
1 SSRN Electronic Journal 272,734
2 RePEc: Research Papers in Economics 261,611
3 PubMed 115,204
4 Columbia Academic Commons (Columbia University) 65,007
5 Dialnet (Universidad de la Rioja) 50,769
6 Medical Entomology and Zoology 43,519
7 SHILAP Revista de lepidopterología 33,454
8 arXiv (Cornell University) 32,812
9 Cambridge University Press eBooks 32,032
10 Econstor (Econstor) 30,335
11 Palgrave Macmillan UK eBooks 29,734
12 Zenodo (CERN European Organization for Nuclear Research) 29,369
13 Edward Elgar Publishing eBooks 24,269
14 HAL (Le Centre pour la Communication Scientifique Directe) 23,689
15 Oxford University Press eBooks 21,903
16 Harvard Dataverse 21,148
17 University of Zagreb University Computing Centre (SRCE) 20,876
18 Value in Health 18,861
19 Data Archiving and Networked Services (DANS) 16,773
20 LA Referencia (Red Federada de Repositorios Institucionales de Publicaciones Científicas) 16,199
Code
# Log–log rank–frequency plot of sources for this field (Figure 11).
plot_source_rank_freq(sources["Economics, Econometrics and Finance"])
Figure 11: Rank–frequency distribution of sources in Economics, Econometrics and Finance (log–log).
Code
# Subfield bar chart for this field (Figure 12).
plot_subfields(dfs["Economics, Econometrics and Finance"])
Figure 12: Subfields of Economics, Econometrics and Finance ranked by number of works.