Exploring Research Subfields with the OpenAlex API

This notebook uses the /subfields endpoint to retrieve and visualise the research subfields that make up four major academic fields: Physics and Astronomy; Agricultural and Biological Sciences; Biochemistry, Genetics and Molecular Biology; and Economics, Econometrics and Finance.

Author

Simon

Published

February 20, 2026

Setup

We start by defining the fields we want to explore and two helper functions. FIELDS maps each field name to its OpenAlex identifier. get_subfields() queries the /subfields endpoint for a given field and returns the results as a DataFrame with columns for the field name, subfield name, and work count. plot_subfields() renders a horizontal bar chart from that DataFrame.

Code
import os
import time
import httpx
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv

load_dotenv()

# Root of the OpenAlex REST API; all request paths below are relative to it.
BASE_URL = "https://api.openalex.org"

# Shared HTTP client. The mailto/api_key query parameters are attached to
# every request. OPENALEX_API_KEY must be set in the environment (a missing
# key raises KeyError at import, failing fast); the mailto address is optional.
api = httpx.Client(
    base_url=BASE_URL,
    params={
        "mailto": os.environ.get("OPENALEX_MAILTO", ""),
        "api_key": os.environ["OPENALEX_API_KEY"],
    },
    timeout=30,
)

# Field display name -> OpenAlex entity path for the four fields explored here.
FIELDS = {
    "Physics and Astronomy": "fields/31",
    "Agricultural and Biological Sciences": "fields/11",
    "Biochemistry, Genetics and Molecular Biology": "fields/13",
    "Economics, Econometrics and Finance": "fields/20",
}


def _get(path: str, **kwargs) -> dict:
    """GET ``path`` via the shared client and return the decoded JSON body.

    Retries up to 5 times with exponential backoff (1, 2, 4, ... seconds)
    on HTTP 429 rate limiting and on transport-level failures. Any other
    HTTP error status raises via ``raise_for_status``; exhausting all
    attempts raises RuntimeError.
    """
    for attempt in range(5):
        try:
            r = api.get(path, **kwargs)
        except httpx.TransportError:
            # TransportError is the base of ReadError, ConnectError and the
            # timeout errors — catching only ReadError (as before) left
            # connection failures unretried despite the docstring's promise.
            time.sleep(2 ** attempt)
            continue
        if r.status_code == 429:
            time.sleep(2 ** attempt)
            continue
        r.raise_for_status()
        return r.json()
    raise RuntimeError("Max retries exceeded")


def get_subfields(field_name: str, field_id: str) -> pd.DataFrame:
    """Return one row per subfield of ``field_id``.

    Columns: "Field" (the given display name), "Subfield" (subfield
    display name) and "Works" (work count), sorted by work count
    descending (the API applies the sort).
    """
    data = _get("/subfields", params={
        "filter": f"field.id:{field_id}",
        "select": "id,display_name,works_count",
        "per-page": 50,
        "sort": "works_count:desc",
    })
    # Build with explicit columns so an empty result set still yields a
    # well-formed zero-row frame instead of a KeyError on the column select.
    df = pd.DataFrame(
        [(r["display_name"], r["works_count"]) for r in data["results"]],
        columns=["Subfield", "Works"],
    )
    df.insert(0, "Field", field_name)
    return df


BAR_HEIGHT: float = 0.25  # inches per bar — keeps bars the same size across charts


def plot_subfields(df: pd.DataFrame):
    """Horizontal bar chart of works per subfield, largest bar on top."""
    n_bars = len(df)
    fig, ax = plt.subplots(figsize=(6, max(2, n_bars * BAR_HEIGHT + 1)))
    # Reverse the rows so the first (largest) subfield is drawn at the top.
    labels = df["Subfield"][::-1]
    counts = df["Works"][::-1]
    ax.barh(labels, counts, color="#9e9e9e")
    ax.set_xlabel("Number of works")
    ax.grid(axis="x", color="#b0bec5", linewidth=0.5, linestyle="--")
    ax.set_axisbelow(True)
    # Frameless look: hide all four axis spines.
    for side in ax.spines.values():
        side.set_visible(False)
    plt.tight_layout()
    plt.show()


def get_works_by_year(field_name: str, field_id: str) -> pd.DataFrame:
    """Return works-per-publication-year counts for a field.

    Columns: "Field", "Year" (int) and "Works", sorted by year ascending.
    Groups whose key is "unknown" (works without a year) are dropped.
    """
    data = _get("/works", params={
        "filter": f"primary_topic.field.id:{field_id}",
        "group_by": "publication_year",
    })
    rows = [
        {"Year": int(g["key"]), "Works": g["count"]}
        for g in data["group_by"]
        if g["key"] != "unknown"
    ]
    # Explicit columns keep the frame well-formed even when the API returns
    # no year groups (sort_values would otherwise raise KeyError).
    df = pd.DataFrame(rows, columns=["Year", "Works"]).sort_values("Year")
    df.insert(0, "Field", field_name)
    return df


def plot_works_by_year(df: pd.DataFrame):
    """Filled line chart of the number of works per publication year."""
    fig, ax = plt.subplots(figsize=(6, 3))
    years = df["Year"]
    counts = df["Works"]
    ax.fill_between(years, counts, alpha=0.3, color="#e67e22")
    ax.plot(years, counts, color="#e67e22", linewidth=1.5)
    ax.set_xlabel("Publication year")
    ax.set_ylabel("Number of works")
    # Thousands separators on the y-axis tick labels.
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda val, _pos: f"{val:,.0f}"))
    ax.grid(axis="y", color="#b0bec5", linewidth=0.5, linestyle="--")
    ax.set_axisbelow(True)
    for side in ax.spines.values():
        side.set_visible(False)
    plt.tight_layout()
    plt.show()


def get_works_by_source(field_name: str, field_id: str) -> pd.DataFrame:
    """Return works-per-source counts for a field.

    Pages through grouped /works results with cursor pagination until the
    API stops returning a next cursor (or sends an empty page). Columns:
    "Field", "Source" (display name) and "Works", sorted by "Works"
    descending.
    """
    params = {
        "filter": f"primary_topic.field.id:{field_id}",
        "group_by": "primary_location.source.id",
        "per-page": 200,  # maximum groups per page
        "cursor": "*",    # "*" starts a cursor-paged scan
    }
    rows = []
    while True:
        data = _get("/works", params=params)
        for g in data["group_by"]:
            rows.append({"Source": g["key_display_name"], "Works": g["count"]})
        cursor = data["meta"].get("next_cursor")
        if not cursor or len(data["group_by"]) == 0:
            break
        params["cursor"] = cursor
        time.sleep(0.1)  # stay politely under the API rate limit
    # Explicit columns keep the frame well-formed (and sortable) even when
    # the API returned no groups at all.
    df = (
        pd.DataFrame(rows, columns=["Source", "Works"])
        .sort_values("Works", ascending=False)
        .reset_index(drop=True)
    )
    df.insert(0, "Field", field_name)
    return df


TOP_N_SOURCES: int = 20  # number of top sources shown in tables and bar charts


def plot_works_by_source(df: pd.DataFrame):
    """Bar chart of the TOP_N_SOURCES sources with the most works."""
    top = df.head(TOP_N_SOURCES)
    fig, ax = plt.subplots(figsize=(6, max(2, len(top) * BAR_HEIGHT + 1)))
    # Reverse so the most prolific source sits at the top of the chart.
    ax.barh(top["Source"][::-1], top["Works"][::-1], color="#2980b9")
    ax.set_xlabel("Number of works")
    # Thousands separators on the x-axis tick labels.
    ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda val, _pos: f"{val:,.0f}"))
    ax.grid(axis="x", color="#b0bec5", linewidth=0.5, linestyle="--")
    ax.set_axisbelow(True)
    for side in ax.spines.values():
        side.set_visible(False)
    plt.tight_layout()
    plt.show()

Fetch data

We loop over the four fields and collect subfield breakdowns, yearly publication counts, and top sources for each one.

Code
from tqdm.auto import tqdm

# One dict per dataset, keyed by field display name.
dfs = {}
yearly = {}
sources = {}
for name, fid in tqdm(FIELDS.items(), total=len(FIELDS), desc="Fields"):
    # Fetch all three datasets for this field in one pass.
    for store, fetch in (
        (dfs, get_subfields),
        (yearly, get_works_by_year),
        (sources, get_works_by_source),
    ):
        store[name] = fetch(name, fid)

Save to CSV

Persists all three datasets to the data/ directory — one CSV per field plus a combined file for each dataset (subfields, works-by-year, works-by-source), giving 15 files in total. The combined files are what the Field-level analysis section reads back via pd.read_csv().

Code
from pathlib import Path

data_dir = Path("data")
data_dir.mkdir(exist_ok=True)


def _slug(name: str) -> str:
    """Turn a field display name into a filesystem-friendly slug."""
    return name.lower().replace(", ", "-").replace(" ", "-")


def _save_dataset(frames: dict, stem: str, width: str) -> pd.DataFrame:
    """Write one CSV per field plus a combined CSV; return the combined frame.

    ``frames`` maps field name -> DataFrame; ``stem`` is the filename prefix
    (e.g. "subfields" -> data/subfields-<slug>.csv and data/subfields.csv).
    ``width`` is the format spec for the printed row counts, kept distinct
    per dataset so small and large counts align as before.
    """
    for name, df in frames.items():
        path = data_dir / f"{stem}-{_slug(name)}.csv"
        df.to_csv(path, index=False)
        print(f"Saved {len(df):{width}} rows to {path}")
    combined = pd.concat(frames.values(), ignore_index=True)
    path = data_dir / f"{stem}.csv"
    combined.to_csv(path, index=False)
    print(f"Saved {len(combined):{width}} rows to {path} (combined)")
    return combined


# The combined frames are kept under the same names the analysis section uses.
all_subfields = _save_dataset(dfs, "subfields", ">2")
all_yearly = _save_dataset(yearly, "works-by-year", ">2")
all_sources = _save_dataset(sources, "works-by-source", ">5,")

Field-level analysis

The visualisations below are driven by the CSV files saved in the previous steps. Re-run the Fetch data and Save to CSV cells when you need to refresh the underlying data.

Code
import pandas as pd
import matplotlib.pyplot as plt

# Load the combined CSVs written by the "Save to CSV" section.
all_subfields = pd.read_csv("data/subfields.csv")
all_yearly = pd.read_csv("data/works-by-year.csv")
all_sources = pd.read_csv("data/works-by-source.csv")


def _by_field(df: pd.DataFrame) -> dict:
    """Split a combined frame into one frame per value of the Field column."""
    return {field: part.reset_index(drop=True) for field, part in df.groupby("Field")}


dfs = _by_field(all_subfields)
yearly = _by_field(all_yearly)
sources = _by_field(all_sources)

BAR_HEIGHT = 0.25  # inches per bar, as in the Setup section


def plot_subfields(df: pd.DataFrame):
    """Horizontal bar chart of works per subfield (largest on top)."""
    height = max(2, len(df) * BAR_HEIGHT + 1)
    fig, ax = plt.subplots(figsize=(6, height))
    # Rows arrive sorted descending; reverse so the biggest bar is on top.
    ax.barh(df["Subfield"][::-1], df["Works"][::-1], color="#9e9e9e")
    ax.set_xlabel("Number of works")
    ax.set_axisbelow(True)
    ax.grid(axis="x", color="#b0bec5", linewidth=0.5, linestyle="--")
    for sp in ax.spines.values():
        sp.set_visible(False)
    plt.tight_layout()
    plt.show()


def plot_works_by_year(df: pd.DataFrame):
    """Area-plus-line chart of the number of works per publication year."""
    fig, ax = plt.subplots(figsize=(6, 3))
    x, y = df["Year"], df["Works"]
    ax.fill_between(x, y, alpha=0.3, color="#e67e22")
    ax.plot(x, y, color="#e67e22", linewidth=1.5)
    ax.set_xlabel("Publication year")
    ax.set_ylabel("Number of works")
    # Comma-grouped tick labels on the y axis.
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda v, _: f"{v:,.0f}"))
    ax.set_axisbelow(True)
    ax.grid(axis="y", color="#b0bec5", linewidth=0.5, linestyle="--")
    for sp in ax.spines.values():
        sp.set_visible(False)
    plt.tight_layout()
    plt.show()


TOP_N_SOURCES: int = 20  # number of top sources shown in tables and bar charts


def plot_works_by_source(df: pd.DataFrame):
    """Bar chart of the TOP_N_SOURCES most prolific sources."""
    top = df.head(TOP_N_SOURCES)
    height = max(2, len(top) * BAR_HEIGHT + 1)
    fig, ax = plt.subplots(figsize=(6, height))
    ax.barh(top["Source"][::-1], top["Works"][::-1], color="#2980b9")
    ax.set_xlabel("Number of works")
    # Comma-grouped tick labels on the x axis.
    ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda v, _: f"{v:,.0f}"))
    ax.set_axisbelow(True)
    ax.grid(axis="x", color="#b0bec5", linewidth=0.5, linestyle="--")
    for sp in ax.spines.values():
        sp.set_visible(False)
    plt.tight_layout()
    plt.show()


def table_top_sources(df: pd.DataFrame):
    """Print a markdown table of the TOP_N_SOURCES most prolific sources."""
    ranked = df.head(TOP_N_SOURCES)[["Source", "Works"]].copy()
    # Comma-grouped counts, ranked 1..N.
    ranked["Works"] = ranked["Works"].map("{:,}".format)
    ranked.index = pd.RangeIndex(1, len(ranked) + 1, name="Rank")
    print(ranked.to_markdown())


def plot_source_rank_freq(df: pd.DataFrame):
    """Scatter of works per source against source rank, on log–log axes."""
    fig, ax = plt.subplots(figsize=(6, 3))
    ranks = range(1, len(df) + 1)
    ax.scatter(
        ranks,
        df["Works"],
        s=12,
        alpha=0.4,
        facecolors="#8e44ad",
        edgecolors="#8e44ad",
        linewidths=0.5,
    )
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlabel("Source rank")
    ax.set_ylabel("Number of works")
    ax.grid(True, which="major", color="#b0bec5", linewidth=0.5, linestyle="--")
    ax.set_axisbelow(True)
    for sp in ax.spines.values():
        sp.set_visible(False)
    plt.tight_layout()
    plt.show()

Physics and Astronomy

Code
# Headline numbers for this field: total works and distinct sources.
total = dfs["Physics and Astronomy"]["Works"].sum()
n_sources = len(sources["Physics and Astronomy"])
print(
    "OpenAlex entity: [fields/31](https://openalex.org/fields/31)"
    f" | {total:,} total works | {n_sources:,} unique journals"
    " | Subfields data: [CSV](data/subfields.csv)"
)

OpenAlex entity: fields/31 | 24,429,050 total works | 62,267 unique journals | Subfields data: CSV

Code
# Yearly publication counts for Physics and Astronomy (Figure 1).
plot_works_by_year(yearly["Physics and Astronomy"])
Figure 1: Annual distribution of works in Physics and Astronomy.
Code
# Markdown table of the top sources for Physics and Astronomy.
table_top_sources(sources["Physics and Astronomy"])
Rank Source Works
1 arXiv (Cornell University) 409,029
2 The Astrophysical Journal 118,870
3 Physical Review Letters 94,100
4 Zenodo (CERN European Organization for Nuclear Research) 83,026
5 Monthly Notices of the Royal Astronomical Society 80,450
6 OSTI OAI (U.S. Department of Energy Office of Scientific and Technical Information) 79,307
7 Bulletin of the American Physical Society 68,927
8 Proceedings of SPIE, the International Society for Optical Engineering/Proceedings of SPIE 63,961
9 Physics Letters B 60,256
10 The Journal of Chemical Physics 59,294
11 Physical review. B, Condensed matter 58,862
12 AIP conference proceedings 58,575
13 Astronomy and Astrophysics 49,332
14 Applied Physics Letters 43,785
15 Journal of Applied Physics 43,460
16 Journal of High Energy Physics 43,057
17 Nature 42,423
18 Journal of Physics Conference Series 39,543
19 Physical review. D. Particles, fields, gravitation, and cosmology/Physical review. D. Particles and fields 39,263
20 Physical review. D/Physical review. D. 38,847
Code
# Log–log rank–frequency plot of sources for Physics and Astronomy (Figure 2).
plot_source_rank_freq(sources["Physics and Astronomy"])
Figure 2: Rank–frequency distribution of sources in Physics and Astronomy (log–log).
Code
# Subfield bar chart for Physics and Astronomy (Figure 3).
plot_subfields(dfs["Physics and Astronomy"])
Figure 3: Subfields of Physics and Astronomy ranked by number of works.

Agricultural and Biological Sciences

Code
# Headline numbers for this field: total works and distinct sources.
total = dfs["Agricultural and Biological Sciences"]["Works"].sum()
n_sources = len(sources["Agricultural and Biological Sciences"])
print(
    "OpenAlex entity: [fields/11](https://openalex.org/fields/11)"
    f" | {total:,} total works | {n_sources:,} unique journals"
    " | Subfields data: [CSV](data/subfields.csv)"
)

OpenAlex entity: fields/11 | 21,496,854 total works | 131,819 unique journals | Subfields data: CSV

Code
# Yearly publication counts for Agricultural and Biological Sciences (Figure 4).
plot_works_by_year(yearly["Agricultural and Biological Sciences"])
Figure 4: Annual distribution of works in Agricultural and Biological Sciences.
Code
# Markdown table of the top sources for Agricultural and Biological Sciences.
table_top_sources(sources["Agricultural and Biological Sciences"])
Rank Source Works
1 PubMed 185,410
2 IUCN Red List of Threatened Species 141,396
3 HAL (Le Centre pour la Communication Scientifique Directe) 111,827
4 Zenodo (CERN European Organization for Nuclear Research) 95,049
5 SHILAP Revista de lepidopterología 81,333
6 LA Referencia (Red Federada de Repositorios Institucionales de Publicaciones Científicas) 78,899
7 Figshare 71,481
8 Acta Horticulturae 55,800
9 Medical Entomology and Zoology 46,690
10 Dialnet (Universidad de la Rioja) 46,034
11 Agritrop (Cirad) 43,254
12 SSRN Electronic Journal 42,855
13 Elsevier eBooks 40,407
14 Biodiversity Heritage Library (Smithsonian Institution) 40,194
15 The NamesforLife Abstracts 37,663
16 RePEc: Research Papers in Economics 36,190
17 Americanae (AECID Library) 31,722
18 Nature 28,615
19 Journal of Animal Science 25,836
20 Journal of Economic Entomology 25,541
Code
# Log–log rank–frequency plot of sources for this field (Figure 5).
plot_source_rank_freq(sources["Agricultural and Biological Sciences"])
Figure 5: Rank–frequency distribution of sources in Agricultural and Biological Sciences (log–log).
Code
# Subfield bar chart for Agricultural and Biological Sciences (Figure 6).
plot_subfields(dfs["Agricultural and Biological Sciences"])
Figure 6: Subfields of Agricultural and Biological Sciences ranked by number of works.

Biochemistry, Genetics and Molecular Biology

Code
# Headline numbers for this field: total works and distinct sources.
total = dfs["Biochemistry, Genetics and Molecular Biology"]["Works"].sum()
n_sources = len(sources["Biochemistry, Genetics and Molecular Biology"])
print(
    "OpenAlex entity: [fields/13](https://openalex.org/fields/13)"
    f" | {total:,} total works | {n_sources:,} unique journals"
    " | Subfields data: [CSV](data/subfields.csv)"
)

OpenAlex entity: fields/13 | 19,971,736 total works | 95,131 unique journals | Subfields data: CSV

Code
# Yearly publication counts for this field (Figure 7).
plot_works_by_year(yearly["Biochemistry, Genetics and Molecular Biology"])
Figure 7: Annual distribution of works in Biochemistry, Genetics and Molecular Biology.
Code
# Markdown table of the top sources for this field.
table_top_sources(sources["Biochemistry, Genetics and Molecular Biology"])
Rank Source Works
1 PubMed 743,952
2 Figshare 116,315
3 Journal of Biological Chemistry 115,980
4 Elsevier eBooks 71,437
5 Cancer Research 68,226
6 Proceedings of the National Academy of Sciences 64,967
7 Faculty Opinions – Post-Publication Peer Review of the Biomedical Literature 64,646
8 PLoS ONE 63,245
9 Zenodo (CERN European Organization for Nuclear Research) 50,398
10 Biochemical and Biophysical Research Communications 50,207
11 Nature 49,111
12 ChemInform 46,221
13 Scientific Reports 44,618
14 Biophysical Journal 44,257
15 The FASEB Journal 44,039
16 HAL (Le Centre pour la Communication Scientifique Directe) 43,533
17 Nucleic Acids Research 42,310
18 Biochemistry 41,454
19 Medical Entomology and Zoology 40,922
20 Harvard Dataverse 37,587
Code
# Log–log rank–frequency plot of sources for this field (Figure 8).
plot_source_rank_freq(sources["Biochemistry, Genetics and Molecular Biology"])
Figure 8: Rank–frequency distribution of sources in Biochemistry, Genetics and Molecular Biology (log–log).
Code
# Subfield bar chart for this field (Figure 9).
plot_subfields(dfs["Biochemistry, Genetics and Molecular Biology"])
Figure 9: Subfields of Biochemistry, Genetics and Molecular Biology ranked by number of works.

Economics, Econometrics and Finance

Code
# Headline numbers for this field: total works and distinct sources.
total = dfs["Economics, Econometrics and Finance"]["Works"].sum()
n_sources = len(sources["Economics, Econometrics and Finance"])
print(
    "OpenAlex entity: [fields/20](https://openalex.org/fields/20)"
    f" | {total:,} total works | {n_sources:,} unique journals"
    " | Subfields data: [CSV](data/subfields.csv)"
)

OpenAlex entity: fields/20 | 10,569,794 total works | 121,718 unique journals | Subfields data: CSV

Code
# Yearly publication counts for this field (Figure 10).
plot_works_by_year(yearly["Economics, Econometrics and Finance"])
Figure 10: Annual distribution of works in Economics, Econometrics and Finance.
Code
# Markdown table of the top sources for this field.
table_top_sources(sources["Economics, Econometrics and Finance"])
Rank Source Works
1 SSRN Electronic Journal 272,734
2 RePEc: Research Papers in Economics 261,611
3 PubMed 115,204
4 Columbia Academic Commons (Columbia University) 65,007
5 Dialnet (Universidad de la Rioja) 50,769
6 Medical Entomology and Zoology 43,519
7 SHILAP Revista de lepidopterología 33,454
8 arXiv (Cornell University) 32,812
9 Cambridge University Press eBooks 32,032
10 Econstor (Econstor) 30,335
11 Palgrave Macmillan UK eBooks 29,734
12 Zenodo (CERN European Organization for Nuclear Research) 29,369
13 Edward Elgar Publishing eBooks 24,269
14 HAL (Le Centre pour la Communication Scientifique Directe) 23,689
15 Oxford University Press eBooks 21,903
16 Harvard Dataverse 21,148
17 University of Zagreb University Computing Centre (SRCE) 20,876
18 Value in Health 18,861
19 Data Archiving and Networked Services (DANS) 16,773
20 LA Referencia (Red Federada de Repositorios Institucionales de Publicaciones Científicas) 16,199
Code
# Log–log rank–frequency plot of sources for this field (Figure 11).
plot_source_rank_freq(sources["Economics, Econometrics and Finance"])
Figure 11: Rank–frequency distribution of sources in Economics, Econometrics and Finance (log–log).
Code
# Subfield bar chart for this field (Figure 12).
plot_subfields(dfs["Economics, Econometrics and Finance"])
Figure 12: Subfields of Economics, Econometrics and Finance ranked by number of works.