tournesol-app · GresilleSiffle · Jul 22, 2024 · Jun 24, 2024 · Jun 25, 2024 · Jun 25, 2024
diff --git a/data-visualization/pages/03_Community_evolution.py b/data-visualization/pages/03_Community_evolution.py
@@ -8,10 +8,16 @@
 
     Comparison:
         A public comparison between two videos involves one or more quality criteria.
+
+    Active contributor (during a period of time):
+        An active contributor is a user who makes at least one public comparison during a given
+        period of time.
 """
 
 import pandas as pd
+import plotly.express as px
 import streamlit as st
+from dateutil.relativedelta import relativedelta
 from utils import set_df
 
 st.set_page_config(
@@ -49,7 +55,7 @@ def add_contributors_evolution():
         st.plotly_chart(fig)
 
     with total_tab:
-        fig = (df.groupby("public_username").first().groupby("week_date").size().cumsum().plot())
+        fig = df.groupby("public_username").first().groupby("week_date").size().cumsum().plot()
         fig.update_xaxes(title="Week")
         fig.update_yaxes(title="Total number of public contributors")
         fig.update_layout(showlegend=False)
@@ -82,7 +88,106 @@ def add_comparisons_evolution():
         st.plotly_chart(fig)
 
 
+def add_comparisons_evolution_grouped_by_contributors_age():
+    """
+    Display the weekly number of active contributors, grouped by contributor age.
+
+    The age of a contributor is the season (period of 3 months) of their first public
+    comparison. Contributors active during a single week only form a separate category.
+    Reads the comparisons from `st.session_state.df` and renders a bar chart.
+    """
+
+    st.markdown("#### Active contributors grouped by age")
+    st.info(
+        "An active contributor is a user who makes at least one public comparison during a given"
+        " period of time.",
+    )
+
+    df = st.session_state.df
+
+    df = df.drop_duplicates(subset=["public_username", "week_date"])[
+        ["public_username", "week_date"]
+    ].reset_index(
+        drop=True
+    )  # Keep only the required data, remove duplicates.
+
+    # Parse as UTC, then drop the timezone to get sortable naive dates. `tz_localize(None)` is
+    # used because pandas 2 refuses to `astype` a tz-aware column to a tz-naive dtype.
+    df.week_date = pd.to_datetime(df.week_date, utc=True).dt.tz_localize(None)
+
+    # NOTE(review): assumes week_date values fall on Mondays (freq="W-MON") - TODO confirm.
+    weeks = pd.date_range(
+        start=df.week_date.min(), end=df.week_date.max(), freq="W-MON"
+    ).to_list()  # List of all weeks.
+
+    # Categories: one category for each season between min(week_date) and max(week_date). A season
+    # is a period of 3 months. "3MS" anchors seasons on month starts (Jan 1, Apr 1, ...); "3M"
+    # would anchor them on month ends and shift every category by one month.
+    seasons = pd.date_range(
+        start=df.week_date.min().replace(month=1, day=1),
+        end=df.week_date.max(),
+        freq="3MS",
+    ).to_list()
+
+    # For each season, create a new dataframe.
+    sub_dfs = []
+
+    # Generate a new dataframe. For each public_username, compute the weeks of their first and
+    # last public comparisons in a single aggregation, so both columns stay row-aligned.
+    users_seasons = df.groupby("public_username", as_index=False).agg(
+        first_week=("week_date", "min"), last_week=("week_date", "max")
+    )
+
+    # Add new column in users_seasons. The value is the latest season starting on or before the
+    # user's first week. Cast to object so the "single week" marker below fits in the column.
+    users_seasons["season"] = users_seasons.first_week.apply(
+        lambda first_week: max((s for s in seasons if s <= first_week), default=seasons[0])
+    ).astype(object)
+
+    # If user min week_date is same as user max week_date, replace their season with 'single week'.
+    single_week = users_seasons.first_week.eq(users_seasons.last_week)
+    users_seasons.loc[single_week, "season"] = "single week"
+
+    # Map each category (a season start date, or "single week") to the list of its usernames.
+    seasons_users = users_seasons.groupby("season")["public_username"].aggregate(list).to_dict()
+
+    for s, usernames in seasons_users.items():
+        # Filter df to keep only users of season s.
+        season_df = (
+            df.loc[df.public_username.isin(usernames)]
+            .groupby("week_date")
+            .public_username.nunique()
+        )
+
+        if s == "single week":
+            sub_dfs.append(("= last comparison date", season_df))
+        else:
+            category = s.strftime("%Y %b") + " to " + (s + relativedelta(months=2)).strftime("%b")
+            sub_dfs.append((category, season_df))
+
+    # Merge previous computed series into one, by week_date.
+    dtf = pd.DataFrame({"week_date": weeks}).reset_index()
+    for name, sub_df in sub_dfs:
+        dtf = pd.merge(dtf, sub_df.to_frame(name=name), on="week_date", how="left").fillna(0)
+
+    # One bar series per category; colors are sampled from "turbo", one per category.
+    fig = px.bar(
+        dtf,
+        x="week_date",
+        y=[name for name, _ in sub_dfs],
+        labels={
+            "value": "Active contributors",
+            "week_date": "Week",
+            "variable": "First comparison date",
+        },
+        color_discrete_sequence=px.colors.sample_colorscale("turbo", samplepoints=len(sub_dfs)),
+        color_discrete_map={"= last comparison date": "grey"},
+    )
+    st.plotly_chart(fig)
+
+
 pd.options.plotting.backend = "plotly"
 
 add_contributors_evolution()
 add_comparisons_evolution()
+add_comparisons_evolution_grouped_by_contributors_age()