Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[dataviz] feat: add graph Active contributors grouped by age #1991

Merged
merged 9 commits into from
Jul 22, 2024
107 changes: 106 additions & 1 deletion data-visualization/pages/03_Community_evolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,16 @@

Comparison:
A public comparison between two videos involves one or more quality criteria.

Active contributor (during a period of time):
An active contributor is a user who makes at least one public comparison during a given
period of time.
"""

import pandas as pd
import plotly.express as px
import streamlit as st
from dateutil.relativedelta import relativedelta
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not a direct dependency of the project; should we add it to the requirements.txt file?

from utils import set_df

st.set_page_config(
Expand Down Expand Up @@ -49,7 +55,7 @@ def add_contributors_evolution():
st.plotly_chart(fig)

with total_tab:
fig = (df.groupby("public_username").first().groupby("week_date").size().cumsum().plot())
fig = df.groupby("public_username").first().groupby("week_date").size().cumsum().plot()
fig.update_xaxes(title="Week")
fig.update_yaxes(title="Total number of public contributors")
fig.update_layout(showlegend=False)
Expand Down Expand Up @@ -82,7 +88,106 @@ def add_comparisons_evolution():
st.plotly_chart(fig)


# Legend entry used for contributors whose first and last active weeks are the same.
_SINGLE_WEEK_LABEL = "= last comparison date"


def _count_active_contributors_by_age(df):
    """
    Count the active contributors of each week, split by the season of their first comparison.

    A season is a calendar quarter (Jan-Mar, Apr-Jun, Jul-Sep, Oct-Dec). Contributors who were
    active during a single week only are counted in a dedicated category.

    Parameters:
        df: dataframe with at least the columns `public_username` and `week_date`.

    Returns:
        A dataframe with one row per week (column `week_date`) and one column per category, in
        chronological order, containing the number of active contributors. The dataframe has no
        row when there is nothing to count.
    """

    # Keep only the required data, one row per (contributor, week).
    activity = df[["public_username", "week_date"]].drop_duplicates().reset_index(drop=True)
    if activity.empty:
        return pd.DataFrame(columns=["week_date"])

    # Convert to naive UTC datetimes. Going through `to_numpy()` also drops a categorical dtype,
    # which would otherwise make min() / max() fail ("Categorical is not ordered"), and
    # `tz_localize(None)` replaces the deprecated tz-aware -> naive `.astype("datetime64[ns]")`.
    activity["week_date"] = pd.to_datetime(
        activity["week_date"].to_numpy(), utc=True
    ).tz_localize(None)

    weeks_by_user = activity.groupby("public_username", observed=True)["week_date"]
    first_week = weeks_by_user.transform("min")
    last_week = weeks_by_user.transform("max")

    # Start of the calendar quarter containing the first active week of the contributor.
    activity["season_start"] = first_week.dt.to_period("Q").dt.start_time
    activity["single_week"] = first_week.eq(last_week)

    # Rows are unique per (contributor, week), so counting the rows of a week is the same as
    # counting its distinct contributors.
    counts = {}
    returning = activity[~activity["single_week"]]
    for season_start, season_weeks in returning.groupby("season_start")["week_date"]:
        last_month = season_start + relativedelta(months=2)
        counts[f"{season_start:%Y %b} to {last_month:%b}"] = season_weeks.value_counts()

    one_off_weeks = activity.loc[activity["single_week"], "week_date"]
    if not one_off_weeks.empty:
        counts[_SINGLE_WEEK_LABEL] = one_off_weeks.value_counts()

    # One row per week of the whole period, 0 where a category has no active contributor.
    weeks = pd.date_range(
        start=activity["week_date"].min(), end=activity["week_date"].max(), freq="W-MON"
    )
    return (
        pd.DataFrame(counts)
        .reindex(weeks)
        .fillna(0)
        .astype(int)
        .rename_axis(index="week_date")
        .reset_index()
    )


def add_comparisons_evolution_grouped_by_contributors_age():
    """
    Display the number of active contributors per week, grouped by age.

    The age of a contributor is the season (calendar quarter) of their first public comparison.
    The data are read from `st.session_state.df`.
    """

    st.markdown("#### Active contributors grouped by age")
    st.info(
        "An active contributor is a user who makes at least one public comparison during a given"
        " period of time.",
    )

    dtf = _count_active_contributors_by_age(st.session_state.df)
    categories = [column for column in dtf.columns if column != "week_date"]
    if dtf.empty or not categories:
        st.warning("No public comparison to display.")
        return

    fig = px.bar(
        dtf,
        x="week_date",
        y=categories,
        labels={
            "value": "Active contributors",
            "week_date": "Week",
            "variable": "First comparison date",
        },
        # Sample at least 2 points: an integer `samplepoints` of 1 makes plotly divide by zero.
        color_discrete_sequence=px.colors.sample_colorscale(
            "turbo", samplepoints=max(len(categories), 2)
        ),
        color_discrete_map={_SINGLE_WEEK_LABEL: "grey"},
    )
    st.plotly_chart(fig)


# Use plotly as the pandas plotting backend, so that `.plot()` calls build plotly figures.
pd.options.plotting.backend = "plotly"

# Render the sections of the page, in this order.
add_contributors_evolution()
add_comparisons_evolution()
add_comparisons_evolution_grouped_by_contributors_age()
Loading