I've created example data and a script for you, though providing example data and your own attempt was really your job. Normally on Stack Overflow people don't do such things, and a question like this could have been closed as lacking details or a script to debug. But since I'm in the mood...
You could do something along these lines...
Example data:
from pyspark.sql import functions as F

df = spark.createDataFrame(
    [(101, '2020-01-01 13:01:01', '2020-01-01 14:01:01'),
     (101, '2020-01-01 15:01:01', '2020-01-01 16:01:01'),  # same user, 2nd time
     (101, '2020-01-01 17:01:01', '2020-01-02 11:01:01'),  # same user, 3rd time, but overnight
     (102, '2020-01-02 05:01:01', '2020-01-02 07:01:01'),
     (103, '2020-01-02 05:01:01', '2020-01-02 07:01:01'),
     (104, None, '2020-01-02 07:01:01'),                   # no connected_at time
     (105, '2020-01-04 10:01:01', None)],                  # no disconnected_at time
    ['device_id', 'connected_at', 'disconnected_at'])
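The script assumes an environment where `spark` already exists (pyspark shell, notebook, Databricks etc.). If you run it as a standalone script, create the session yourself first; a minimal sketch (the app name is arbitrary):

from pyspark.sql import SparkSession

# Only needed outside the pyspark shell / notebooks, where `spark` is predefined
spark = SparkSession.builder.appName('device-activity').getOrCreate()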
Script:
connection_timeout_days = -1  # if negative, not applied
logged_days_if_no_start_date = 3

# Change timestamps to dates
df = df.withColumn('connected_at', F.to_date('connected_at')) \
       .withColumn('disconnected_at', F.to_date('disconnected_at'))

# If there's no connected_at, backfill it by subtracting the default number of logged days
df = df.withColumn('connected_at',
                   F.coalesce('connected_at',
                              F.date_sub('disconnected_at', logged_days_if_no_start_date)))
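This backfill is why the result starts before the first real connection: device 104 has no connected_at, so it gets 2020-01-02 minus 3 days, i.e. 2019-12-30. A quick standalone check with the value from the example data:

spark.createDataFrame([('2020-01-02',)], ['disconnected_at']) \
    .select(F.date_sub(F.to_date('disconnected_at'), 3).alias('fallback_start')) \
    .show()
# +--------------+
# |fallback_start|
# +--------------+
# |    2019-12-30|
# +--------------+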
# If there's no disconnected_at, fill it with today's date, optionally capped by the timeout
if connection_timeout_days < 0:
    df = df.withColumn('disconnected_at',
                       F.coalesce('disconnected_at', F.current_date()))
else:
    df = df.withColumn(
        'disconnected_at',
        F.coalesce('disconnected_at',
                   F.least(F.current_date(),
                           F.date_add('connected_at', connection_timeout_days))))
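To illustrate the capped branch: with, say, connection_timeout_days = 2, device 105 (connected 2020-01-04, never disconnected) would be considered disconnected on 2020-01-06 instead of staying active until today. A tiny check, assuming you run it after 2020-01-06:

spark.range(1).select(
    F.least(F.current_date(),
            F.date_add(F.to_date(F.lit('2020-01-04')), 2)).alias('cutoff')).show()
# +----------+
# |    cutoff|
# +----------+
# |2020-01-06|
# +----------+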
# Create a separate df with dates for the whole period
min_date = df.agg(F.min('connected_at')).head()[0]
max_date = df.agg(F.max('disconnected_at')).head()[0]
df_dates = spark.range(1).select(
F.explode(F.sequence(F.lit(min_date), F.lit(max_date))).alias('date')
)
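F.sequence between two dates steps by one day by default, so exploding it yields exactly one row per calendar day, e.g.:

spark.range(1).select(
    F.explode(F.sequence(F.to_date(F.lit('2020-01-01')),
                         F.to_date(F.lit('2020-01-03')))).alias('date')).show()
# +----------+
# |      date|
# +----------+
# |2020-01-01|
# |2020-01-02|
# |2020-01-03|
# +----------+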
# Transform the original df: count distinct devices per date
df = (df
.select('device_id',
F.explode(F.sequence('connected_at', 'disconnected_at')).alias('date'))
.groupBy('date')
.agg(F.countDistinct('device_id').alias('device_distinct_count'))
)
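The same explode gives one (device_id, date) row per active day, and countDistinct guarantees device 101 is counted only once on 2020-01-01 despite its three sessions. Traced by hand on the example data:

# device 101: 2020-01-01 (3 sessions), 2020-01-02 (overnight spill-over)
# device 102: 2020-01-02
# device 103: 2020-01-02
# device 104: 2019-12-30 .. 2020-01-02 (backfilled start date)
# device 105: 2020-01-04 .. current_date() (backfilled end date)
# => 2020-01-01 has 2 distinct devices (101, 104); 2020-01-02 has 4 (101, 102, 103, 104)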
# Left-join onto the full date range so that days with no active device (2020-01-03) show up with a count of 0
df = df_dates.join(df, 'date', 'left') \
             .fillna(0).sort('date')
Result:
df.show()
# +----------+---------------------+
# | date|device_distinct_count|
# +----------+---------------------+
# |2019-12-30| 1|
# |2019-12-31| 1|
# |2020-01-01| 2|
# |2020-01-02| 4|
# |2020-01-03| 0|
# |2020-01-04| 1|
# |2020-01-05| 1|
# |2020-01-06| 1|
# |2020-01-07| 1|
# |2020-01-08| 1|
# |2020-01-09| 1|
# |2020-01-10| 1|
# |2020-01-11| 1|
# |2020-01-12| 1|
# |2020-01-13| 1|
# |2020-01-14| 1|
# |2020-01-15| 1|
# |2020-01-16| 1|
# |2020-01-17| 1|
# |2020-01-18| 1|
# +----------+---------------------+
# only showing top 20 rows
The trailing rows with count 1 are device 105, whose missing disconnected_at was filled with current_date(), so exactly how far the table extends depends on the day you run the script.